1diff -Naur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
2--- a/Documentation/vm/uksm.txt 1970-01-01 01:00:00.000000000 +0100
3+++ b/Documentation/vm/uksm.txt 2020-06-15 10:34:30.890359050 +0200
4@@ -0,0 +1,61 @@
5+The Ultra Kernel Samepage Merging feature
6+----------------------------------------------
7+/*
8+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
9+ *
10+ * This is an improvement upon KSM. Some basic data structures and routines
11+ * are borrowed from ksm.c .
12+ *
13+ * Its new features:
14+ * 1. Full system scan:
15+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
16+ * interaction to submit a memory area to KSM is no longer needed.
17+ *
18+ * 2. Rich area detection:
19+ * It automatically detects rich areas containing abundant duplicated
20+ * pages. Rich areas are given full scan speed. Poor areas are
21+ * sampled at a reasonable speed with very low CPU consumption.
22+ *
23+ * 3. Ultra Per-page scan speed improvement:
24+ * A new hash algorithm is proposed. As a result, on a machine with
25+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
26+ * can scan memory areas that do not contain duplicated pages at a speed of
27+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
28+ * 477MB/sec ~ 923MB/sec.
29+ *
30+ * 4. Thrashing area avoidance:
31+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be
32+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
33+ * hash-value-based volatile page detection.
34+ *
35+ *
36+ * 5. Misc changes upon KSM:
37+ * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
38+ * comparison. It's much faster than the default C version on x86.
39+ * * rmap_item now has a struct page * member to loosely cache an
40+ * address-->page mapping, which avoids many time-costly calls to
41+ * follow_page().
42+ * * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
43+ * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
44+ * ksm is needed in this case.
45+ *
46+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
47+ * Now uksmd considers full zero pages as special pages and merges them into a
48+ * special unswappable uksm zero page.
49+ */
50+
51+ChangeLog:
52+
53+2012-05-05 The creation of this Doc
54+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
55+2012-05-28 UKSM 0.1.1.2 bug fix release
56+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
57+2012-07-02 UKSM 0.1.2-beta2
58+2012-07-10 UKSM 0.1.2-beta3
59+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
60+2012-10-13 UKSM 0.1.2.1 Bug fixes.
61+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
62+2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug.
63+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
64+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
65+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
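
A minimal sketch of the sampled-hash idea behind feature 3 above: hash only a random sample of a page's 32-bit words (UKSM calls the sample size the "hash strength"), so scan cost scales with the sample rather than with PAGE_SIZE. This is an illustration only, not the UKSM routine; the seed, the mixing steps and the names sample_hash/PAGE_WORDS are assumptions of this example.

/* Hedged sketch (userspace, not kernel code): hash a random sample of words. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_WORDS (4096 / sizeof(uint32_t))    /* assumes 4 KiB pages */

static uint32_t sample_hash(const uint32_t *page, const size_t *offsets,
                            size_t strength)
{
        uint32_t h = 0x9747b28c;                /* arbitrary seed */

        for (size_t i = 0; i < strength; i++) { /* hash only the sampled words */
                h ^= page[offsets[i]];
                h *= 0x85ebca6b;                /* simple multiplicative mix */
                h ^= h >> 13;
        }
        return h;
}

int main(void)
{
        static uint32_t page[PAGE_WORDS];       /* an all-zero "page" */
        size_t offsets[64];

        for (size_t i = 0; i < 64; i++)         /* random word offsets to sample */
                offsets[i] = (size_t)rand() % PAGE_WORDS;

        printf("sampled hash: %08" PRIx32 "\n", sample_hash(page, offsets, 64));
        return 0;
}

Raising or lowering the sample size trades hash collisions against per-page scan cost, which is what the hash-strength heuristics later in mm/uksm.c adjust at run time.
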
66diff -Naur a/fs/exec.c b/fs/exec.c
67--- a/fs/exec.c 2020-06-14 21:45:04.000000000 +0200
68+++ b/fs/exec.c 2020-06-15 10:34:30.890359050 +0200
69@@ -62,6 +62,7 @@
70 #include <linux/oom.h>
71 #include <linux/compat.h>
72 #include <linux/vmalloc.h>
73+#include <linux/ksm.h>
74
75 #include <linux/uaccess.h>
76 #include <asm/mmu_context.h>
77diff -Naur a/fs/proc/meminfo.c b/fs/proc/meminfo.c
78--- a/fs/proc/meminfo.c 2020-06-14 21:45:04.000000000 +0200
79+++ b/fs/proc/meminfo.c 2020-06-15 10:34:30.890359050 +0200
80@@ -108,7 +108,10 @@
81 #endif
82 show_val_kb(m, "PageTables: ",
83 global_zone_page_state(NR_PAGETABLE));
84-
85+#ifdef CONFIG_UKSM
86+ show_val_kb(m, "KsmZeroPages: ",
87+ global_zone_page_state(NR_UKSM_ZERO_PAGES));
88+#endif
89 show_val_kb(m, "NFS_Unstable: ", 0);
90 show_val_kb(m, "Bounce: ",
91 global_zone_page_state(NR_BOUNCE));
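
With CONFIG_UKSM enabled, the hunk above adds a KsmZeroPages line to /proc/meminfo. A minimal userspace sketch for reading it; the program below is illustrative and not part of the patch.

/* Hedged sketch: print the KsmZeroPages value exported by this patch. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f) {
                perror("/proc/meminfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "KsmZeroPages:", 13)) {
                        fputs(line, stdout);    /* value is printed in kB */
                        break;
                }
        }
        fclose(f);
        return 0;
}
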
92diff -Naur a/include/linux/ksm.h b/include/linux/ksm.h
93--- a/include/linux/ksm.h 2020-06-14 21:45:04.000000000 +0200
94+++ b/include/linux/ksm.h 2020-06-15 10:34:30.890359050 +0200
95@@ -21,20 +21,16 @@
96 #ifdef CONFIG_KSM
97 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
98 unsigned long end, int advice, unsigned long *vm_flags);
99-int __ksm_enter(struct mm_struct *mm);
100-void __ksm_exit(struct mm_struct *mm);
101
102-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
103+static inline struct stable_node *page_stable_node(struct page *page)
104 {
105- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
106- return __ksm_enter(mm);
107- return 0;
108+ return PageKsm(page) ? page_rmapping(page) : NULL;
109 }
110
111-static inline void ksm_exit(struct mm_struct *mm)
112+static inline void set_page_stable_node(struct page *page,
113+ struct stable_node *stable_node)
114 {
115- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
116- __ksm_exit(mm);
117+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
118 }
119
120 /*
121@@ -56,6 +52,33 @@
122 bool reuse_ksm_page(struct page *page,
123 struct vm_area_struct *vma, unsigned long address);
124
125+#ifdef CONFIG_KSM_LEGACY
126+int __ksm_enter(struct mm_struct *mm);
127+void __ksm_exit(struct mm_struct *mm);
128+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
129+{
130+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
131+ return __ksm_enter(mm);
132+ return 0;
133+}
134+
135+static inline void ksm_exit(struct mm_struct *mm)
136+{
137+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
138+ __ksm_exit(mm);
139+}
140+
141+#elif defined(CONFIG_UKSM)
142+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
143+{
144+ return 0;
145+}
146+
147+static inline void ksm_exit(struct mm_struct *mm)
148+{
149+}
150+#endif /* !CONFIG_UKSM */
151+
152 #else /* !CONFIG_KSM */
153
154 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
155@@ -96,4 +119,6 @@
156 #endif /* CONFIG_MMU */
157 #endif /* !CONFIG_KSM */
158
159+#include <linux/uksm.h>
160+
161 #endif /* __LINUX_KSM_H */
162diff -Naur a/include/linux/mm_types.h b/include/linux/mm_types.h
163--- a/include/linux/mm_types.h 2020-06-14 21:45:04.000000000 +0200
164+++ b/include/linux/mm_types.h 2020-06-15 10:34:30.890359050 +0200
165@@ -367,6 +367,9 @@
166 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
167 #endif
168 struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
169+#ifdef CONFIG_UKSM
170+ struct vma_slot *uksm_vma_slot;
171+#endif
172 } __randomize_layout;
173
174 struct core_thread {
175diff -Naur a/include/linux/mmzone.h b/include/linux/mmzone.h
176--- a/include/linux/mmzone.h 2020-06-14 21:45:04.000000000 +0200
177+++ b/include/linux/mmzone.h 2020-06-15 10:34:30.890359050 +0200
178@@ -165,6 +165,9 @@
179 NR_ZSPAGES, /* allocated in zsmalloc */
180 #endif
181 NR_FREE_CMA_PAGES,
182+#ifdef CONFIG_UKSM
183+ NR_UKSM_ZERO_PAGES,
184+#endif
185 NR_VM_ZONE_STAT_ITEMS };
186
187 enum node_stat_item {
188diff -Naur a/include/linux/pgtable.h b/include/linux/pgtable.h
189--- a/include/linux/pgtable.h 2020-06-14 21:45:04.000000000 +0200
190+++ b/include/linux/pgtable.h 2020-06-15 10:34:30.890359050 +0200
191@@ -1020,12 +1020,25 @@
192 extern void untrack_pfn_moved(struct vm_area_struct *vma);
193 #endif
194
195+#ifdef CONFIG_UKSM
196+static inline int is_uksm_zero_pfn(unsigned long pfn)
197+{
198+ extern unsigned long uksm_zero_pfn;
199+ return pfn == uksm_zero_pfn;
200+}
201+#else
202+static inline int is_uksm_zero_pfn(unsigned long pfn)
203+{
204+ return 0;
205+}
206+#endif
207+
208 #ifdef __HAVE_COLOR_ZERO_PAGE
209 static inline int is_zero_pfn(unsigned long pfn)
210 {
211 extern unsigned long zero_pfn;
212 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
213- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
214+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
215 }
216
217 #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
218@@ -1034,7 +1047,7 @@
219 static inline int is_zero_pfn(unsigned long pfn)
220 {
221 extern unsigned long zero_pfn;
222- return pfn == zero_pfn;
223+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
224 }
225
226 static inline unsigned long my_zero_pfn(unsigned long addr)
227diff -Naur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
228--- a/include/linux/sradix-tree.h 1970-01-01 01:00:00.000000000 +0100
229+++ b/include/linux/sradix-tree.h 2020-06-15 10:34:30.890359050 +0200
230@@ -0,0 +1,77 @@
231+#ifndef _LINUX_SRADIX_TREE_H
232+#define _LINUX_SRADIX_TREE_H
233+
234+
235+#define INIT_SRADIX_TREE(root, mask) \
236+do { \
237+ (root)->height = 0; \
238+ (root)->gfp_mask = (mask); \
239+ (root)->rnode = NULL; \
240+} while (0)
241+
242+#define ULONG_BITS (sizeof(unsigned long) * 8)
243+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
244+//#define SRADIX_TREE_MAP_SHIFT 6
245+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
246+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
247+
248+struct sradix_tree_node {
249+ unsigned int height; /* Height from the bottom */
250+ unsigned int count;
251+ unsigned int fulls; /* Number of full sublevel trees */
252+ struct sradix_tree_node *parent;
253+ void *stores[0];
254+};
255+
256+/* A simple radix tree implementation */
257+struct sradix_tree_root {
258+ unsigned int height;
259+ struct sradix_tree_node *rnode;
260+
261+ /* The last node found to have available empty stores in its sublevels */
262+ struct sradix_tree_node *enter_node;
263+ unsigned int shift;
264+ unsigned int stores_size;
265+ unsigned int mask;
266+ unsigned long min; /* The first hole index */
267+ unsigned long num;
268+ //unsigned long *height_to_maxindex;
269+
270+ /* How the node is allocated and freed. */
271+ struct sradix_tree_node *(*alloc)(void);
272+ void (*free)(struct sradix_tree_node *node);
273+
274+ /* When a new node is added and removed */
275+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
276+ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
277+ void (*rm)(struct sradix_tree_node *node, unsigned int offset);
278+};
279+
280+struct sradix_tree_path {
281+ struct sradix_tree_node *node;
282+ int offset;
283+};
284+
285+static inline
286+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
287+{
288+ root->height = 0;
289+ root->rnode = NULL;
290+ root->shift = shift;
291+ root->stores_size = 1UL << shift;
292+ root->mask = root->stores_size - 1;
293+}
294+
295+
296+extern void *sradix_tree_next(struct sradix_tree_root *root,
297+ struct sradix_tree_node *node, unsigned long index,
298+ int (*iter)(void *, unsigned long));
299+
300+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
301+
302+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
303+ struct sradix_tree_node *node, unsigned long index);
304+
305+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
306+
307+#endif /* _LINUX_SRADIX_TREE_H */
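
The header above only declares the tree; the alloc/free (and optional extend/assign/rm) callbacks are supplied by the user, and each node's stores[] flexible array must be backed by the allocation, as mm/uksm.c later does with slot_tree_node. A minimal kernel-style sketch of such a user follows; MY_SHIFT, my_node and the GFP flags are assumptions of this example.

/* Hedged sketch: a minimal sradix tree user with 2^6-wide nodes. */
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/sradix-tree.h>

#define MY_SHIFT	6
#define MY_STORES	(1UL << MY_SHIFT)

struct my_node {
	struct sradix_tree_node snode;
	void *stores[MY_STORES];	/* backs snode.stores[] */
};

static struct sradix_tree_node *my_alloc(void)
{
	struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void my_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct my_node, snode));
}

static struct sradix_tree_root my_root;

static int my_tree_demo(void)
{
	void *item = (void *)0xdeadbeefUL;	/* any non-NULL cookie */

	init_sradix_tree_root(&my_root, MY_SHIFT);
	my_root.alloc = my_alloc;
	my_root.free  = my_free;
	/* extend/assign/rm callbacks are optional and left NULL here */

	if (sradix_tree_enter(&my_root, &item, 1))	/* store at the first hole */
		return -ENOMEM;

	WARN_ON(sradix_tree_lookup(&my_root, 0) != item);
	return 0;
}
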
308diff -Naur a/include/linux/uksm.h b/include/linux/uksm.h
309--- a/include/linux/uksm.h 1970-01-01 01:00:00.000000000 +0100
310+++ b/include/linux/uksm.h 2020-06-15 10:34:30.890359050 +0200
311@@ -0,0 +1,149 @@
312+#ifndef __LINUX_UKSM_H
313+#define __LINUX_UKSM_H
314+/*
315+ * Memory merging support.
316+ *
317+ * This code enables dynamic sharing of identical pages found in different
318+ * memory areas, even if they are not shared by fork().
319+ */
320+
321+/* if !CONFIG_UKSM this file should not be compiled at all. */
322+#ifdef CONFIG_UKSM
323+
324+#include <linux/bitops.h>
325+#include <linux/mm.h>
326+#include <linux/pagemap.h>
327+#include <linux/rmap.h>
328+#include <linux/sched.h>
329+
330+extern unsigned long zero_pfn __read_mostly;
331+extern unsigned long uksm_zero_pfn __read_mostly;
332+extern struct page *empty_uksm_zero_page;
333+
334+/* must be called before the VMA is linked into the mm */
335+extern void uksm_vma_add_new(struct vm_area_struct *vma);
336+extern void uksm_remove_vma(struct vm_area_struct *vma);
337+
338+#define UKSM_SLOT_NEED_SORT (1 << 0)
339+#define UKSM_SLOT_NEED_RERAND (1 << 1)
340+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
341+#define UKSM_SLOT_FUL_SCANNED (1 << 3)
342+#define UKSM_SLOT_IN_UKSM (1 << 4)
343+
344+struct vma_slot {
345+ struct sradix_tree_node *snode;
346+ unsigned long sindex;
347+
348+ struct list_head slot_list;
349+ unsigned long fully_scanned_round;
350+ unsigned long dedup_num;
351+ unsigned long pages_scanned;
352+ unsigned long this_sampled;
353+ unsigned long last_scanned;
354+ unsigned long pages_to_scan;
355+ struct scan_rung *rung;
356+ struct page **rmap_list_pool;
357+ unsigned int *pool_counts;
358+ unsigned long pool_size;
359+ struct vm_area_struct *vma;
360+ struct mm_struct *mm;
361+ unsigned long ctime_j;
362+ unsigned long pages;
363+ unsigned long flags;
364+ unsigned long pages_cowed; /* pages cowed this round */
365+ unsigned long pages_merged; /* pages merged this round */
366+ unsigned long pages_bemerged;
367+
368+ /* when it has page merged in this eval round */
369+ struct list_head dedup_list;
370+};
371+
372+static inline void uksm_unmap_zero_page(pte_t pte)
373+{
374+ if (pte_pfn(pte) == uksm_zero_pfn)
375+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
376+}
377+
378+static inline void uksm_map_zero_page(pte_t pte)
379+{
380+ if (pte_pfn(pte) == uksm_zero_pfn)
381+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
382+}
383+
384+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
385+{
386+ if (vma->uksm_vma_slot && PageKsm(page))
387+ vma->uksm_vma_slot->pages_cowed++;
388+}
389+
390+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
391+{
392+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
393+ vma->uksm_vma_slot->pages_cowed++;
394+}
395+
396+static inline int uksm_flags_can_scan(unsigned long vm_flags)
397+{
398+#ifdef VM_SAO
399+ if (vm_flags & VM_SAO)
400+ return 0;
401+#endif
402+
403+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
404+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
405+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
406+}
407+
408+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
409+{
410+ if (uksm_flags_can_scan(*vm_flags_p))
411+ *vm_flags_p |= VM_MERGEABLE;
412+}
413+
414+/*
415+ * A wrapper around BUG_ON for places where the uksm zero page must not appear.
416+ * TODO: it will be removed when the uksm zero page patch is stable enough.
417+ */
418+static inline void uksm_bugon_zeropage(pte_t pte)
419+{
420+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
421+}
422+#else
423+static inline void uksm_vma_add_new(struct vm_area_struct *vma)
424+{
425+}
426+
427+static inline void uksm_remove_vma(struct vm_area_struct *vma)
428+{
429+}
430+
431+static inline void uksm_unmap_zero_page(pte_t pte)
432+{
433+}
434+
435+static inline void uksm_map_zero_page(pte_t pte)
436+{
437+}
438+
439+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
440+{
441+}
442+
443+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
444+{
445+}
446+
447+static inline int uksm_flags_can_scan(unsigned long vm_flags)
448+{
449+ return 0;
450+}
451+
452+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
453+{
454+}
455+
456+static inline void uksm_bugon_zeropage(pte_t pte)
457+{
458+}
459+#endif /* !CONFIG_UKSM */
460+#endif /* __LINUX_UKSM_H */
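
To illustrate the uksm_flags_can_scan() filter defined above (assuming CONFIG_UKSM=y): only mappings without the shared/IO/hugetlb/grows-up/grows-down bits are eligible, which is why the mm/mmap.c hunks below add VM_MERGEABLE only through uksm_vm_flags_mod(). A small kernel-style sketch; the flag combinations and the helper name check_flags are invented for this example.

/* Hedged sketch: which vm_flags survive the UKSM eligibility filter. */
#include <linux/mm.h>
#include <linux/uksm.h>

static void check_flags(void)
{
	unsigned long anon_private = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
	unsigned long shared_map   = anon_private | VM_SHARED | VM_MAYSHARE;
	unsigned long io_map       = anon_private | VM_IO | VM_PFNMAP;

	WARN_ON(!uksm_flags_can_scan(anon_private));	/* eligible: may get VM_MERGEABLE */
	WARN_ON(uksm_flags_can_scan(shared_map));	/* shared mappings are skipped */
	WARN_ON(uksm_flags_can_scan(io_map));		/* PFN/IO mappings are skipped */

	uksm_vm_flags_mod(&anon_private);		/* sets VM_MERGEABLE on the eligible one */
}
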
461diff -Naur a/kernel/fork.c b/kernel/fork.c
462--- a/kernel/fork.c 2020-06-14 21:45:04.000000000 +0200
463+++ b/kernel/fork.c 2020-06-15 10:34:30.890359050 +0200
464@@ -603,7 +603,7 @@
465 __vma_link_rb(mm, tmp, rb_link, rb_parent);
466 rb_link = &tmp->vm_rb.rb_right;
467 rb_parent = &tmp->vm_rb;
468-
469+ uksm_vma_add_new(tmp);
470 mm->map_count++;
471 if (!(tmp->vm_flags & VM_WIPEONFORK))
472 retval = copy_page_range(mm, oldmm, mpnt);
473diff -Naur a/lib/Makefile b/lib/Makefile
474--- a/lib/Makefile 2020-06-14 21:45:04.000000000 +0200
475+++ b/lib/Makefile 2020-06-15 10:34:30.890359050 +0200
476@@ -29,7 +29,7 @@
477 KCSAN_SANITIZE_random32.o := n
478
479 lib-y := ctype.o string.o vsprintf.o cmdline.o \
480- rbtree.o radix-tree.o timerqueue.o xarray.o \
481+ rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
482 idr.o extable.o sha1.o irq_regs.o argv_split.o \
483 flex_proportions.o ratelimit.o show_mem.o \
484 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
485diff -Naur a/lib/sradix-tree.c b/lib/sradix-tree.c
486--- a/lib/sradix-tree.c 1970-01-01 01:00:00.000000000 +0100
487+++ b/lib/sradix-tree.c 2020-06-15 10:34:30.890359050 +0200
488@@ -0,0 +1,476 @@
489+#include <linux/errno.h>
490+#include <linux/mm.h>
491+#include <linux/mman.h>
492+#include <linux/spinlock.h>
493+#include <linux/slab.h>
494+#include <linux/gcd.h>
495+#include <linux/sradix-tree.h>
496+
497+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
498+{
499+ return node->fulls == root->stores_size ||
500+ (node->height == 1 && node->count == root->stores_size);
501+}
502+
503+/*
504+ * Extend a sradix tree so it can store key @index.
505+ */
506+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
507+{
508+ struct sradix_tree_node *node;
509+ unsigned int height;
510+
511+ if (unlikely(root->rnode == NULL)) {
512+ if (!(node = root->alloc()))
513+ return -ENOMEM;
514+
515+ node->height = 1;
516+ root->rnode = node;
517+ root->height = 1;
518+ }
519+
520+ /* Figure out what the height should be. */
521+ height = root->height;
522+ index >>= root->shift * height;
523+
524+ while (index) {
525+ index >>= root->shift;
526+ height++;
527+ }
528+
529+ while (height > root->height) {
530+ unsigned int newheight;
531+
532+ if (!(node = root->alloc()))
533+ return -ENOMEM;
534+
535+ /* Increase the height. */
536+ node->stores[0] = root->rnode;
537+ root->rnode->parent = node;
538+ if (root->extend)
539+ root->extend(node, root->rnode);
540+
541+ newheight = root->height + 1;
542+ node->height = newheight;
543+ node->count = 1;
544+ if (sradix_node_full(root, root->rnode))
545+ node->fulls = 1;
546+
547+ root->rnode = node;
548+ root->height = newheight;
549+ }
550+
551+ return 0;
552+}
553+
554+/*
555+ * Search the next item from the current node, that is not NULL
556+ * and can satify root->iter().
557+ */
558+void *sradix_tree_next(struct sradix_tree_root *root,
559+ struct sradix_tree_node *node, unsigned long index,
560+ int (*iter)(void *item, unsigned long height))
561+{
562+ unsigned long offset;
563+ void *item;
564+
565+ if (unlikely(node == NULL)) {
566+ node = root->rnode;
567+ for (offset = 0; offset < root->stores_size; offset++) {
568+ item = node->stores[offset];
569+ if (item && (!iter || iter(item, node->height)))
570+ break;
571+ }
572+
573+ if (unlikely(offset >= root->stores_size))
574+ return NULL;
575+
576+ if (node->height == 1)
577+ return item;
578+ else
579+ goto go_down;
580+ }
581+
582+ while (node) {
583+ offset = (index & root->mask) + 1;
584+ for (; offset < root->stores_size; offset++) {
585+ item = node->stores[offset];
586+ if (item && (!iter || iter(item, node->height)))
587+ break;
588+ }
589+
590+ if (offset < root->stores_size)
591+ break;
592+
593+ node = node->parent;
594+ index >>= root->shift;
595+ }
596+
597+ if (!node)
598+ return NULL;
599+
600+ while (node->height > 1) {
601+go_down:
602+ node = item;
603+ for (offset = 0; offset < root->stores_size; offset++) {
604+ item = node->stores[offset];
605+ if (item && (!iter || iter(item, node->height)))
606+ break;
607+ }
608+
609+ if (unlikely(offset >= root->stores_size))
610+ return NULL;
611+ }
612+
613+ BUG_ON(offset > root->stores_size);
614+
615+ return item;
616+}
617+
618+/*
618+ * Blindly insert the items into the tree. Typically, we reuse the
619+ * first empty stores.
621+ */
622+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
623+{
624+ unsigned long index;
625+ unsigned int height;
626+ struct sradix_tree_node *node, *tmp = NULL;
627+ int offset, offset_saved;
628+ void **store = NULL;
629+ int error, i, j, shift;
630+
631+go_on:
632+ index = root->min;
633+
634+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
635+ node = root->enter_node;
636+ BUG_ON((index >> (root->shift * root->height)));
637+ } else {
638+ node = root->rnode;
639+ if (node == NULL || (index >> (root->shift * root->height))
640+ || sradix_node_full(root, node)) {
641+ error = sradix_tree_extend(root, index);
642+ if (error)
643+ return error;
644+
645+ node = root->rnode;
646+ }
647+ }
648+
649+
650+ height = node->height;
651+ shift = (height - 1) * root->shift;
652+ offset = (index >> shift) & root->mask;
653+ while (shift > 0) {
654+ offset_saved = offset;
655+ for (; offset < root->stores_size; offset++) {
656+ store = &node->stores[offset];
657+ tmp = *store;
658+
659+ if (!tmp || !sradix_node_full(root, tmp))
660+ break;
661+ }
662+ BUG_ON(offset >= root->stores_size);
663+
664+ if (offset != offset_saved) {
665+ index += (offset - offset_saved) << shift;
666+ index &= ~((1UL << shift) - 1);
667+ }
668+
669+ if (!tmp) {
670+ if (!(tmp = root->alloc()))
671+ return -ENOMEM;
672+
673+ tmp->height = shift / root->shift;
674+ *store = tmp;
675+ tmp->parent = node;
676+ node->count++;
677+// if (root->extend)
678+// root->extend(node, tmp);
679+ }
680+
681+ node = tmp;
682+ shift -= root->shift;
683+ offset = (index >> shift) & root->mask;
684+ }
685+
686+ BUG_ON(node->height != 1);
687+
688+
689+ store = &node->stores[offset];
690+ for (i = 0, j = 0;
691+ j < root->stores_size - node->count &&
692+ i < root->stores_size - offset && j < num; i++) {
693+ if (!store[i]) {
694+ store[i] = item[j];
695+ if (root->assign)
696+ root->assign(node, index + i, item[j]);
697+ j++;
698+ }
699+ }
700+
701+ node->count += j;
702+ root->num += j;
703+ num -= j;
704+
705+ while (sradix_node_full(root, node)) {
706+ node = node->parent;
707+ if (!node)
708+ break;
709+
710+ node->fulls++;
711+ }
712+
713+ if (unlikely(!node)) {
714+ /* All nodes are full */
715+ root->min = 1 << (root->height * root->shift);
716+ root->enter_node = NULL;
717+ } else {
718+ root->min = index + i - 1;
719+ root->min |= (1UL << (node->height - 1)) - 1;
720+ root->min++;
721+ root->enter_node = node;
722+ }
723+
724+ if (num) {
725+ item += j;
726+ goto go_on;
727+ }
728+
729+ return 0;
730+}
731+
732+
733+/**
734+ * sradix_tree_shrink - shrink height of a sradix tree to minimal
735+ * @root sradix tree root
736+ *
737+ */
738+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
739+{
740+ /* try to shrink tree height */
741+ while (root->height > 1) {
742+ struct sradix_tree_node *to_free = root->rnode;
743+
744+ /*
744+ * If the candidate node has more than one child, or its child
745+ * is not at the leftmost store, we cannot shrink.
747+ */
748+ if (to_free->count != 1 || !to_free->stores[0])
749+ break;
750+
751+ root->rnode = to_free->stores[0];
752+ root->rnode->parent = NULL;
753+ root->height--;
754+ if (unlikely(root->enter_node == to_free))
755+ root->enter_node = NULL;
756+ root->free(to_free);
757+ }
758+}
759+
760+/*
760+ * Delete the item at the known leaf node and index
762+ */
763+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
764+ struct sradix_tree_node *node, unsigned long index)
765+{
766+ unsigned int offset;
767+ struct sradix_tree_node *start, *end;
768+
769+ BUG_ON(node->height != 1);
770+
771+ start = node;
772+ while (node && !(--node->count))
773+ node = node->parent;
774+
775+ end = node;
776+ if (!node) {
777+ root->rnode = NULL;
778+ root->height = 0;
779+ root->min = 0;
780+ root->num = 0;
781+ root->enter_node = NULL;
782+ } else {
783+ offset = (index >> (root->shift * (node->height - 1))) & root->mask;
784+ if (root->rm)
785+ root->rm(node, offset);
786+ node->stores[offset] = NULL;
787+ root->num--;
788+ if (root->min > index) {
789+ root->min = index;
790+ root->enter_node = node;
791+ }
792+ }
793+
794+ if (start != end) {
795+ do {
796+ node = start;
797+ start = start->parent;
798+ if (unlikely(root->enter_node == node))
799+ root->enter_node = end;
800+ root->free(node);
801+ } while (start != end);
802+
803+ /*
803+ * Note that shrink may free "end", so enter_node still needs to
804+ * be checked inside.
806+ */
807+ sradix_tree_shrink(root);
808+ } else if (node->count == root->stores_size - 1) {
809+ /* It WAS a full leaf node. Update the ancestors */
810+ node = node->parent;
811+ while (node) {
812+ node->fulls--;
813+ if (node->fulls != root->stores_size - 1)
814+ break;
815+
816+ node = node->parent;
817+ }
818+ }
819+}
820+
821+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
822+{
823+ unsigned int height, offset;
824+ struct sradix_tree_node *node;
825+ int shift;
826+
827+ node = root->rnode;
828+ if (node == NULL || (index >> (root->shift * root->height)))
829+ return NULL;
830+
831+ height = root->height;
832+ shift = (height - 1) * root->shift;
833+
834+ do {
835+ offset = (index >> shift) & root->mask;
836+ node = node->stores[offset];
837+ if (!node)
838+ return NULL;
839+
840+ shift -= root->shift;
841+ } while (shift >= 0);
842+
843+ return node;
844+}
845+
846+/*
847+ * Return the item if it exists, otherwise create it in place
848+ * and return the created item.
849+ */
850+void *sradix_tree_lookup_create(struct sradix_tree_root *root,
851+ unsigned long index, void *(*item_alloc)(void))
852+{
853+ unsigned int height, offset;
854+ struct sradix_tree_node *node, *tmp;
855+ void *item;
856+ int shift, error;
857+
858+ if (root->rnode == NULL || (index >> (root->shift * root->height))) {
859+ if (item_alloc) {
860+ error = sradix_tree_extend(root, index);
861+ if (error)
862+ return NULL;
863+ } else {
864+ return NULL;
865+ }
866+ }
867+
868+ node = root->rnode;
869+ height = root->height;
870+ shift = (height - 1) * root->shift;
871+
872+ do {
873+ offset = (index >> shift) & root->mask;
874+ if (!node->stores[offset]) {
875+ if (!(tmp = root->alloc()))
876+ return NULL;
877+
878+ tmp->height = shift / root->shift;
879+ node->stores[offset] = tmp;
880+ tmp->parent = node;
881+ node->count++;
882+ node = tmp;
883+ } else {
884+ node = node->stores[offset];
885+ }
886+
887+ shift -= root->shift;
888+ } while (shift > 0);
889+
890+ BUG_ON(node->height != 1);
891+ offset = index & root->mask;
892+ if (node->stores[offset]) {
893+ return node->stores[offset];
894+ } else if (item_alloc) {
895+ if (!(item = item_alloc()))
896+ return NULL;
897+
898+ node->stores[offset] = item;
899+
900+ /*
901+ * NOTE: we do NOT call root->assign here, since this item is
902+ * newly created by us and carries no meaning yet. The caller can
903+ * invoke root->assign afterwards if necessary.
904+ */
905+
906+ node->count++;
907+ root->num++;
908+
909+ while (sradix_node_full(root, node)) {
910+ node = node->parent;
911+ if (!node)
912+ break;
913+
914+ node->fulls++;
915+ }
916+
917+ if (unlikely(!node)) {
918+ /* All nodes are full */
919+ root->min = 1 << (root->height * root->shift);
920+ } else {
921+ if (root->min == index) {
922+ root->min |= (1UL << (node->height - 1)) - 1;
923+ root->min++;
924+ root->enter_node = node;
925+ }
926+ }
927+
928+ return item;
929+ } else {
930+ return NULL;
931+ }
932+
933+}
934+
935+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
936+{
937+ unsigned int height, offset;
938+ struct sradix_tree_node *node;
939+ int shift;
940+
941+ node = root->rnode;
942+ if (node == NULL || (index >> (root->shift * root->height)))
943+ return -ENOENT;
944+
945+ height = root->height;
946+ shift = (height - 1) * root->shift;
947+
948+ do {
949+ offset = (index >> shift) & root->mask;
950+ node = node->stores[offset];
951+ if (!node)
952+ return -ENOENT;
953+
954+ shift -= root->shift;
955+ } while (shift > 0);
956+
957+ offset = index & root->mask;
958+ if (!node->stores[offset])
959+ return -ENOENT;
960+
961+ sradix_tree_delete_from_leaf(root, node, index);
962+
963+ return 0;
964+}
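
sradix_tree_lookup_create() above performs a get-or-create in one step, building intermediate nodes as needed; note it is defined here but not declared in include/linux/sradix-tree.h. A minimal sketch of a caller; my_item_alloc and get_or_create are names invented for this example.

/* Hedged sketch: get-or-create an item with sradix_tree_lookup_create(). */
#include <linux/slab.h>
#include <linux/sradix-tree.h>

/* Defined in lib/sradix-tree.c but not declared in the header above. */
extern void *sradix_tree_lookup_create(struct sradix_tree_root *root,
		unsigned long index, void *(*item_alloc)(void));

static void *my_item_alloc(void)
{
	return kzalloc(sizeof(unsigned long), GFP_KERNEL);
}

static void *get_or_create(struct sradix_tree_root *root, unsigned long index)
{
	void *item = sradix_tree_lookup_create(root, index, my_item_alloc);

	/*
	 * On success this is either the pre-existing item or a fresh one.
	 * Note that root->assign is deliberately NOT called for new items,
	 * so any per-item bookkeeping must be done by the caller.
	 */
	return item;
}
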
965diff -Naur a/mm/Kconfig b/mm/Kconfig
966--- a/mm/Kconfig 2020-06-14 21:45:04.000000000 +0200
967+++ b/mm/Kconfig 2020-06-15 10:34:30.890359050 +0200
968@@ -321,6 +321,32 @@
969 See Documentation/vm/ksm.rst for more information: KSM is inactive
970 until a program has madvised that an area is MADV_MERGEABLE, and
971 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
972+choice
973+ prompt "Choose UKSM/KSM strategy"
974+ default UKSM
975+ depends on KSM
976+ help
977+	  This option allows selecting a UKSM/KSM strategy.
978+
979+config UKSM
980+ bool "Ultra-KSM for page merging"
981+ depends on KSM
982+ help
983+	  UKSM is inspired by the Linux kernel's KSM (Kernel Samepage
984+	  Merging), but with a fundamentally rewritten core algorithm. With
985+	  an advanced algorithm, UKSM can transparently scan all anonymously
986+	  mapped user space memory with significantly improved scan speed
987+	  and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
988+	  from UKSM. UKSM now has its first stable release and its first
989+	  real-world enterprise user. For more information, please see its project page:
990+ (github.com/dolohow/uksm)
991+
992+config KSM_LEGACY
993+ bool "Legacy KSM implementation"
994+ depends on KSM
995+ help
996+ The legacy KSM implementation from Red Hat.
997+endchoice
998
999 config DEFAULT_MMAP_MIN_ADDR
1000 int "Low address space to protect from user allocation"
1001diff -Naur a/mm/ksm.c b/mm/ksm.c
1002--- a/mm/ksm.c 2020-06-14 21:45:04.000000000 +0200
1003+++ b/mm/ksm.c 2020-06-15 10:34:30.892359059 +0200
1004@@ -857,17 +857,6 @@
1005 return err;
1006 }
1007
1008-static inline struct stable_node *page_stable_node(struct page *page)
1009-{
1010- return PageKsm(page) ? page_rmapping(page) : NULL;
1011-}
1012-
1013-static inline void set_page_stable_node(struct page *page,
1014- struct stable_node *stable_node)
1015-{
1016- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
1017-}
1018-
1019 #ifdef CONFIG_SYSFS
1020 /*
1021 * Only called through the sysfs control interface:
1022diff -Naur a/mm/Makefile b/mm/Makefile
1023--- a/mm/Makefile 2020-06-14 21:45:04.000000000 +0200
1024+++ b/mm/Makefile 2020-06-15 10:34:30.892359059 +0200
1025@@ -76,7 +76,8 @@
1026 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1027 obj-$(CONFIG_SLOB) += slob.o
1028 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1029-obj-$(CONFIG_KSM) += ksm.o
1030+obj-$(CONFIG_KSM_LEGACY) += ksm.o
1031+obj-$(CONFIG_UKSM) += uksm.o
1032 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
1033 obj-$(CONFIG_SLAB) += slab.o
1034 obj-$(CONFIG_SLUB) += slub.o
1035diff -Naur a/mm/memory.c b/mm/memory.c
1036--- a/mm/memory.c 2020-06-14 21:45:04.000000000 +0200
1037+++ b/mm/memory.c 2020-06-15 10:34:30.892359059 +0200
1038@@ -143,6 +143,25 @@
1039
1040 unsigned long highest_memmap_pfn __read_mostly;
1041
1042+#ifdef CONFIG_UKSM
1043+unsigned long uksm_zero_pfn __read_mostly;
1044+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1045+struct page *empty_uksm_zero_page;
1046+
1047+static int __init setup_uksm_zero_page(void)
1048+{
1049+ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
1050+ if (!empty_uksm_zero_page)
1051+ panic("Oh boy, that early out of memory?");
1052+
1053+ SetPageReserved(empty_uksm_zero_page);
1054+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1055+
1056+ return 0;
1057+}
1058+core_initcall(setup_uksm_zero_page);
1059+#endif
1060+
1061 /*
1062 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1063 */
1064@@ -158,6 +177,7 @@
1065 trace_rss_stat(mm, member, count);
1066 }
1067
1068+
1069 #if defined(SPLIT_RSS_COUNTING)
1070
1071 void sync_mm_rss(struct mm_struct *mm)
1072@@ -801,6 +821,11 @@
1073 get_page(page);
1074 page_dup_rmap(page, false);
1075 rss[mm_counter(page)]++;
1076+
1077+ /* Should return NULL in vm_normal_page() */
1078+ uksm_bugon_zeropage(pte);
1079+ } else {
1080+ uksm_map_zero_page(pte);
1081 }
1082
1083 out_set_pte:
1084@@ -1073,8 +1098,10 @@
1085 ptent = ptep_get_and_clear_full(mm, addr, pte,
1086 tlb->fullmm);
1087 tlb_remove_tlb_entry(tlb, pte, addr);
1088- if (unlikely(!page))
1089+ if (unlikely(!page)) {
1090+ uksm_unmap_zero_page(ptent);
1091 continue;
1092+ }
1093
1094 if (!PageAnon(page)) {
1095 if (pte_dirty(ptent)) {
1096@@ -2409,6 +2436,7 @@
1097
1098 if (likely(src)) {
1099 copy_user_highpage(dst, src, addr, vma);
1100+ uksm_cow_page(vma, src);
1101 return true;
1102 }
1103
1104@@ -2654,6 +2682,7 @@
1105 vmf->address);
1106 if (!new_page)
1107 goto oom;
1108+ uksm_cow_pte(vma, vmf->orig_pte);
1109 } else {
1110 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
1111 vmf->address);
1112@@ -2696,7 +2725,9 @@
1113 mm_counter_file(old_page));
1114 inc_mm_counter_fast(mm, MM_ANONPAGES);
1115 }
1116+ uksm_bugon_zeropage(vmf->orig_pte);
1117 } else {
1118+ uksm_unmap_zero_page(vmf->orig_pte);
1119 inc_mm_counter_fast(mm, MM_ANONPAGES);
1120 }
1121 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
1122diff -Naur a/mm/mmap.c b/mm/mmap.c
1123--- a/mm/mmap.c 2020-06-14 21:45:04.000000000 +0200
1124+++ b/mm/mmap.c 2020-06-15 10:56:23.801940631 +0200
1125@@ -46,6 +46,7 @@
1126 #include <linux/moduleparam.h>
1127 #include <linux/pkeys.h>
1128 #include <linux/oom.h>
1129+#include <linux/ksm.h>
1130 #include <linux/sched/mm.h>
1131
1132 #include <linux/uaccess.h>
1133@@ -181,6 +182,7 @@
1134 if (vma->vm_file)
1135 fput(vma->vm_file);
1136 mpol_put(vma_policy(vma));
1137+ uksm_remove_vma(vma);
1138 vm_area_free(vma);
1139 return next;
1140 }
1141@@ -708,9 +710,16 @@
1142 long adjust_next = 0;
1143 int remove_next = 0;
1144
1145+/*
1146+ * To avoid deadlock, uksm_remove_vma() must be called before any spinlock
1147+ * is acquired.
1148+ */
1149+ uksm_remove_vma(vma);
1150+
1151 if (next && !insert) {
1152 struct vm_area_struct *exporter = NULL, *importer = NULL;
1153
1154+ uksm_remove_vma(next);
1155 if (end >= next->vm_end) {
1156 /*
1157 * vma expands, overlapping all the next, and
1158@@ -841,6 +850,7 @@
1159 end_changed = true;
1160 }
1161 vma->vm_pgoff = pgoff;
1162+
1163 if (adjust_next) {
1164 next->vm_start += adjust_next << PAGE_SHIFT;
1165 next->vm_pgoff += adjust_next;
1166@@ -946,6 +956,7 @@
1167 if (remove_next == 2) {
1168 remove_next = 1;
1169 end = next->vm_end;
1170+ uksm_remove_vma(next);
1171 goto again;
1172 }
1173 else if (next)
1174@@ -972,10 +983,14 @@
1175 */
1176 VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
1177 }
1178+ } else {
1179+ if (next && !insert)
1180+ uksm_vma_add_new(next);
1181 }
1182 if (insert && file)
1183 uprobe_mmap(insert);
1184
1185+ uksm_vma_add_new(vma);
1186 validate_mm(mm);
1187
1188 return 0;
1189@@ -1434,6 +1449,9 @@
1190 vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1191 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1192
1193+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1194+ uksm_vm_flags_mod(&vm_flags);
1195+
1196 if (flags & MAP_LOCKED)
1197 if (!can_do_mlock())
1198 return -EPERM;
1199@@ -1801,6 +1819,7 @@
1200 allow_write_access(file);
1201 }
1202 file = vma->vm_file;
1203+ uksm_vma_add_new(vma);
1204 out:
1205 perf_event_mmap(vma);
1206
1207@@ -1843,6 +1862,7 @@
1208 if (vm_flags & VM_DENYWRITE)
1209 allow_write_access(file);
1210 free_vma:
1211+ uksm_remove_vma(vma);
1212 vm_area_free(vma);
1213 unacct_error:
1214 if (charged)
1215@@ -2694,6 +2714,8 @@
1216 else
1217 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1218
1219+ uksm_vma_add_new(new);
1220+
1221 /* Success. */
1222 if (!err)
1223 return 0;
1224@@ -3000,6 +3022,7 @@
1225 if ((flags & (~VM_EXEC)) != 0)
1226 return -EINVAL;
1227 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1228+ uksm_vm_flags_mod(&flags);
1229
1230 mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1231 if (IS_ERR_VALUE(mapped_addr))
1232@@ -3050,6 +3073,7 @@
1233 vma->vm_flags = flags;
1234 vma->vm_page_prot = vm_get_page_prot(flags);
1235 vma_link(mm, vma, prev, rb_link, rb_parent);
1236+ uksm_vma_add_new(vma);
1237 out:
1238 perf_event_mmap(vma);
1239 mm->total_vm += len >> PAGE_SHIFT;
1240@@ -3127,6 +3151,12 @@
1241 mmap_write_unlock(mm);
1242 }
1243
1244+ /*
1245+ * Taking write lock on mmap_lock does not harm others,
1246+ * but it's crucial for uksm to avoid races.
1247+ */
1248+ mmap_write_lock(mm);
1249+
1250 if (mm->locked_vm) {
1251 vma = mm->mmap;
1252 while (vma) {
1253@@ -3161,6 +3191,11 @@
1254 vma = remove_vma(vma);
1255 }
1256 vm_unacct_memory(nr_accounted);
1257+
1258+ mm->mmap = NULL;
1259+ mm->mm_rb = RB_ROOT;
1260+ vmacache_invalidate(mm);
1261+ mmap_write_unlock(mm);
1262 }
1263
1264 /* Insert vm structure into process list sorted by address
1265@@ -3268,6 +3303,7 @@
1266 new_vma->vm_ops->open(new_vma);
1267 vma_link(mm, new_vma, prev, rb_link, rb_parent);
1268 *need_rmap_locks = false;
1269+ uksm_vma_add_new(new_vma);
1270 }
1271 return new_vma;
1272
1273@@ -3420,6 +3456,7 @@
1274 vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
1275
1276 perf_event_mmap(vma);
1277+ uksm_vma_add_new(vma);
1278
1279 return vma;
1280
1281diff -Naur a/mm/uksm.c b/mm/uksm.c
1282--- a/mm/uksm.c 1970-01-01 01:00:00.000000000 +0100
1283+++ b/mm/uksm.c 2020-06-15 10:34:30.892359059 +0200
1284@@ -0,0 +1,5613 @@
1285+/*
1286+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1287+ *
1288+ * This is an improvement upon KSM. Some basic data structures and routines
1289+ * are borrowed from ksm.c .
1290+ *
1291+ * Its new features:
1292+ * 1. Full system scan:
1293+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
1294+ * interaction to submit a memory area to KSM is no longer needed.
1295+ *
1296+ * 2. Rich area detection:
1297+ * It automatically detects rich areas containing abundant duplicated
1298+ * pages. Rich areas are given full scan speed. Poor areas are
1299+ * sampled at a reasonable speed with very low CPU consumption.
1300+ *
1301+ * 3. Ultra Per-page scan speed improvement:
1302+ * A new hash algorithm is proposed. As a result, on a machine with
1303+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
1304+ * can scan memory areas that do not contain duplicated pages at a speed of
1305+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
1306+ * 477MB/sec ~ 923MB/sec.
1307+ *
1308+ * 4. Thrashing area avoidance:
1309+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be
1310+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
1311+ * hash-value-based volatile page detection.
1312+ *
1313+ *
1314+ * 5. Misc changes upon KSM:
1315+ * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
1316+ * comparison. It's much faster than the default C version on x86.
1317+ * * rmap_item now has a struct page * member to loosely cache an
1318+ * address-->page mapping, which avoids many time-costly calls to
1319+ * follow_page().
1320+ * * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
1321+ * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
1322+ * ksm is needed in this case.
1323+ *
1324+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
1325+ * Now uksmd considers full zero pages as special pages and merges them into a
1326+ * special unswappable uksm zero page.
1327+ */
1328+
1329+#include <linux/errno.h>
1330+#include <linux/mm.h>
1331+#include <linux/fs.h>
1332+#include <linux/mman.h>
1333+#include <linux/sched.h>
1334+#include <linux/sched/mm.h>
1335+#include <linux/sched/coredump.h>
1336+#include <linux/sched/cputime.h>
1337+#include <linux/rwsem.h>
1338+#include <linux/pagemap.h>
1339+#include <linux/rmap.h>
1340+#include <linux/spinlock.h>
1341+#include <linux/jhash.h>
1342+#include <linux/delay.h>
1343+#include <linux/kthread.h>
1344+#include <linux/wait.h>
1345+#include <linux/slab.h>
1346+#include <linux/rbtree.h>
1347+#include <linux/memory.h>
1348+#include <linux/mmu_notifier.h>
1349+#include <linux/swap.h>
1350+#include <linux/ksm.h>
1351+#include <linux/crypto.h>
1352+#include <linux/scatterlist.h>
1353+#include <crypto/hash.h>
1354+#include <linux/random.h>
1355+#include <linux/math64.h>
1356+#include <linux/gcd.h>
1357+#include <linux/freezer.h>
1358+#include <linux/oom.h>
1359+#include <linux/numa.h>
1360+#include <linux/sradix-tree.h>
1361+
1362+#include <asm/tlbflush.h>
1363+#include "internal.h"
1364+
1365+#ifdef CONFIG_X86
1366+#undef memcmp
1367+
1368+#ifdef CONFIG_X86_32
1369+#define memcmp memcmpx86_32
1370+/*
1371+ * Compare the 4-byte-aligned addresses s1 and s2, of length n
1372+ */
1373+int memcmpx86_32(void *s1, void *s2, size_t n)
1374+{
1375+ size_t num = n / 4;
1376+ register int res;
1377+
1378+ __asm__ __volatile__
1379+ (
1380+ "testl %3,%3\n\t"
1381+ "repe; cmpsd\n\t"
1382+ "je 1f\n\t"
1383+ "sbbl %0,%0\n\t"
1384+ "orl $1,%0\n"
1385+ "1:"
1386+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1387+ : "0" (0)
1388+ : "cc");
1389+
1390+ return res;
1391+}
1392+
1393+/*
1394+ * Check whether the page is all zeros.
1395+ */
1396+static int is_full_zero(const void *s1, size_t len)
1397+{
1398+ unsigned char same;
1399+
1400+ len /= 4;
1401+
1402+ __asm__ __volatile__
1403+ ("repe; scasl;"
1404+ "sete %0"
1405+ : "=qm" (same), "+D" (s1), "+c" (len)
1406+ : "a" (0)
1407+ : "cc");
1408+
1409+ return same;
1410+}
1411+
1412+
1413+#elif defined(CONFIG_X86_64)
1414+#define memcmp memcmpx86_64
1415+/*
1416+ * Compare the 8-byte-aligned addresses s1 and s2, of length n
1417+ */
1418+int memcmpx86_64(void *s1, void *s2, size_t n)
1419+{
1420+ size_t num = n / 8;
1421+ register int res;
1422+
1423+ __asm__ __volatile__
1424+ (
1425+ "testq %q3,%q3\n\t"
1426+ "repe; cmpsq\n\t"
1427+ "je 1f\n\t"
1428+ "sbbq %q0,%q0\n\t"
1429+ "orq $1,%q0\n"
1430+ "1:"
1431+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1432+ : "0" (0)
1433+ : "cc");
1434+
1435+ return res;
1436+}
1437+
1438+static int is_full_zero(const void *s1, size_t len)
1439+{
1440+ unsigned char same;
1441+
1442+ len /= 8;
1443+
1444+ __asm__ __volatile__
1445+ ("repe; scasq;"
1446+ "sete %0"
1447+ : "=qm" (same), "+D" (s1), "+c" (len)
1448+ : "a" (0)
1449+ : "cc");
1450+
1451+ return same;
1452+}
1453+
1454+#endif
1455+#else
1456+static int is_full_zero(const void *s1, size_t len)
1457+{
1458+ unsigned long *src = s1;
1459+ int i;
1460+
1461+ len /= sizeof(*src);
1462+
1463+ for (i = 0; i < len; i++) {
1464+ if (src[i])
1465+ return 0;
1466+ }
1467+
1468+ return 1;
1469+}
1470+#endif
1471+
1472+#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
1473+#define TIME_RATIO_SCALE 10000
1474+
1475+#define SLOT_TREE_NODE_SHIFT 8
1476+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
1477+struct slot_tree_node {
1478+ unsigned long size;
1479+ struct sradix_tree_node snode;
1480+ void *stores[SLOT_TREE_NODE_STORE_SIZE];
1481+};
1482+
1483+static struct kmem_cache *slot_tree_node_cachep;
1484+
1485+static struct sradix_tree_node *slot_tree_node_alloc(void)
1486+{
1487+ struct slot_tree_node *p;
1488+
1489+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
1490+ __GFP_NORETRY | __GFP_NOWARN);
1491+ if (!p)
1492+ return NULL;
1493+
1494+ return &p->snode;
1495+}
1496+
1497+static void slot_tree_node_free(struct sradix_tree_node *node)
1498+{
1499+ struct slot_tree_node *p;
1500+
1501+ p = container_of(node, struct slot_tree_node, snode);
1502+ kmem_cache_free(slot_tree_node_cachep, p);
1503+}
1504+
1505+static void slot_tree_node_extend(struct sradix_tree_node *parent,
1506+ struct sradix_tree_node *child)
1507+{
1508+ struct slot_tree_node *p, *c;
1509+
1510+ p = container_of(parent, struct slot_tree_node, snode);
1511+ c = container_of(child, struct slot_tree_node, snode);
1512+
1513+ p->size += c->size;
1514+}
1515+
1516+void slot_tree_node_assign(struct sradix_tree_node *node,
1517+ unsigned int index, void *item)
1518+{
1519+ struct vma_slot *slot = item;
1520+ struct slot_tree_node *cur;
1521+
1522+ slot->snode = node;
1523+ slot->sindex = index;
1524+
1525+ while (node) {
1526+ cur = container_of(node, struct slot_tree_node, snode);
1527+ cur->size += slot->pages;
1528+ node = node->parent;
1529+ }
1530+}
1531+
1532+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
1533+{
1534+ struct vma_slot *slot;
1535+ struct slot_tree_node *cur;
1536+ unsigned long pages;
1537+
1538+ if (node->height == 1) {
1539+ slot = node->stores[offset];
1540+ pages = slot->pages;
1541+ } else {
1542+ cur = container_of(node->stores[offset],
1543+ struct slot_tree_node, snode);
1544+ pages = cur->size;
1545+ }
1546+
1547+ while (node) {
1548+ cur = container_of(node, struct slot_tree_node, snode);
1549+ cur->size -= pages;
1550+ node = node->parent;
1551+ }
1552+}
1553+
1554+unsigned long slot_iter_index;
1555+int slot_iter(void *item, unsigned long height)
1556+{
1557+ struct slot_tree_node *node;
1558+ struct vma_slot *slot;
1559+
1560+ if (height == 1) {
1561+ slot = item;
1562+ if (slot_iter_index < slot->pages) {
1563+ /*in this one*/
1564+ return 1;
1565+ } else {
1566+ slot_iter_index -= slot->pages;
1567+ return 0;
1568+ }
1569+
1570+ } else {
1571+ node = container_of(item, struct slot_tree_node, snode);
1572+ if (slot_iter_index < node->size) {
1573+ /*in this one*/
1574+ return 1;
1575+ } else {
1576+ slot_iter_index -= node->size;
1577+ return 0;
1578+ }
1579+ }
1580+}
1581+
1582+
1583+static inline void slot_tree_init_root(struct sradix_tree_root *root)
1584+{
1585+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1586+ root->alloc = slot_tree_node_alloc;
1587+ root->free = slot_tree_node_free;
1588+ root->extend = slot_tree_node_extend;
1589+ root->assign = slot_tree_node_assign;
1590+ root->rm = slot_tree_node_rm;
1591+}
1592+
1593+void slot_tree_init(void)
1594+{
1595+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1596+ sizeof(struct slot_tree_node), 0,
1597+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1598+ NULL);
1599+}
1600+
1601+
1602+/* Each rung of this ladder is a list of VMAs having the same scan ratio */
1603+struct scan_rung {
1604+ //struct list_head scanned_list;
1605+ struct sradix_tree_root vma_root;
1606+ struct sradix_tree_root vma_root2;
1607+
1608+ struct vma_slot *current_scan;
1609+ unsigned long current_offset;
1610+
1611+ /*
1612+ * The initial value for current_offset; it should loop over
1613+ * [0, step - 1] so that every slot gets a chance to be scanned.
1614+ */
1615+ unsigned long offset_init;
1616+ unsigned long step; /* dynamic step for current_offset */
1617+ unsigned int flags;
1618+ unsigned long pages_to_scan;
1619+ //unsigned long fully_scanned_slots;
1620+ /*
1621+ * A little bit tricky - if cpu_time_ratio > 0, then the value is the
1622+ * cpu time ratio it can spend in rung_i for every scan
1623+ * period. If < 0, then it is the cpu time ratio relative to the
1624+ * max cpu percentage the user specified. Both in units of
1625+ * 1/TIME_RATIO_SCALE
1626+ */
1627+ int cpu_ratio;
1628+
1629+ /*
1630+ * How long will it take for all slots in this rung to be fully
1631+ * scanned? If it's zero, we don't care about the cover time:
1632+ * the rung is already fully scanned.
1633+ */
1634+ unsigned int cover_msecs;
1635+ //unsigned long vma_num;
1636+ //unsigned long pages; /* Sum of all slot's pages in rung */
1637+};
1638+
1639+/**
1640+ * node of either the stable or unstable rbtree
1641+ *
1642+ */
1643+struct tree_node {
1644+ struct rb_node node; /* link in the main (un)stable rbtree */
1645+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1646+ u32 hash;
1647+ unsigned long count; /* TODO: merged with sub_root */
1648+ struct list_head all_list; /* all tree nodes in stable/unstable tree */
1649+};
1650+
1651+/**
1652+ * struct stable_node - node of the stable rbtree
1653+ * @node: rb node of this ksm page in the stable tree
1654+ * @hlist: hlist head of rmap_items using this ksm page
1655+ * @kpfn: page frame number of this ksm page
1656+ */
1657+struct stable_node {
1658+ struct rb_node node; /* link in sub-rbtree */
1659+	struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
1660+ struct hlist_head hlist;
1661+ unsigned long kpfn;
1662+ u32 hash_max; /* if ==0 then it's not been calculated yet */
1663+ struct list_head all_list; /* in a list for all stable nodes */
1664+};
1665+
1666+/**
1667+ * struct node_vma - group rmap_items linked in a same stable
1668+ * node together.
1669+ */
1670+struct node_vma {
1671+ union {
1672+ struct vma_slot *slot;
1673+ unsigned long key; /* slot is used as key sorted on hlist */
1674+ };
1675+ struct hlist_node hlist;
1676+ struct hlist_head rmap_hlist;
1677+ struct stable_node *head;
1678+};
1679+
1680+/**
1681+ * struct rmap_item - reverse mapping item for virtual addresses
1682+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1683+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1684+ * @mm: the memory structure this rmap_item is pointing into
1685+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1686+ * @node: rb node of this rmap_item in the unstable tree
1687+ * @head: pointer to stable_node heading this list in the stable tree
1688+ * @hlist: link into hlist of rmap_items hanging off that stable_node
1689+ */
1690+struct rmap_item {
1691+ struct vma_slot *slot;
1692+ struct page *page;
1693+ unsigned long address; /* + low bits used for flags below */
1694+ unsigned long hash_round;
1695+ unsigned long entry_index;
1696+ union {
1697+ struct {/* when in unstable tree */
1698+ struct rb_node node;
1699+ struct tree_node *tree_node;
1700+ u32 hash_max;
1701+ };
1702+ struct { /* when in stable tree */
1703+ struct node_vma *head;
1704+ struct hlist_node hlist;
1705+ struct anon_vma *anon_vma;
1706+ };
1707+ };
1708+} __aligned(4);
1709+
1710+struct rmap_list_entry {
1711+ union {
1712+ struct rmap_item *item;
1713+ unsigned long addr;
1714+ };
1715+ /* lowest bit is used for is_addr tag */
1716+} __aligned(4); /* 4-aligned to fit into pages */
1717+
1718+
1719+/* Basic data structure definition ends */
1720+
1721+
1722+/*
1723+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
1724+ * The flags use the low bits of rmap_item.address
1725+ */
1726+#define UNSTABLE_FLAG 0x1
1727+#define STABLE_FLAG 0x2
1728+#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
1729+
1730+/*
1731+ * rmap_list_entry helpers
1732+ */
1733+#define IS_ADDR_FLAG 1
1734+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
1735+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
1736+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
1737+
1738+
1739+/*
1740+ * High speed caches for frequently allocated and freed structs
1741+ */
1742+static struct kmem_cache *rmap_item_cache;
1743+static struct kmem_cache *stable_node_cache;
1744+static struct kmem_cache *node_vma_cache;
1745+static struct kmem_cache *vma_slot_cache;
1746+static struct kmem_cache *tree_node_cache;
1747+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1748+ sizeof(struct __struct), __alignof__(struct __struct),\
1749+ (__flags), NULL)
1750+
1751+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1752+#define SCAN_LADDER_SIZE 4
1753+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1754+
1755+/* The evaluation rounds uksmd has finished */
1756+static unsigned long long uksm_eval_round = 1;
1757+
1758+/*
1759+ * We add 1 to this variable when we decide that the whole
1760+ * unstable tree should be rebuilt.
1761+ */
1762+static unsigned long uksm_hash_round = 1;
1763+
1764+/*
1765+ * How many times the whole memory is scanned.
1766+ */
1767+static unsigned long long fully_scanned_round = 1;
1768+
1769+/* The total number of virtual pages of all vma slots */
1770+static u64 uksm_pages_total;
1771+
1772+/* The number of pages scanned since startup */
1773+static u64 uksm_pages_scanned;
1774+
1775+static u64 scanned_virtual_pages;
1776+
1777+/* The number of pages scanned since the last encode_benefit call */
1778+static u64 uksm_pages_scanned_last;
1779+
1780+/* If the scanned number is too large, we encode it here */
1781+static u64 pages_scanned_stored;
1782+
1783+static unsigned long pages_scanned_base;
1784+
1785+/* The number of nodes in the stable tree */
1786+static unsigned long uksm_pages_shared;
1787+
1788+/* The number of page slots additionally sharing those nodes */
1789+static unsigned long uksm_pages_sharing;
1790+
1791+/* The number of nodes in the unstable tree */
1792+static unsigned long uksm_pages_unshared;
1793+
1794+/*
1795+ * Milliseconds ksmd should sleep between scans,
1796+ * >= 100ms to be consistent with
1797+ * scan_time_to_sleep_msec()
1798+ */
1799+static unsigned int uksm_sleep_jiffies;
1800+
1801+/* The real value for the uksmd next sleep */
1802+static unsigned int uksm_sleep_real;
1803+
1804+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1805+static unsigned int uksm_sleep_saved;
1806+
1807+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1808+static unsigned int uksm_max_cpu_percentage;
1809+
1810+static int uksm_cpu_governor;
1811+
1812+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1813+
1814+struct uksm_cpu_preset_s {
1815+ int cpu_ratio[SCAN_LADDER_SIZE];
1816+ unsigned int cover_msecs[SCAN_LADDER_SIZE];
1817+ unsigned int max_cpu; /* percentage */
1818+};
1819+
1820+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1821+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1822+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1823+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1824+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1825+};
1826+
1827+/* The default value for uksm_ema_page_time if it's not initialized */
1828+#define UKSM_PAGE_TIME_DEFAULT 500
1829+
1830+/* Cost to scan one page, as an exponential moving average, in nsecs */
1831+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1832+
1833+/* The exponential moving average alpha weight, in percentage. */
1834+#define EMA_ALPHA 20
1835+
1836+/*
1837+ * The threshold used to filter out thrashing areas.
1838+ * If it == 0, filtering is disabled; otherwise it's the percentage upper bound
1839+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
1840+ * will be considered as having a zero duplication ratio.
1841+ */
1842+static unsigned int uksm_thrash_threshold = 50;
1843+
1844+/* How much dedup ratio is considered to be abundant */
1845+static unsigned int uksm_abundant_threshold = 10;
1846+
1847+/* All slots having merged pages in this eval round. */
1848+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1849+
1850+/* How many times the ksmd has slept since startup */
1851+static unsigned long long uksm_sleep_times;
1852+
1853+#define UKSM_RUN_STOP 0
1854+#define UKSM_RUN_MERGE 1
1855+static unsigned int uksm_run = 1;
1856+
1857+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1858+static DEFINE_MUTEX(uksm_thread_mutex);
1859+
1860+/*
1861+ * List vma_slot_new is for newly created vma_slot waiting to be added by
1862+ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
1863+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
1864+ * VMA has been removed/freed.
1865+ */
1866+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1867+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1868+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1869+static DEFINE_SPINLOCK(vma_slot_list_lock);
1870+
1871+/* The unstable tree heads */
1872+static struct rb_root root_unstable_tree = RB_ROOT;
1873+
1874+/*
1875+ * All tree_nodes are in a list to be freed at once when unstable tree is
1876+ * freed after each scan round.
1877+ */
1878+static struct list_head unstable_tree_node_list =
1879+ LIST_HEAD_INIT(unstable_tree_node_list);
1880+
1881+/* List contains all stable nodes */
1882+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1883+
1884+/*
1885+ * When the hash strength is changed, the stable tree must be delta_hashed and
1886+ * re-structured. We use the two sets of structs below to speed up the
1887+ * re-structuring of the stable tree.
1888+ */
1889+static struct list_head
1890+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1891+ LIST_HEAD_INIT(stable_tree_node_list[1])};
1892+
1893+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1894+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1895+static struct rb_root *root_stable_treep = &root_stable_tree[0];
1896+static unsigned long stable_tree_index;
1897+
1898+/* The hash strength needed to hash a full page */
1899+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
1900+
1901+/* The hash strength needed for loop-back hashing */
1902+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
1903+
1904+/* The random offsets in a page */
1905+static u32 *random_nums;
1906+
1907+/* The hash strength */
1908+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
1909+
1910+/* The delta value each time the hash strength increases or decreases */
1911+static unsigned long hash_strength_delta;
1912+#define HASH_STRENGTH_DELTA_MAX 5
1913+
1914+/* The time we have saved due to random_sample_hash */
1915+static u64 rshash_pos;
1916+
1917+/* The time we have wasted due to hash collision */
1918+static u64 rshash_neg;
1919+
1920+struct uksm_benefit {
1921+ u64 pos;
1922+ u64 neg;
1923+ u64 scanned;
1924+ unsigned long base;
1925+} benefit;
1926+
1927+/*
1928+ * The relative cost of memcmp, compared to 1 time unit of random sample
1929+ * hash. This value is measured when the ksm module is initialized.
1930+ */
1931+static unsigned long memcmp_cost;
1932+
1933+static unsigned long rshash_neg_cont_zero;
1934+static unsigned long rshash_cont_obscure;
1935+
1936+/* The possible states of hash strength adjustment heuristic */
1937+enum rshash_states {
1938+ RSHASH_STILL,
1939+ RSHASH_TRYUP,
1940+ RSHASH_TRYDOWN,
1941+ RSHASH_NEW,
1942+ RSHASH_PRE_STILL,
1943+};
1944+
1945+/* The possible direction we are about to adjust hash strength */
1946+enum rshash_direct {
1947+ GO_UP,
1948+ GO_DOWN,
1949+ OBSCURE,
1950+ STILL,
1951+};
1952+
1953+/* random sampling hash state machine */
1954+static struct {
1955+ enum rshash_states state;
1956+ enum rshash_direct pre_direct;
1957+ u8 below_count;
1958+ /* Keep a lookup window of size 5; if above_count/below_count > 3
1959+ * within this window, we stop trying.
1960+ */
1961+ u8 lookup_window_index;
1962+ u64 stable_benefit;
1963+ unsigned long turn_point_down;
1964+ unsigned long turn_benefit_down;
1965+ unsigned long turn_point_up;
1966+ unsigned long turn_benefit_up;
1967+ unsigned long stable_point;
1968+} rshash_state;
1969+
1970+/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
1971+static u32 *zero_hash_table;
1972+
1973+static inline struct node_vma *alloc_node_vma(void)
1974+{
1975+ struct node_vma *node_vma;
1976+
1977+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
1978+ __GFP_NORETRY | __GFP_NOWARN);
1979+ if (node_vma) {
1980+ INIT_HLIST_HEAD(&node_vma->rmap_hlist);
1981+ INIT_HLIST_NODE(&node_vma->hlist);
1982+ }
1983+ return node_vma;
1984+}
1985+
1986+static inline void free_node_vma(struct node_vma *node_vma)
1987+{
1988+ kmem_cache_free(node_vma_cache, node_vma);
1989+}
1990+
1991+
1992+static inline struct vma_slot *alloc_vma_slot(void)
1993+{
1994+ struct vma_slot *slot;
1995+
1996+ /*
1997+ * In case ksm has not been initialized by now.
1998+ * The call site of uksm_init() may need to be reconsidered in the future.
1999+ */
2000+ if (!vma_slot_cache)
2001+ return NULL;
2002+
2003+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
2004+ __GFP_NORETRY | __GFP_NOWARN);
2005+ if (slot) {
2006+ INIT_LIST_HEAD(&slot->slot_list);
2007+ INIT_LIST_HEAD(&slot->dedup_list);
2008+ slot->flags |= UKSM_SLOT_NEED_RERAND;
2009+ }
2010+ return slot;
2011+}
2012+
2013+static inline void free_vma_slot(struct vma_slot *vma_slot)
2014+{
2015+ kmem_cache_free(vma_slot_cache, vma_slot);
2016+}
2017+
2018+
2019+
2020+static inline struct rmap_item *alloc_rmap_item(void)
2021+{
2022+ struct rmap_item *rmap_item;
2023+
2024+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
2025+ __GFP_NORETRY | __GFP_NOWARN);
2026+ if (rmap_item) {
2027+ /* bug on lowest bit is not clear for flag use */
2028+ BUG_ON(is_addr(rmap_item));
2029+ }
2030+ return rmap_item;
2031+}
2032+
2033+static inline void free_rmap_item(struct rmap_item *rmap_item)
2034+{
2035+ rmap_item->slot = NULL; /* debug safety */
2036+ kmem_cache_free(rmap_item_cache, rmap_item);
2037+}
2038+
2039+static inline struct stable_node *alloc_stable_node(void)
2040+{
2041+ struct stable_node *node;
2042+
2043+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
2044+ __GFP_NORETRY | __GFP_NOWARN);
2045+ if (!node)
2046+ return NULL;
2047+
2048+ INIT_HLIST_HEAD(&node->hlist);
2049+ list_add(&node->all_list, &stable_node_list);
2050+ return node;
2051+}
2052+
2053+static inline void free_stable_node(struct stable_node *stable_node)
2054+{
2055+ list_del(&stable_node->all_list);
2056+ kmem_cache_free(stable_node_cache, stable_node);
2057+}
2058+
2059+static inline struct tree_node *alloc_tree_node(struct list_head *list)
2060+{
2061+ struct tree_node *node;
2062+
2063+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
2064+ __GFP_NORETRY | __GFP_NOWARN);
2065+ if (!node)
2066+ return NULL;
2067+
2068+ list_add(&node->all_list, list);
2069+ return node;
2070+}
2071+
2072+static inline void free_tree_node(struct tree_node *node)
2073+{
2074+ list_del(&node->all_list);
2075+ kmem_cache_free(tree_node_cache, node);
2076+}
2077+
2078+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2079+{
2080+ struct anon_vma *anon_vma = rmap_item->anon_vma;
2081+
2082+ put_anon_vma(anon_vma);
2083+}
2084+
2085+
2086+/**
2087+ * Remove a stable node from the stable tree. It may be unlinked from its
2088+ * tree_node, and that tree_node may be removed if no other stable node remains.
2089+ *
2090+ * @stable_node The node to be removed
2091+ * @unlink_rb Will this node be unlinked from the rbtree?
2092+ * @remove_tree_node Will its tree_node be removed if empty?
2093+ */
2094+static void remove_node_from_stable_tree(struct stable_node *stable_node,
2095+ int unlink_rb, int remove_tree_node)
2096+{
2097+ struct node_vma *node_vma;
2098+ struct rmap_item *rmap_item;
2099+ struct hlist_node *n;
2100+
2101+ if (!hlist_empty(&stable_node->hlist)) {
2102+ hlist_for_each_entry_safe(node_vma, n,
2103+ &stable_node->hlist, hlist) {
2104+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2105+ uksm_pages_sharing--;
2106+
2107+ uksm_drop_anon_vma(rmap_item);
2108+ rmap_item->address &= PAGE_MASK;
2109+ }
2110+ free_node_vma(node_vma);
2111+ cond_resched();
2112+ }
2113+
2114+ /* the last one is counted as shared */
2115+ uksm_pages_shared--;
2116+ uksm_pages_sharing++;
2117+ }
2118+
2119+ if (stable_node->tree_node && unlink_rb) {
2120+ rb_erase(&stable_node->node,
2121+ &stable_node->tree_node->sub_root);
2122+
2123+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2124+ remove_tree_node) {
2125+ rb_erase(&stable_node->tree_node->node,
2126+ root_stable_treep);
2127+ free_tree_node(stable_node->tree_node);
2128+ } else {
2129+ stable_node->tree_node->count--;
2130+ }
2131+ }
2132+
2133+ free_stable_node(stable_node);
2134+}
2135+
2136+
2137+/*
2138+ * get_uksm_page: checks if the page indicated by the stable node
2139+ * is still its ksm page, despite having held no reference to it.
2140+ * In which case we can trust the content of the page, and it
2141+ * returns the gotten page; but if the page has now been zapped,
2142+ * remove the stale node from the stable tree and return NULL.
2143+ *
2144+ * You would expect the stable_node to hold a reference to the ksm page.
2145+ * But if it increments the page's count, swapping out has to wait for
2146+ * ksmd to come around again before it can free the page, which may take
2147+ * seconds or even minutes: much too unresponsive. So instead we use a
2148+ * "keyhole reference": access to the ksm page from the stable node peeps
2149+ * out through its keyhole to see if that page still holds the right key,
2150+ * pointing back to this stable node. This relies on freeing a PageAnon
2151+ * page to reset its page->mapping to NULL, and relies on no other use of
2152+ * a page to put something that might look like our key in page->mapping.
2153+ *
2154+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2155+ * but this is different - made simpler by uksm_thread_mutex being held, but
2156+ * interesting for assuming that no other use of the struct page could ever
2157+ * put our expected_mapping into page->mapping (or a field of the union which
2158+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
2159+ * to keep the page_count protocol described with page_cache_get_speculative.
2160+ *
2161+ * Note: it is possible that get_uksm_page() will return NULL one moment,
2162+ * then page the next, if the page is in between page_freeze_refs() and
2163+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2164+ * is on its way to being freed; but it is an anomaly to bear in mind.
2165+ *
2166+ * @unlink_rb: whether removing this node should first unlink it from
2167+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
2168+ * node from its old tree.
2169+ *
2170+ * @remove_tree_node: if this is the last one in its tree_node, should the
2171+ * tree_node be freed? If we are inserting a stable node, this tree_node may
2172+ * be reused, so don't free it.
2173+ */
2174+static struct page *get_uksm_page(struct stable_node *stable_node,
2175+ int unlink_rb, int remove_tree_node)
2176+{
2177+ struct page *page;
2178+ void *expected_mapping;
2179+ unsigned long kpfn;
2180+
2181+ expected_mapping = (void *)((unsigned long)stable_node |
2182+ PAGE_MAPPING_KSM);
2183+again:
2184+ kpfn = READ_ONCE(stable_node->kpfn);
2185+ page = pfn_to_page(kpfn);
2186+
2187+ /*
2188+ * page is computed from kpfn, so on most architectures reading
2189+ * page->mapping is naturally ordered after reading node->kpfn,
2190+ * but on Alpha we need to be more careful.
2191+ */
2192+ smp_read_barrier_depends();
2193+
2194+ if (READ_ONCE(page->mapping) != expected_mapping)
2195+ goto stale;
2196+
2197+ /*
2198+ * We cannot do anything with the page while its refcount is 0.
2199+ * Usually 0 means free, or tail of a higher-order page: in which
2200+ * case this node is no longer referenced, and should be freed;
2201+ * however, it might mean that the page is under page_freeze_refs().
2202+ * The __remove_mapping() case is easy, again the node is now stale;
2203+ * but if page is swapcache in migrate_page_move_mapping(), it might
2204+ * still be our page, in which case it's essential to keep the node.
2205+ */
2206+ while (!get_page_unless_zero(page)) {
2207+ /*
2208+ * Another check for page->mapping != expected_mapping would
2209+ * work here too. We have chosen the !PageSwapCache test to
2210+ * optimize the common case, when the page is or is about to
2211+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
2212+ * in the freeze_refs section of __remove_mapping(); but Anon
2213+ * page->mapping reset to NULL later, in free_pages_prepare().
2214+ */
2215+ if (!PageSwapCache(page))
2216+ goto stale;
2217+ cpu_relax();
2218+ }
2219+
2220+ if (READ_ONCE(page->mapping) != expected_mapping) {
2221+ put_page(page);
2222+ goto stale;
2223+ }
2224+
2225+ lock_page(page);
2226+ if (READ_ONCE(page->mapping) != expected_mapping) {
2227+ unlock_page(page);
2228+ put_page(page);
2229+ goto stale;
2230+ }
2231+ unlock_page(page);
2232+ return page;
2233+stale:
2234+ /*
2235+ * We come here from above when page->mapping or !PageSwapCache
2236+ * suggests that the node is stale; but it might be under migration.
2237+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
2238+ * before checking whether node->kpfn has been changed.
2239+ */
2240+ smp_rmb();
2241+ if (stable_node->kpfn != kpfn)
2242+ goto again;
2243+
2244+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2245+
2246+ return NULL;
2247+}
2248+
2249+/*
2250+ * Removing rmap_item from stable or unstable tree.
2251+ * This function will clean the information from the stable/unstable tree.
2252+ */
2253+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2254+{
2255+ if (rmap_item->address & STABLE_FLAG) {
2256+ struct stable_node *stable_node;
2257+ struct node_vma *node_vma;
2258+ struct page *page;
2259+
2260+ node_vma = rmap_item->head;
2261+ stable_node = node_vma->head;
2262+ page = get_uksm_page(stable_node, 1, 1);
2263+ if (!page)
2264+ goto out;
2265+
2266+ /*
2267+ * page lock is needed because it's racing with
2268+ * try_to_unmap_ksm(), etc.
2269+ */
2270+ lock_page(page);
2271+ hlist_del(&rmap_item->hlist);
2272+
2273+ if (hlist_empty(&node_vma->rmap_hlist)) {
2274+ hlist_del(&node_vma->hlist);
2275+ free_node_vma(node_vma);
2276+ }
2277+ unlock_page(page);
2278+
2279+ put_page(page);
2280+ if (hlist_empty(&stable_node->hlist)) {
2281+ /* do NOT call remove_node_from_stable_tree() here,
2282+ * it's possible for a forked rmap_item to be absent
2283+ * from the stable tree while the in-tree rmap_items
2284+ * have been deleted.
2285+ */
2286+ uksm_pages_shared--;
2287+ } else
2288+ uksm_pages_sharing--;
2289+
2290+
2291+ uksm_drop_anon_vma(rmap_item);
2292+ } else if (rmap_item->address & UNSTABLE_FLAG) {
2293+ if (rmap_item->hash_round == uksm_hash_round) {
2294+
2295+ rb_erase(&rmap_item->node,
2296+ &rmap_item->tree_node->sub_root);
2297+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2298+ rb_erase(&rmap_item->tree_node->node,
2299+ &root_unstable_tree);
2300+
2301+ free_tree_node(rmap_item->tree_node);
2302+ } else
2303+ rmap_item->tree_node->count--;
2304+ }
2305+ uksm_pages_unshared--;
2306+ }
2307+
2308+ rmap_item->address &= PAGE_MASK;
2309+ rmap_item->hash_max = 0;
2310+
2311+out:
2312+ cond_resched(); /* we're called from many long loops */
2313+}
2314+
2315+static inline int slot_in_uksm(struct vma_slot *slot)
2316+{
2317+ return list_empty(&slot->slot_list);
2318+}
2319+
2320+/*
2321+ * Test if the mm is exiting
2322+ */
2323+static inline bool uksm_test_exit(struct mm_struct *mm)
2324+{
2325+ return atomic_read(&mm->mm_users) == 0;
2326+}
2327+
2328+static inline unsigned long vma_pool_size(struct vma_slot *slot)
2329+{
2330+ return round_up(sizeof(struct rmap_list_entry) * slot->pages,
2331+ PAGE_SIZE) >> PAGE_SHIFT;
2332+}
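
A quick worked example of the arithmetic in vma_pool_size(): the pool holds one page pointer per PAGE_SIZE worth of rmap_list_entry records. A userspace sketch, assuming a 16-byte rmap_list_entry (an assumption; the real size may differ) and 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_SHIFT 12

    /* userspace stand-in for the kernel's round_up() */
    static unsigned long round_up_ul(unsigned long x, unsigned long align)
    {
        return (x + align - 1) / align * align;
    }

    int main(void)
    {
        unsigned long entry_size = 16;   /* assumed sizeof(struct rmap_list_entry) */
        unsigned long vma_pages = 1000;

        unsigned long pool_pages =
            round_up_ul(entry_size * vma_pages, PAGE_SIZE) >> PAGE_SHIFT;

        printf("pool pages needed: %lu\n", pool_pages); /* 4 */
        return 0;
    }
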
2333+
2334+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
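
CAN_OVERFLOW_U64(x, delta) is true exactly when x + delta would wrap around 64 bits; it is used below to decide when the benefit accumulators must be rescaled. A trivial userspace check (UINT64_MAX standing in for the kernel's U64_MAX):

    #include <stdint.h>
    #include <stdio.h>

    /* userspace copy of the macro above */
    #define CAN_OVERFLOW_U64(x, delta) (UINT64_MAX - (x) < (delta))

    int main(void)
    {
        uint64_t x = UINT64_MAX - 5;

        printf("%d\n", CAN_OVERFLOW_U64(x, (uint64_t)5)); /* 0: x + 5 still fits */
        printf("%d\n", CAN_OVERFLOW_U64(x, (uint64_t)6)); /* 1: x + 6 would wrap */
        return 0;
    }
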
2335+
2336+/* must be done with sem locked */
2337+static int slot_pool_alloc(struct vma_slot *slot)
2338+{
2339+ unsigned long pool_size;
2340+
2341+ if (slot->rmap_list_pool)
2342+ return 0;
2343+
2344+ pool_size = vma_pool_size(slot);
2345+ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
2346+ GFP_KERNEL);
2347+ if (!slot->rmap_list_pool)
2348+ return -ENOMEM;
2349+
2350+ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
2351+ GFP_KERNEL);
2352+ if (!slot->pool_counts) {
2353+ kfree(slot->rmap_list_pool);
2354+ return -ENOMEM;
2355+ }
2356+
2357+ slot->pool_size = pool_size;
2358+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
2359+ slot->flags |= UKSM_SLOT_IN_UKSM;
2360+ uksm_pages_total += slot->pages;
2361+
2362+ return 0;
2363+}
2364+
2365+/*
2366+ * Called after vma is unlinked from its mm
2367+ */
2368+void uksm_remove_vma(struct vm_area_struct *vma)
2369+{
2370+ struct vma_slot *slot;
2371+
2372+ if (!vma->uksm_vma_slot)
2373+ return;
2374+
2375+ spin_lock(&vma_slot_list_lock);
2376+ slot = vma->uksm_vma_slot;
2377+ if (!slot)
2378+ goto out;
2379+
2380+ if (slot_in_uksm(slot)) {
2381+ /**
2382+ * This slot has been added by ksmd, so move it to the del list
2383+ * and wait for ksmd to free it.
2384+ */
2385+ list_add_tail(&slot->slot_list, &vma_slot_del);
2386+ } else {
2387+ /**
2388+ * It's still on the new list, so it's OK to free the slot directly.
2389+ */
2390+ list_del(&slot->slot_list);
2391+ free_vma_slot(slot);
2392+ }
2393+out:
2394+ vma->uksm_vma_slot = NULL;
2395+ spin_unlock(&vma_slot_list_lock);
2396+}
2397+
2398+/**
2399+ * Need to do two things:
2400+ * 1. check if the slot was moved to the del list
2401+ * 2. make sure the mmap_lock is manipulated under a valid vma.
2402+ *
2403+ * My concern here is that in some cases this may cause
2404+ * vma_slot_list_lock waiters to be further serialized by some
2405+ * sem->wait_lock; can this really be expensive?
2406+ *
2407+ *
2408+ * @return
2409+ * 0: if successfully locked mmap_lock
2410+ * -ENOENT: this slot was moved to del list
2411+ * -EBUSY: vma lock failed
2412+ */
2413+static int try_down_read_slot_mmap_lock(struct vma_slot *slot)
2414+{
2415+ struct vm_area_struct *vma;
2416+ struct mm_struct *mm;
2417+ struct rw_semaphore *sem;
2418+
2419+ spin_lock(&vma_slot_list_lock);
2420+
2421+ /* the slot_list was removed and re-inited from the new list when the slot
2422+ * entered uksm. If it is not empty now, it must have been moved to the del list.
2423+ */
2424+ if (!slot_in_uksm(slot)) {
2425+ spin_unlock(&vma_slot_list_lock);
2426+ return -ENOENT;
2427+ }
2428+
2429+ BUG_ON(slot->pages != vma_pages(slot->vma));
2430+ /* Ok, vma still valid */
2431+ vma = slot->vma;
2432+ mm = vma->vm_mm;
2433+
2434+
2435+ if (uksm_test_exit(mm)) {
2436+ spin_unlock(&vma_slot_list_lock);
2437+ return -ENOENT;
2438+ }
2439+
2440+ if (mmap_read_trylock(mm)) {
2441+ spin_unlock(&vma_slot_list_lock);
2442+ if (slot_pool_alloc(slot)) {
2443+ uksm_remove_vma(vma);
2444+ mmap_read_unlock(mm);
2445+ return -ENOENT;
2446+ }
2447+ return 0;
2448+ }
2449+
2450+ spin_unlock(&vma_slot_list_lock);
2451+ return -EBUSY;
2452+}
2453+
2454+static inline unsigned long
2455+vma_page_address(struct page *page, struct vm_area_struct *vma)
2456+{
2457+ pgoff_t pgoff = page->index;
2458+ unsigned long address;
2459+
2460+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2461+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2462+ /* page should be within @vma mapping range */
2463+ return -EFAULT;
2464+ }
2465+ return address;
2466+}
2467+
2468+
2469+/* return 0 on success with the item's mmap_lock locked */
2470+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2471+{
2472+ struct mm_struct *mm;
2473+ struct vma_slot *slot = item->slot;
2474+ int err = -EINVAL;
2475+
2476+ struct page *page;
2477+
2478+ /*
2479+ * try_down_read_slot_mmap_lock() returns non-zero if the slot
2480+ * has been removed by uksm_remove_vma().
2481+ */
2482+ if (try_down_read_slot_mmap_lock(slot))
2483+ return -EBUSY;
2484+
2485+ mm = slot->vma->vm_mm;
2486+
2487+ if (uksm_test_exit(mm))
2488+ goto failout_up;
2489+
2490+ page = item->page;
2491+ rcu_read_lock();
2492+ if (!get_page_unless_zero(page)) {
2493+ rcu_read_unlock();
2494+ goto failout_up;
2495+ }
2496+
2497+ /* No need to consider huge page here. */
2498+ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2499+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2500+ /*
2501+ * TODO:
2502+ * should we release this item because of its stale page
2503+ * mapping?
2504+ */
2505+ put_page(page);
2506+ rcu_read_unlock();
2507+ goto failout_up;
2508+ }
2509+ rcu_read_unlock();
2510+ return 0;
2511+
2512+failout_up:
2513+ mmap_read_unlock(mm);
2514+ return err;
2515+}
2516+
2517+/*
2518+ * What kind of VMA is considered?
2519+ */
2520+static inline int vma_can_enter(struct vm_area_struct *vma)
2521+{
2522+ return uksm_flags_can_scan(vma->vm_flags);
2523+}
2524+
2525+/*
2526+ * Called whenever a fresh new vma is created. A new vma_slot
2527+ * is created and inserted into a global list. Must be called
2528+ * after the vma is inserted into its mm.
2529+ */
2530+void uksm_vma_add_new(struct vm_area_struct *vma)
2531+{
2532+ struct vma_slot *slot;
2533+
2534+ if (!vma_can_enter(vma)) {
2535+ vma->uksm_vma_slot = NULL;
2536+ return;
2537+ }
2538+
2539+ slot = alloc_vma_slot();
2540+ if (!slot) {
2541+ vma->uksm_vma_slot = NULL;
2542+ return;
2543+ }
2544+
2545+ vma->uksm_vma_slot = slot;
2546+ vma->vm_flags |= VM_MERGEABLE;
2547+ slot->vma = vma;
2548+ slot->mm = vma->vm_mm;
2549+ slot->ctime_j = jiffies;
2550+ slot->pages = vma_pages(vma);
2551+ spin_lock(&vma_slot_list_lock);
2552+ list_add_tail(&slot->slot_list, &vma_slot_new);
2553+ spin_unlock(&vma_slot_list_lock);
2554+}
2555+
2556+/* 32/3 < they < 32/2 */
2557+#define shiftl 8
2558+#define shiftr 12
2559+
2560+#define HASH_FROM_TO(from, to) \
2561+for (index = from; index < to; index++) { \
2562+ pos = random_nums[index]; \
2563+ hash += key[pos]; \
2564+ hash += (hash << shiftl); \
2565+ hash ^= (hash >> shiftr); \
2566+}
2567+
2568+
2569+#define HASH_FROM_DOWN_TO(from, to) \
2570+for (index = from - 1; index >= to; index--) { \
2571+ hash ^= (hash >> shiftr); \
2572+ hash ^= (hash >> (shiftr*2)); \
2573+ hash -= (hash << shiftl); \
2574+ hash += (hash << (shiftl*2)); \
2575+ pos = random_nums[index]; \
2576+ hash -= key[pos]; \
2577+}
2578+
2579+/*
2580+ * The main random sample hash function.
2581+ */
2582+static u32 random_sample_hash(void *addr, u32 hash_strength)
2583+{
2584+ u32 hash = 0xdeadbeef;
2585+ int index, pos, loop = hash_strength;
2586+ u32 *key = (u32 *)addr;
2587+
2588+ if (loop > HASH_STRENGTH_FULL)
2589+ loop = HASH_STRENGTH_FULL;
2590+
2591+ HASH_FROM_TO(0, loop);
2592+
2593+ if (hash_strength > HASH_STRENGTH_FULL) {
2594+ loop = hash_strength - HASH_STRENGTH_FULL;
2595+ HASH_FROM_TO(0, loop);
2596+ }
2597+
2598+ return hash;
2599+}
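
To see what random_sample_hash() and the HASH_FROM_TO step do, here is a toy userspace re-implementation over a 16-word buffer (not the kernel code). It also shows the property that delta_hash() below relies on when the strength grows: continuing the mix from an existing weak hash gives the same result as hashing at the higher strength from scratch. The offsets are left in order here; the kernel uses a shuffled random_nums[] array.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define WORDS 16 /* a toy "page" of 16 u32 words instead of PAGE_SIZE / 4 */
    #define SHIFTL 8
    #define SHIFTR 12

    /* same mixing step as HASH_FROM_TO above, over a toy-sized buffer */
    static uint32_t sample_hash(uint32_t hash, const uint32_t *key,
                                const int *offsets, int from, int to)
    {
        for (int i = from; i < to; i++) {
            int pos = offsets[i];

            hash += key[pos];
            hash += hash << SHIFTL;
            hash ^= hash >> SHIFTR;
        }
        return hash;
    }

    int main(void)
    {
        uint32_t page[WORDS];
        int offsets[WORDS];

        for (int i = 0; i < WORDS; i++) {
            page[i] = (uint32_t)rand();
            offsets[i] = i; /* the kernel shuffles these offsets randomly */
        }

        uint32_t weak = sample_hash(0xdeadbeef, page, offsets, 0, 4);
        uint32_t full = sample_hash(0xdeadbeef, page, offsets, 0, WORDS);

        /* extending the weak hash from strength 4 to 16 ("delta" hashing up)
         * gives the same result as hashing at strength 16 directly */
        uint32_t delta = sample_hash(weak, page, offsets, 4, WORDS);

        printf("weak=%08x full=%08x delta=%08x\n",
               (unsigned)weak, (unsigned)full, (unsigned)delta);
        return 0;
    }
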
2600+
2601+
2602+/**
2603+ * Used when the hash strength is adjusted.
2604+ *
2605+ * @addr The page's virtual address
2606+ * @from The original hash strength
2607+ * @to The hash strength changed to
2608+ * @hash The hash value generated at the "from" hash strength
2609+ *
2610+ * return the new hash value
2611+ */
2612+static u32 delta_hash(void *addr, int from, int to, u32 hash)
2613+{
2614+ u32 *key = (u32 *)addr;
2615+ int index, pos; /* make sure they are int type */
2616+
2617+ if (to > from) {
2618+ if (from >= HASH_STRENGTH_FULL) {
2619+ from -= HASH_STRENGTH_FULL;
2620+ to -= HASH_STRENGTH_FULL;
2621+ HASH_FROM_TO(from, to);
2622+ } else if (to <= HASH_STRENGTH_FULL) {
2623+ HASH_FROM_TO(from, to);
2624+ } else {
2625+ HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2626+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2627+ }
2628+ } else {
2629+ if (from <= HASH_STRENGTH_FULL) {
2630+ HASH_FROM_DOWN_TO(from, to);
2631+ } else if (to >= HASH_STRENGTH_FULL) {
2632+ from -= HASH_STRENGTH_FULL;
2633+ to -= HASH_STRENGTH_FULL;
2634+ HASH_FROM_DOWN_TO(from, to);
2635+ } else {
2636+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2637+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
2638+ }
2639+ }
2640+
2641+ return hash;
2642+}
2643+
2644+/**
2645+ *
2646+ * Called when rshash_pos or rshash_neg is about to overflow, or when a scan
2647+ * round has finished.
2648+ *
2649+ * return 0 if no page has been scanned since the last call, 1 otherwise.
2650+ */
2651+static inline int encode_benefit(void)
2652+{
2653+ u64 scanned_delta, pos_delta, neg_delta;
2654+ unsigned long base = benefit.base;
2655+
2656+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2657+
2658+ if (!scanned_delta)
2659+ return 0;
2660+
2661+ scanned_delta >>= base;
2662+ pos_delta = rshash_pos >> base;
2663+ neg_delta = rshash_neg >> base;
2664+
2665+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2666+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2667+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2668+ benefit.scanned >>= 1;
2669+ benefit.neg >>= 1;
2670+ benefit.pos >>= 1;
2671+ benefit.base++;
2672+ scanned_delta >>= 1;
2673+ pos_delta >>= 1;
2674+ neg_delta >>= 1;
2675+ }
2676+
2677+ benefit.pos += pos_delta;
2678+ benefit.neg += neg_delta;
2679+ benefit.scanned += scanned_delta;
2680+
2681+ BUG_ON(!benefit.scanned);
2682+
2683+ rshash_pos = rshash_neg = 0;
2684+ uksm_pages_scanned_last = uksm_pages_scanned;
2685+
2686+ return 1;
2687+}
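
The base-shift in encode_benefit() keeps only the ratio information: when any accumulator is close to overflow, pos, neg and scanned are all halved and base is incremented, which leaves the ratio pos/(pos+neg) unchanged while keeping the numbers in range. A toy userspace illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t pos = 6000, neg = 2000, scanned = 10000;
        unsigned long base = 0;

        double before = (double)pos / (double)(pos + neg);

        /* pretend an accumulator is about to overflow: halve everything and
         * bump the base, as encode_benefit() does */
        pos >>= 1;
        neg >>= 1;
        scanned >>= 1;
        base++;

        double after = (double)pos / (double)(pos + neg);

        printf("base=%lu ratio before=%.3f after=%.3f scanned=%llu\n",
               base, before, after, (unsigned long long)scanned);
        return 0;
    }
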
2688+
2689+static inline void reset_benefit(void)
2690+{
2691+ benefit.pos = 0;
2692+ benefit.neg = 0;
2693+ benefit.base = 0;
2694+ benefit.scanned = 0;
2695+}
2696+
2697+static inline void inc_rshash_pos(unsigned long delta)
2698+{
2699+ if (CAN_OVERFLOW_U64(rshash_pos, delta))
2700+ encode_benefit();
2701+
2702+ rshash_pos += delta;
2703+}
2704+
2705+static inline void inc_rshash_neg(unsigned long delta)
2706+{
2707+ if (CAN_OVERFLOW_U64(rshash_neg, delta))
2708+ encode_benefit();
2709+
2710+ rshash_neg += delta;
2711+}
2712+
2713+
2714+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2715+ int cost_accounting)
2716+{
2717+ u32 val;
2718+ unsigned long delta;
2719+
2720+ void *addr = kmap_atomic(page);
2721+
2722+ val = random_sample_hash(addr, hash_strength);
2723+ kunmap_atomic(addr);
2724+
2725+ if (cost_accounting) {
2726+ if (hash_strength < HASH_STRENGTH_FULL)
2727+ delta = HASH_STRENGTH_FULL - hash_strength;
2728+ else
2729+ delta = 0;
2730+
2731+ inc_rshash_pos(delta);
2732+ }
2733+
2734+ return val;
2735+}
2736+
2737+static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
2738+ int cost_accounting)
2739+{
2740+ char *addr1, *addr2;
2741+ int ret;
2742+
2743+ addr1 = kmap_atomic(page1);
2744+ addr2 = kmap_atomic(page2);
2745+ ret = memcmp(addr1, addr2, PAGE_SIZE);
2746+ kunmap_atomic(addr2);
2747+ kunmap_atomic(addr1);
2748+
2749+ if (cost_accounting)
2750+ inc_rshash_neg(memcmp_cost);
2751+
2752+ return ret;
2753+}
2754+
2755+static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
2756+{
2757+ return !memcmp_pages_with_cost(page1, page2, 0);
2758+}
2759+
2760+static inline int is_page_full_zero(struct page *page)
2761+{
2762+ char *addr;
2763+ int ret;
2764+
2765+ addr = kmap_atomic(page);
2766+ ret = is_full_zero(addr, PAGE_SIZE);
2767+ kunmap_atomic(addr);
2768+
2769+ return ret;
2770+}
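
is_page_full_zero() relies on a helper that scans the whole page for any nonzero bit. A userspace stand-in for that check (buf_is_full_zero() is a sketch, not the kernel's is_full_zero()):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* word-wise scan for any nonzero bit in a buffer */
    static bool buf_is_full_zero(const void *addr, size_t len)
    {
        const unsigned long *p = addr;
        size_t words = len / sizeof(*p);

        for (size_t i = 0; i < words; i++)
            if (p[i])
                return false;
        return true;
    }

    int main(void)
    {
        static unsigned long page[4096 / sizeof(unsigned long)];

        printf("%d\n", buf_is_full_zero(page, sizeof(page))); /* 1: all zero */
        ((unsigned char *)page)[100] = 7;
        printf("%d\n", buf_is_full_zero(page, sizeof(page))); /* 0 */
        return 0;
    }
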
2771+
2772+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2773+ pte_t *orig_pte, pte_t *old_pte)
2774+{
2775+ struct mm_struct *mm = vma->vm_mm;
2776+ struct page_vma_mapped_walk pvmw = {
2777+ .page = page,
2778+ .vma = vma,
2779+ };
2780+ struct mmu_notifier_range range;
2781+ int swapped;
2782+ int err = -EFAULT;
2783+
2784+ pvmw.address = page_address_in_vma(page, vma);
2785+ if (pvmw.address == -EFAULT)
2786+ goto out;
2787+
2788+ BUG_ON(PageTransCompound(page));
2789+
2790+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
2791+ pvmw.address + PAGE_SIZE);
2792+ mmu_notifier_invalidate_range_start(&range);
2793+
2794+ if (!page_vma_mapped_walk(&pvmw))
2795+ goto out_mn;
2796+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
2797+ goto out_unlock;
2798+
2799+ if (old_pte)
2800+ *old_pte = *pvmw.pte;
2801+
2802+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
2803+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
2804+ pte_t entry;
2805+
2806+ swapped = PageSwapCache(page);
2807+ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
2808+ /*
2809+ * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
2810+ * take any lock, therefore the check that we are going to make
2811+ * with the page count against the map count is racy and
2812+ * O_DIRECT can happen right after the check.
2813+ * So we clear the pte and flush the tlb before the check;
2814+ * this assures us that no O_DIRECT can happen after the check
2815+ * or in the middle of the check.
2816+ */
2817+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
2818+ /*
2819+ * Check that no O_DIRECT or similar I/O is in progress on the
2820+ * page
2821+ */
2822+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2823+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
2824+ goto out_unlock;
2825+ }
2826+ if (pte_dirty(entry))
2827+ set_page_dirty(page);
2828+
2829+ if (pte_protnone(entry))
2830+ entry = pte_mkclean(pte_clear_savedwrite(entry));
2831+ else
2832+ entry = pte_mkclean(pte_wrprotect(entry));
2833+
2834+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
2835+ }
2836+ *orig_pte = *pvmw.pte;
2837+ err = 0;
2838+
2839+out_unlock:
2840+ page_vma_mapped_walk_done(&pvmw);
2841+out_mn:
2842+ mmu_notifier_invalidate_range_end(&range);
2843+out:
2844+ return err;
2845+}
2846+
2847+#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2848+#define MERGE_ERR_COLLI 2 /* there is a collision */
2849+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
2850+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
2851+
2852+
2853+/**
2854+ * replace_page - replace page in vma by new ksm page
2855+ * @vma: vma that holds the pte pointing to page
2856+ * @page: the page we are replacing by kpage
2857+ * @kpage: the ksm page we replace page by
2858+ * @orig_pte: the original value of the pte
2859+ *
2860+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
2861+ */
2862+static int replace_page(struct vm_area_struct *vma, struct page *page,
2863+ struct page *kpage, pte_t orig_pte)
2864+{
2865+ struct mm_struct *mm = vma->vm_mm;
2866+ struct mmu_notifier_range range;
2867+ pgd_t *pgd;
2868+ p4d_t *p4d;
2869+ pud_t *pud;
2870+ pmd_t *pmd;
2871+ pte_t *ptep;
2872+ spinlock_t *ptl;
2873+ pte_t entry;
2874+
2875+ unsigned long addr;
2876+ int err = MERGE_ERR_PGERR;
2877+
2878+ addr = page_address_in_vma(page, vma);
2879+ if (addr == -EFAULT)
2880+ goto out;
2881+
2882+ pgd = pgd_offset(mm, addr);
2883+ if (!pgd_present(*pgd))
2884+ goto out;
2885+
2886+ p4d = p4d_offset(pgd, addr);
2887+ pud = pud_offset(p4d, addr);
2888+ if (!pud_present(*pud))
2889+ goto out;
2890+
2891+ pmd = pmd_offset(pud, addr);
2892+ BUG_ON(pmd_trans_huge(*pmd));
2893+ if (!pmd_present(*pmd))
2894+ goto out;
2895+
2896+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
2897+ addr + PAGE_SIZE);
2898+ mmu_notifier_invalidate_range_start(&range);
2899+
2900+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2901+ if (!pte_same(*ptep, orig_pte)) {
2902+ pte_unmap_unlock(ptep, ptl);
2903+ goto out_mn;
2904+ }
2905+
2906+ flush_cache_page(vma, addr, pte_pfn(*ptep));
2907+ ptep_clear_flush_notify(vma, addr, ptep);
2908+ entry = mk_pte(kpage, vma->vm_page_prot);
2909+
2910+ /* special treatment is needed for zero_page */
2911+ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2912+ (page_to_pfn(kpage) == zero_pfn)) {
2913+ entry = pte_mkspecial(entry);
2914+ dec_mm_counter(mm, MM_ANONPAGES);
2915+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
2916+ } else {
2917+ get_page(kpage);
2918+ page_add_anon_rmap(kpage, vma, addr, false);
2919+ }
2920+
2921+ set_pte_at_notify(mm, addr, ptep, entry);
2922+
2923+ page_remove_rmap(page, false);
2924+ if (!page_mapped(page))
2925+ try_to_free_swap(page);
2926+ put_page(page);
2927+
2928+ pte_unmap_unlock(ptep, ptl);
2929+ err = 0;
2930+out_mn:
2931+ mmu_notifier_invalidate_range_end(&range);
2932+out:
2933+ return err;
2934+}
2935+
2936+
2937+/**
2938+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
2939+ * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
2940+ * hash_max member has not been calculated yet.
2941+ *
2942+ * @page The page to be hashed
2943+ * @hash_old The hash value calculated with current hash strength
2944+ *
2945+ * return the new hash value calculated at HASH_STRENGTH_MAX
2946+ */
2947+static inline u32 page_hash_max(struct page *page, u32 hash_old)
2948+{
2949+ u32 hash_max = 0;
2950+ void *addr;
2951+
2952+ addr = kmap_atomic(page);
2953+ hash_max = delta_hash(addr, hash_strength,
2954+ HASH_STRENGTH_MAX, hash_old);
2955+
2956+ kunmap_atomic(addr);
2957+
2958+ if (!hash_max)
2959+ hash_max = 1;
2960+
2961+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2962+ return hash_max;
2963+}
2964+
2965+/*
2966+ * We compare the hash again, to ensure that it is really a hash collision
2967+ * instead of being caused by page write.
2968+ */
2969+static inline int check_collision(struct rmap_item *rmap_item,
2970+ u32 hash)
2971+{
2972+ int err;
2973+ struct page *page = rmap_item->page;
2974+
2975+ /* if this rmap_item has already been hash_maxed, then the collision
2976+ * must have appeared in the second-level rbtree search. In this case we check
2977+ * if its hash_max value has been changed. Otherwise, the collision
2978+ * happened in the first-level rbtree search, so we check against its
2979+ * current hash value.
2980+ */
2981+ if (rmap_item->hash_max) {
2982+ inc_rshash_neg(memcmp_cost);
2983+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2984+
2985+ if (rmap_item->hash_max == page_hash_max(page, hash))
2986+ err = MERGE_ERR_COLLI;
2987+ else
2988+ err = MERGE_ERR_CHANGED;
2989+ } else {
2990+ inc_rshash_neg(memcmp_cost + hash_strength);
2991+
2992+ if (page_hash(page, hash_strength, 0) == hash)
2993+ err = MERGE_ERR_COLLI;
2994+ else
2995+ err = MERGE_ERR_CHANGED;
2996+ }
2997+
2998+ return err;
2999+}
3000+
3001+/**
3002+ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3003+ * already be a ksm page.
3004+ *
3005+ * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
3006+ */
3007+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3008+ struct page *kpage, u32 hash)
3009+{
3010+ struct vm_area_struct *vma = rmap_item->slot->vma;
3011+ struct mm_struct *mm = vma->vm_mm;
3012+ pte_t orig_pte = __pte(0);
3013+ int err = MERGE_ERR_PGERR;
3014+ struct page *page;
3015+
3016+ if (uksm_test_exit(mm))
3017+ goto out;
3018+
3019+ page = rmap_item->page;
3020+
3021+ if (page == kpage) { /* ksm page forked */
3022+ err = 0;
3023+ goto out;
3024+ }
3025+
3026+ /*
3027+ * We need the page lock to read a stable PageSwapCache in
3028+ * write_protect_page(). We use trylock_page() instead of
3029+ * lock_page() because we don't want to wait here - we
3030+ * prefer to continue scanning and merging different pages,
3031+ * then come back to this page when it is unlocked.
3032+ */
3033+ if (!trylock_page(page))
3034+ goto out;
3035+
3036+ if (!PageAnon(page) || !PageKsm(kpage))
3037+ goto out_unlock;
3038+
3039+ if (PageTransCompound(page)) {
3040+ err = split_huge_page(page);
3041+ if (err)
3042+ goto out_unlock;
3043+ }
3044+
3045+ /*
3046+ * If this anonymous page is mapped only here, its pte may need
3047+ * to be write-protected. If it's mapped elsewhere, all of its
3048+ * ptes are necessarily already write-protected. But in either
3049+ * case, we need to lock and check page_count is not raised.
3050+ */
3051+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3052+ if (pages_identical_with_cost(page, kpage))
3053+ err = replace_page(vma, page, kpage, orig_pte);
3054+ else
3055+ err = check_collision(rmap_item, hash);
3056+ }
3057+
3058+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3059+ munlock_vma_page(page);
3060+ if (!PageMlocked(kpage)) {
3061+ unlock_page(page);
3062+ lock_page(kpage);
3063+ mlock_vma_page(kpage);
3064+ page = kpage; /* for final unlock */
3065+ }
3066+ }
3067+
3068+out_unlock:
3069+ unlock_page(page);
3070+out:
3071+ return err;
3072+}
3073+
3074+
3075+
3076+/**
3077+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3078+ * to restore a page mapping that was changed in that function.
3079+ *
3080+ * @return 0 on success.
3081+ */
3082+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3083+ pte_t orig_pte, pte_t wprt_pte)
3084+{
3085+ struct mm_struct *mm = vma->vm_mm;
3086+ pgd_t *pgd;
3087+ p4d_t *p4d;
3088+ pud_t *pud;
3089+ pmd_t *pmd;
3090+ pte_t *ptep;
3091+ spinlock_t *ptl;
3092+
3093+ int err = -EFAULT;
3094+
3095+ pgd = pgd_offset(mm, addr);
3096+ if (!pgd_present(*pgd))
3097+ goto out;
3098+
3099+ p4d = p4d_offset(pgd, addr);
3100+ pud = pud_offset(p4d, addr);
3101+ if (!pud_present(*pud))
3102+ goto out;
3103+
3104+ pmd = pmd_offset(pud, addr);
3105+ if (!pmd_present(*pmd))
3106+ goto out;
3107+
3108+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3109+ if (!pte_same(*ptep, wprt_pte)) {
3110+ /* already copied, let it be */
3111+ pte_unmap_unlock(ptep, ptl);
3112+ goto out;
3113+ }
3114+
3115+ /*
3116+ * Good, still here. Since we can still get the ksm page, it has not
3117+ * returned to the free page pool, so there is no way a pte was changed
3118+ * to another page and then back to this page. Also remember that ksm
3119+ * pages are not reused in do_wp_page(). So it's safe to restore the
3120+ * original pte.
3121+ */
3122+ flush_cache_page(vma, addr, pte_pfn(*ptep));
3123+ ptep_clear_flush_notify(vma, addr, ptep);
3124+ set_pte_at_notify(mm, addr, ptep, orig_pte);
3125+
3126+ pte_unmap_unlock(ptep, ptl);
3127+ err = 0;
3128+out:
3129+ return err;
3130+}
3131+
3132+/**
3133+ * try_to_merge_two_pages() - take two identical pages and prepare
3134+ * them to be merged into one page (rmap_item->page)
3135+ *
3136+ * @return 0 if we successfully merged two identical pages into
3137+ * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
3138+ * found during the rbtree search. MERGE_ERR_CHANGED if rmap_item has
3139+ * been changed since it was hashed. MERGE_ERR_PGERR otherwise.
3140+ *
3141+ */
3142+static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3143+ struct rmap_item *tree_rmap_item,
3144+ u32 hash)
3145+{
3146+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3147+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3148+ struct vm_area_struct *vma1 = rmap_item->slot->vma;
3149+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3150+ struct page *page = rmap_item->page;
3151+ struct page *tree_page = tree_rmap_item->page;
3152+ int err = MERGE_ERR_PGERR;
3153+ struct address_space *saved_mapping;
3154+
3155+
3156+ if (rmap_item->page == tree_rmap_item->page)
3157+ goto out;
3158+
3159+ if (!trylock_page(page))
3160+ goto out;
3161+
3162+ if (!PageAnon(page))
3163+ goto out_unlock;
3164+
3165+ if (PageTransCompound(page)) {
3166+ err = split_huge_page(page);
3167+ if (err)
3168+ goto out_unlock;
3169+ }
3170+
3171+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3172+ unlock_page(page);
3173+ goto out;
3174+ }
3175+
3176+ /*
3177+ * While we hold page lock, upgrade page from
3178+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
3179+ * stable_tree_insert() will update stable_node.
3180+ */
3181+ saved_mapping = page->mapping;
3182+ set_page_stable_node(page, NULL);
3183+ mark_page_accessed(page);
3184+ if (!PageDirty(page))
3185+ SetPageDirty(page);
3186+
3187+ unlock_page(page);
3188+
3189+ if (!trylock_page(tree_page))
3190+ goto restore_out;
3191+
3192+ if (!PageAnon(tree_page)) {
3193+ unlock_page(tree_page);
3194+ goto restore_out;
3195+ }
3196+
3197+ if (PageTransCompound(tree_page)) {
3198+ err = split_huge_page(tree_page);
3199+ if (err) {
3200+ unlock_page(tree_page);
3201+ goto restore_out;
3202+ }
3203+ }
3204+
3205+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3206+ unlock_page(tree_page);
3207+ goto restore_out;
3208+ }
3209+
3210+ if (pages_identical_with_cost(page, tree_page)) {
3211+ err = replace_page(vma2, tree_page, page, wprt_pte2);
3212+ if (err) {
3213+ unlock_page(tree_page);
3214+ goto restore_out;
3215+ }
3216+
3217+ if ((vma2->vm_flags & VM_LOCKED)) {
3218+ munlock_vma_page(tree_page);
3219+ if (!PageMlocked(page)) {
3220+ unlock_page(tree_page);
3221+ lock_page(page);
3222+ mlock_vma_page(page);
3223+ tree_page = page; /* for final unlock */
3224+ }
3225+ }
3226+
3227+ unlock_page(tree_page);
3228+
3229+ goto out; /* success */
3230+
3231+ } else {
3232+ if (tree_rmap_item->hash_max &&
3233+ tree_rmap_item->hash_max == rmap_item->hash_max) {
3234+ err = MERGE_ERR_COLLI_MAX;
3235+ } else if (page_hash(page, hash_strength, 0) ==
3236+ page_hash(tree_page, hash_strength, 0)) {
3237+ inc_rshash_neg(memcmp_cost + hash_strength * 2);
3238+ err = MERGE_ERR_COLLI;
3239+ } else {
3240+ err = MERGE_ERR_CHANGED;
3241+ }
3242+
3243+ unlock_page(tree_page);
3244+ }
3245+
3246+restore_out:
3247+ lock_page(page);
3248+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3249+ orig_pte1, wprt_pte1))
3250+ page->mapping = saved_mapping;
3251+
3252+out_unlock:
3253+ unlock_page(page);
3254+out:
3255+ return err;
3256+}
3257+
3258+static inline int hash_cmp(u32 new_val, u32 node_val)
3259+{
3260+ if (new_val > node_val)
3261+ return 1;
3262+ else if (new_val < node_val)
3263+ return -1;
3264+ else
3265+ return 0;
3266+}
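
hash_cmp() is the comparator for both rbtree levels: tree_nodes are ordered by the sampled hash, and stable_nodes (or rmap_items) inside a tree_node by hash_max. Conceptually this is ordering by a two-part key, as the userspace sketch below shows (struct key2 and key2_cmp() are illustrative only, not kernel structures):

    #include <stdint.h>
    #include <stdio.h>

    struct key2 {
        uint32_t hash;     /* weak, sampled hash: first-level tree_node key */
        uint32_t hash_max; /* full-strength hash: second-level key */
    };

    static int hash_cmp(uint32_t a, uint32_t b)
    {
        return (a > b) - (a < b); /* 1, -1 or 0, like the kernel helper */
    }

    /* order first on the weak hash; fall back to the strong hash only when
     * the weak hashes collide -- mirroring the two rbtree levels */
    static int key2_cmp(const struct key2 *a, const struct key2 *b)
    {
        int c = hash_cmp(a->hash, b->hash);

        return c ? c : hash_cmp(a->hash_max, b->hash_max);
    }

    int main(void)
    {
        struct key2 a = { 10, 111 }, b = { 10, 222 };

        printf("%d\n", key2_cmp(&a, &b)); /* -1: same weak hash, smaller hash_max */
        return 0;
    }
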
3267+
3268+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3269+{
3270+ u32 hash_max = item->hash_max;
3271+
3272+ if (!hash_max) {
3273+ hash_max = page_hash_max(item->page, hash);
3274+
3275+ item->hash_max = hash_max;
3276+ }
3277+
3278+ return hash_max;
3279+}
3280+
3281+
3282+
3283+/**
3284+ * stable_tree_search() - search the stable tree for a page
3285+ *
3286+ * @item: the rmap_item we are comparing with
3287+ * @hash: the hash value of this item->page already calculated
3288+ *
3289+ * @return the page we have found, NULL otherwise. The page returned has
3290+ * been gotten.
3291+ */
3292+static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3293+{
3294+ struct rb_node *node = root_stable_treep->rb_node;
3295+ struct tree_node *tree_node;
3296+ unsigned long hash_max;
3297+ struct page *page = item->page;
3298+ struct stable_node *stable_node;
3299+
3300+ stable_node = page_stable_node(page);
3301+ if (stable_node) {
3302+ /* ksm page forked, that is
3303+ * if (PageKsm(page) && !in_stable_tree(rmap_item))
3304+ * it's actually gotten once outside.
3305+ */
3306+ get_page(page);
3307+ return page;
3308+ }
3309+
3310+ while (node) {
3311+ int cmp;
3312+
3313+ tree_node = rb_entry(node, struct tree_node, node);
3314+
3315+ cmp = hash_cmp(hash, tree_node->hash);
3316+
3317+ if (cmp < 0)
3318+ node = node->rb_left;
3319+ else if (cmp > 0)
3320+ node = node->rb_right;
3321+ else
3322+ break;
3323+ }
3324+
3325+ if (!node)
3326+ return NULL;
3327+
3328+ if (tree_node->count == 1) {
3329+ stable_node = rb_entry(tree_node->sub_root.rb_node,
3330+ struct stable_node, node);
3331+ BUG_ON(!stable_node);
3332+
3333+ goto get_page_out;
3334+ }
3335+
3336+ /*
3337+ * ok, we have to search the second
3338+ * level subtree, hash the page at
3339+ * full strength.
3340+ */
3341+ node = tree_node->sub_root.rb_node;
3342+ BUG_ON(!node);
3343+ hash_max = rmap_item_hash_max(item, hash);
3344+
3345+ while (node) {
3346+ int cmp;
3347+
3348+ stable_node = rb_entry(node, struct stable_node, node);
3349+
3350+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3351+
3352+ if (cmp < 0)
3353+ node = node->rb_left;
3354+ else if (cmp > 0)
3355+ node = node->rb_right;
3356+ else
3357+ goto get_page_out;
3358+ }
3359+
3360+ return NULL;
3361+
3362+get_page_out:
3363+ page = get_uksm_page(stable_node, 1, 1);
3364+ return page;
3365+}
3366+
3367+static int try_merge_rmap_item(struct rmap_item *item,
3368+ struct page *kpage,
3369+ struct page *tree_page)
3370+{
3371+ struct vm_area_struct *vma = item->slot->vma;
3372+ struct page_vma_mapped_walk pvmw = {
3373+ .page = kpage,
3374+ .vma = vma,
3375+ };
3376+
3377+ pvmw.address = get_rmap_addr(item);
3378+ if (!page_vma_mapped_walk(&pvmw))
3379+ return 0;
3380+
3381+ if (pte_write(*pvmw.pte)) {
3382+ /* has changed, abort! */
3383+ page_vma_mapped_walk_done(&pvmw);
3384+ return 0;
3385+ }
3386+
3387+ get_page(tree_page);
3388+ page_add_anon_rmap(tree_page, vma, pvmw.address, false);
3389+
3390+ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
3391+ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
3392+ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
3393+ mk_pte(tree_page, vma->vm_page_prot));
3394+
3395+ page_remove_rmap(kpage, false);
3396+ put_page(kpage);
3397+
3398+ page_vma_mapped_walk_done(&pvmw);
3399+
3400+ return 1;
3401+}
3402+
3403+/**
3404+ * try_merge_with_stable() - when two rmap_items need to be inserted
3405+ * into the stable tree and the page was found to be identical to a stable
3406+ * ksm page, this is the last chance to merge them into one.
3407+ *
3408+ * @item1: the rmap_item holding the page which we wanted to insert
3409+ * into the stable tree.
3410+ * @item2: the other rmap_item we found during the unstable tree search
3411+ * @kpage: the page currently mapped by the two rmap_items
3412+ * @tree_page: the page we found identical in the stable tree node
3413+ * @success1: return if item1 is successfully merged
3414+ * @success2: return if item2 is successfully merged
3415+ */
3416+static void try_merge_with_stable(struct rmap_item *item1,
3417+ struct rmap_item *item2,
3418+ struct page **kpage,
3419+ struct page *tree_page,
3420+ int *success1, int *success2)
3421+{
3422+ struct vm_area_struct *vma1 = item1->slot->vma;
3423+ struct vm_area_struct *vma2 = item2->slot->vma;
3424+ *success1 = 0;
3425+ *success2 = 0;
3426+
3427+ if (unlikely(*kpage == tree_page)) {
3428+ /* I don't think this can really happen */
3429+ pr_warn("UKSM: unexpected condition detected in "
3430+ "%s -- *kpage == tree_page !\n", __func__);
3431+ *success1 = 1;
3432+ *success2 = 1;
3433+ return;
3434+ }
3435+
3436+ if (!PageAnon(*kpage) || !PageKsm(*kpage))
3437+ goto failed;
3438+
3439+ if (!trylock_page(tree_page))
3440+ goto failed;
3441+
3442+ /* If the oldpage is still ksm and still pointed
3443+ * to in the right place, and still write protected,
3444+ * we are confident it has not changed; no need to
3445+ * memcmp anymore.
3446+ * Beware: we cannot take nested pte locks,
3447+ * deadlock risk.
3448+ */
3449+ if (!try_merge_rmap_item(item1, *kpage, tree_page))
3450+ goto unlock_failed;
3451+
3452+ /* ok, then vma2, remind that pte1 already set */
3453+ if (!try_merge_rmap_item(item2, *kpage, tree_page))
3454+ goto success_1;
3455+
3456+ *success2 = 1;
3457+success_1:
3458+ *success1 = 1;
3459+
3460+
3461+ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3462+ (*success2 && vma2->vm_flags & VM_LOCKED)) {
3463+ munlock_vma_page(*kpage);
3464+ if (!PageMlocked(tree_page))
3465+ mlock_vma_page(tree_page);
3466+ }
3467+
3468+ /*
3469+ * We do not need oldpage any more in the caller, so can break the lock
3470+ * now.
3471+ */
3472+ unlock_page(*kpage);
3473+ *kpage = tree_page; /* Get unlocked outside. */
3474+ return;
3475+
3476+unlock_failed:
3477+ unlock_page(tree_page);
3478+failed:
3479+ return;
3480+}
3481+
3482+static inline void stable_node_hash_max(struct stable_node *node,
3483+ struct page *page, u32 hash)
3484+{
3485+ u32 hash_max = node->hash_max;
3486+
3487+ if (!hash_max) {
3488+ hash_max = page_hash_max(page, hash);
3489+ node->hash_max = hash_max;
3490+ }
3491+}
3492+
3493+static inline
3494+struct stable_node *new_stable_node(struct tree_node *tree_node,
3495+ struct page *kpage, u32 hash_max)
3496+{
3497+ struct stable_node *new_stable_node;
3498+
3499+ new_stable_node = alloc_stable_node();
3500+ if (!new_stable_node)
3501+ return NULL;
3502+
3503+ new_stable_node->kpfn = page_to_pfn(kpage);
3504+ new_stable_node->hash_max = hash_max;
3505+ new_stable_node->tree_node = tree_node;
3506+ set_page_stable_node(kpage, new_stable_node);
3507+
3508+ return new_stable_node;
3509+}
3510+
3511+static inline
3512+struct stable_node *first_level_insert(struct tree_node *tree_node,
3513+ struct rmap_item *rmap_item,
3514+ struct rmap_item *tree_rmap_item,
3515+ struct page **kpage, u32 hash,
3516+ int *success1, int *success2)
3517+{
3518+ int cmp;
3519+ struct page *tree_page;
3520+ u32 hash_max = 0;
3521+ struct stable_node *stable_node, *new_snode;
3522+ struct rb_node *parent = NULL, **new;
3523+
3524+ /* this tree node contains no sub-tree yet */
3525+ stable_node = rb_entry(tree_node->sub_root.rb_node,
3526+ struct stable_node, node);
3527+
3528+ tree_page = get_uksm_page(stable_node, 1, 0);
3529+ if (tree_page) {
3530+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3531+ if (!cmp) {
3532+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3533+ tree_page, success1, success2);
3534+ put_page(tree_page);
3535+ if (!*success1 && !*success2)
3536+ goto failed;
3537+
3538+ return stable_node;
3539+
3540+ } else {
3541+ /*
3542+ * Collision at the first level; try to create a subtree.
3543+ * A new node needs to be created.
3544+ */
3545+ put_page(tree_page);
3546+
3547+ stable_node_hash_max(stable_node, tree_page,
3548+ tree_node->hash);
3549+ hash_max = rmap_item_hash_max(rmap_item, hash);
3550+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3551+
3552+ parent = &stable_node->node;
3553+ if (cmp < 0)
3554+ new = &parent->rb_left;
3555+ else if (cmp > 0)
3556+ new = &parent->rb_right;
3557+ else
3558+ goto failed;
3559+ }
3560+
3561+ } else {
3562+ /* the only stable_node was deleted; we reuse its tree_node.
3563+ */
3564+ parent = NULL;
3565+ new = &tree_node->sub_root.rb_node;
3566+ }
3567+
3568+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
3569+ if (!new_snode)
3570+ goto failed;
3571+
3572+ rb_link_node(&new_snode->node, parent, new);
3573+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
3574+ tree_node->count++;
3575+ *success1 = *success2 = 1;
3576+
3577+ return new_snode;
3578+
3579+failed:
3580+ return NULL;
3581+}
3582+
3583+static inline
3584+struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3585+ struct rmap_item *rmap_item,
3586+ struct rmap_item *tree_rmap_item,
3587+ struct page **kpage, u32 hash,
3588+ int *success1, int *success2)
3589+{
3590+ struct page *tree_page;
3591+ u32 hash_max;
3592+ struct stable_node *stable_node, *new_snode;
3593+ struct rb_node *parent, **new;
3594+
3595+research:
3596+ parent = NULL;
3597+ new = &tree_node->sub_root.rb_node;
3598+ BUG_ON(!*new);
3599+ hash_max = rmap_item_hash_max(rmap_item, hash);
3600+ while (*new) {
3601+ int cmp;
3602+
3603+ stable_node = rb_entry(*new, struct stable_node, node);
3604+
3605+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3606+
3607+ if (cmp < 0) {
3608+ parent = *new;
3609+ new = &parent->rb_left;
3610+ } else if (cmp > 0) {
3611+ parent = *new;
3612+ new = &parent->rb_right;
3613+ } else {
3614+ tree_page = get_uksm_page(stable_node, 1, 0);
3615+ if (tree_page) {
3616+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3617+ if (!cmp) {
3618+ try_merge_with_stable(rmap_item,
3619+ tree_rmap_item, kpage,
3620+ tree_page, success1, success2);
3621+
3622+ put_page(tree_page);
3623+ if (!*success1 && !*success2)
3624+ goto failed;
3625+ /*
3626+ * successfully merged with a stable
3627+ * node
3628+ */
3629+ return stable_node;
3630+ } else {
3631+ put_page(tree_page);
3632+ goto failed;
3633+ }
3634+ } else {
3635+ /*
3636+ * the stable node may have been
3637+ * deleted and the subtree
3638+ * restructured; cannot
3639+ * continue, re-search it.
3640+ */
3641+ if (tree_node->count) {
3642+ goto research;
3643+ } else {
3644+ /* reuse the tree node*/
3645+ parent = NULL;
3646+ new = &tree_node->sub_root.rb_node;
3647+ }
3648+ }
3649+ }
3650+ }
3651+
3652+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
3653+ if (!new_snode)
3654+ goto failed;
3655+
3656+ rb_link_node(&new_snode->node, parent, new);
3657+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
3658+ tree_node->count++;
3659+ *success1 = *success2 = 1;
3660+
3661+ return new_snode;
3662+
3663+failed:
3664+ return NULL;
3665+}
3666+
3667+
3668+/**
3669+ * stable_tree_insert() - try to insert a page merged in the unstable tree
3670+ * into the stable tree
3671+ *
3672+ * @kpage: the page to be inserted
3673+ * @hash: the current hash of this page
3674+ * @rmap_item: the rmap_item being scanned
3675+ * @tree_rmap_item: the rmap_item found on unstable tree
3676+ * @success1: return if rmap_item is merged
3677+ * @success2: return if tree_rmap_item is merged
3678+ *
3679+ * @return the stable_node on stable tree if at least one
3680+ * rmap_item is inserted into stable tree, NULL
3681+ * otherwise.
3682+ */
3683+static struct stable_node *
3684+stable_tree_insert(struct page **kpage, u32 hash,
3685+ struct rmap_item *rmap_item,
3686+ struct rmap_item *tree_rmap_item,
3687+ int *success1, int *success2)
3688+{
3689+ struct rb_node **new = &root_stable_treep->rb_node;
3690+ struct rb_node *parent = NULL;
3691+ struct stable_node *stable_node;
3692+ struct tree_node *tree_node;
3693+ u32 hash_max = 0;
3694+
3695+ *success1 = *success2 = 0;
3696+
3697+ while (*new) {
3698+ int cmp;
3699+
3700+ tree_node = rb_entry(*new, struct tree_node, node);
3701+
3702+ cmp = hash_cmp(hash, tree_node->hash);
3703+
3704+ if (cmp < 0) {
3705+ parent = *new;
3706+ new = &parent->rb_left;
3707+ } else if (cmp > 0) {
3708+ parent = *new;
3709+ new = &parent->rb_right;
3710+ } else
3711+ break;
3712+ }
3713+
3714+ if (*new) {
3715+ if (tree_node->count == 1) {
3716+ stable_node = first_level_insert(tree_node, rmap_item,
3717+ tree_rmap_item, kpage,
3718+ hash, success1, success2);
3719+ } else {
3720+ stable_node = stable_subtree_insert(tree_node,
3721+ rmap_item, tree_rmap_item, kpage,
3722+ hash, success1, success2);
3723+ }
3724+ } else {
3725+
3726+ /* no tree node found */
3727+ tree_node = alloc_tree_node(stable_tree_node_listp);
3728+ if (!tree_node) {
3729+ stable_node = NULL;
3730+ goto out;
3731+ }
3732+
3733+ stable_node = new_stable_node(tree_node, *kpage, hash_max);
3734+ if (!stable_node) {
3735+ free_tree_node(tree_node);
3736+ goto out;
3737+ }
3738+
3739+ tree_node->hash = hash;
3740+ rb_link_node(&tree_node->node, parent, new);
3741+ rb_insert_color(&tree_node->node, root_stable_treep);
3742+ parent = NULL;
3743+ new = &tree_node->sub_root.rb_node;
3744+
3745+ rb_link_node(&stable_node->node, parent, new);
3746+ rb_insert_color(&stable_node->node, &tree_node->sub_root);
3747+ tree_node->count++;
3748+ *success1 = *success2 = 1;
3749+ }
3750+
3751+out:
3752+ return stable_node;
3753+}
3754+
3755+
3756+/**
3757+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_lock
3758+ *
3759+ * @return 0 on success, -EBUSY if unable to lock the mmap_lock,
3760+ * -EINVAL if the page mapping has been changed.
3761+ */
3762+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3763+{
3764+ int err;
3765+
3766+ err = get_mergeable_page_lock_mmap(tree_rmap_item);
3767+
3768+ if (err == -EINVAL) {
3769+ /* its page map has been changed, remove it */
3770+ remove_rmap_item_from_tree(tree_rmap_item);
3771+ }
3772+
3773+ /* On success, the page has been gotten and the mmap_lock is locked. */
3774+ return err;
3775+}
3776+
3777+
3778+/**
3779+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the
3780+ * same hash value. Get its page and trylock the mmap_lock
3781+ */
3782+static inline
3783+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3784+ u32 hash)
3785+
3786+{
3787+ struct rb_node **new = &root_unstable_tree.rb_node;
3788+ struct rb_node *parent = NULL;
3789+ struct tree_node *tree_node;
3790+ u32 hash_max;
3791+ struct rmap_item *tree_rmap_item;
3792+
3793+ while (*new) {
3794+ int cmp;
3795+
3796+ tree_node = rb_entry(*new, struct tree_node, node);
3797+
3798+ cmp = hash_cmp(hash, tree_node->hash);
3799+
3800+ if (cmp < 0) {
3801+ parent = *new;
3802+ new = &parent->rb_left;
3803+ } else if (cmp > 0) {
3804+ parent = *new;
3805+ new = &parent->rb_right;
3806+ } else
3807+ break;
3808+ }
3809+
3810+ if (*new) {
3811+ /* got the tree_node */
3812+ if (tree_node->count == 1) {
3813+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3814+ struct rmap_item, node);
3815+ BUG_ON(!tree_rmap_item);
3816+
3817+ goto get_page_out;
3818+ }
3819+
3820+ /* well, search the collision subtree */
3821+ new = &tree_node->sub_root.rb_node;
3822+ BUG_ON(!*new);
3823+ hash_max = rmap_item_hash_max(rmap_item, hash);
3824+
3825+ while (*new) {
3826+ int cmp;
3827+
3828+ tree_rmap_item = rb_entry(*new, struct rmap_item,
3829+ node);
3830+
3831+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3832+ parent = *new;
3833+ if (cmp < 0)
3834+ new = &parent->rb_left;
3835+ else if (cmp > 0)
3836+ new = &parent->rb_right;
3837+ else
3838+ goto get_page_out;
3839+ }
3840+ } else {
3841+ /* alloc a new tree_node */
3842+ tree_node = alloc_tree_node(&unstable_tree_node_list);
3843+ if (!tree_node)
3844+ return NULL;
3845+
3846+ tree_node->hash = hash;
3847+ rb_link_node(&tree_node->node, parent, new);
3848+ rb_insert_color(&tree_node->node, &root_unstable_tree);
3849+ parent = NULL;
3850+ new = &tree_node->sub_root.rb_node;
3851+ }
3852+
3853+ /* not found even in the sub-tree */
3854+ rmap_item->tree_node = tree_node;
3855+ rmap_item->address |= UNSTABLE_FLAG;
3856+ rmap_item->hash_round = uksm_hash_round;
3857+ rb_link_node(&rmap_item->node, parent, new);
3858+ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3859+
3860+ uksm_pages_unshared++;
3861+ return NULL;
3862+
3863+get_page_out:
3864+ if (tree_rmap_item->page == rmap_item->page)
3865+ return NULL;
3866+
3867+ if (get_tree_rmap_item_page(tree_rmap_item))
3868+ return NULL;
3869+
3870+ return tree_rmap_item;
3871+}
3872+
3873+static void hold_anon_vma(struct rmap_item *rmap_item,
3874+ struct anon_vma *anon_vma)
3875+{
3876+ rmap_item->anon_vma = anon_vma;
3877+ get_anon_vma(anon_vma);
3878+}
3879+
3880+
3881+/**
3882+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3883+ * ratio statistics are collected in this function.
3884+ *
3885+ */
3886+static void stable_tree_append(struct rmap_item *rmap_item,
3887+ struct stable_node *stable_node, int logdedup)
3888+{
3889+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3890+ unsigned long key = (unsigned long)rmap_item->slot;
3891+ unsigned long factor = rmap_item->slot->rung->step;
3892+
3893+ BUG_ON(!stable_node);
3894+ rmap_item->address |= STABLE_FLAG;
3895+
3896+ if (hlist_empty(&stable_node->hlist)) {
3897+ uksm_pages_shared++;
3898+ goto node_vma_new;
3899+ } else {
3900+ uksm_pages_sharing++;
3901+ }
3902+
3903+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3904+ if (node_vma->key >= key)
3905+ break;
3906+
3907+ if (logdedup) {
3908+ node_vma->slot->pages_bemerged += factor;
3909+ if (list_empty(&node_vma->slot->dedup_list))
3910+ list_add(&node_vma->slot->dedup_list,
3911+ &vma_slot_dedup);
3912+ }
3913+ }
3914+
3915+ if (node_vma) {
3916+ if (node_vma->key == key) {
3917+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3918+ goto node_vma_ok;
3919+ } else if (node_vma->key > key) {
3920+ node_vma_cont = node_vma;
3921+ }
3922+ }
3923+
3924+node_vma_new:
3925+	/* no node_vma for this vma in the node yet, alloc a new one */
3926+ new_node_vma = alloc_node_vma();
3927+ BUG_ON(!new_node_vma);
3928+ new_node_vma->head = stable_node;
3929+ new_node_vma->slot = rmap_item->slot;
3930+
3931+ if (!node_vma) {
3932+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3933+ } else if (node_vma->key != key) {
3934+ if (node_vma->key < key)
3935+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3936+ else {
3937+ hlist_add_before(&new_node_vma->hlist,
3938+ &node_vma->hlist);
3939+ }
3940+
3941+ }
3942+ node_vma = new_node_vma;
3943+
3944+node_vma_ok: /* ok, ready to add to the list */
3945+ rmap_item->head = node_vma;
3946+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3947+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3948+ if (logdedup) {
3949+ rmap_item->slot->pages_merged++;
3950+ if (node_vma_cont) {
3951+ node_vma = node_vma_cont;
3952+ hlist_for_each_entry_continue(node_vma, hlist) {
3953+ node_vma->slot->pages_bemerged += factor;
3954+ if (list_empty(&node_vma->slot->dedup_list))
3955+ list_add(&node_vma->slot->dedup_list,
3956+ &vma_slot_dedup);
3957+ }
3958+ }
3959+ }
3960+}
3961+
3962+/*
3963+ * We use break_ksm to break COW on a ksm page: it's a stripped down
3964+ *
3965+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
3966+ * put_page(page);
3967+ *
3968+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3969+ * in case the application has unmapped and remapped mm,addr meanwhile.
3970+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3971+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3972+ */
3973+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3974+{
3975+ struct page *page;
3976+ int ret = 0;
3977+
3978+ do {
3979+ cond_resched();
3980+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
3981+ if (IS_ERR_OR_NULL(page))
3982+ break;
3983+ if (PageKsm(page)) {
3984+ ret = handle_mm_fault(vma, addr,
3985+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
3986+ } else
3987+ ret = VM_FAULT_WRITE;
3988+ put_page(page);
3989+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
3990+ /*
3991+ * We must loop because handle_mm_fault() may back out if there's
3992+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
3993+ *
3994+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3995+ * COW has been broken, even if the vma does not permit VM_WRITE;
3996+ * but note that a concurrent fault might break PageKsm for us.
3997+ *
3998+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
3999+ * backing file, which also invalidates anonymous pages: that's
4000+ * okay, that truncation will have unmapped the PageKsm for us.
4001+ *
4002+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
4003+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
4004+ * current task has TIF_MEMDIE set, and will be OOM killed on return
4005+ * to user; and ksmd, having no mm, would never be chosen for that.
4006+ *
4007+ * But if the mm is in a limited mem_cgroup, then the fault may fail
4008+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
4009+ * even ksmd can fail in this way - though it's usually breaking ksm
4010+ * just to undo a merge it made a moment before, so unlikely to oom.
4011+ *
4012+ * That's a pity: we might therefore have more kernel pages allocated
4013+ * than we're counting as nodes in the stable tree; but uksm_do_scan
4014+ * will retry to break_cow on each pass, so should recover the page
4015+ * in due course. The important thing is to not let VM_MERGEABLE
4016+ * be cleared while any such pages might remain in the area.
4017+ */
4018+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4019+}
4020+
4021+static void break_cow(struct rmap_item *rmap_item)
4022+{
4023+ struct vm_area_struct *vma = rmap_item->slot->vma;
4024+ struct mm_struct *mm = vma->vm_mm;
4025+ unsigned long addr = get_rmap_addr(rmap_item);
4026+
4027+ if (uksm_test_exit(mm))
4028+ goto out;
4029+
4030+ break_ksm(vma, addr);
4031+out:
4032+ return;
4033+}
4034+
4035+/*
4036+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4037+ * than check every pte of a given vma, the locking doesn't quite work for
4038+ * that - an rmap_item is assigned to the stable tree after inserting ksm
4039+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
4040+ * rmap_items from parent to child at fork time (so as not to waste time
4041+ * if exit comes before the next scan reaches it).
4042+ *
4043+ * Similarly, although we'd like to remove rmap_items (so updating counts
4044+ * and freeing memory) when unmerging an area, it's easier to leave that
4045+ * to the next pass of ksmd - consider, for example, how ksmd might be
4046+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
4047+ */
4048+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4049+ unsigned long start, unsigned long end)
4050+{
4051+ unsigned long addr;
4052+ int err = 0;
4053+
4054+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4055+ if (uksm_test_exit(vma->vm_mm))
4056+ break;
4057+ if (signal_pending(current))
4058+ err = -ERESTARTSYS;
4059+ else
4060+ err = break_ksm(vma, addr);
4061+ }
4062+ return err;
4063+}
4064+
4065+static inline void inc_uksm_pages_scanned(void)
4066+{
4067+ u64 delta;
4068+
4069+
4070+ if (uksm_pages_scanned == U64_MAX) {
4071+ encode_benefit();
4072+
4073+ delta = uksm_pages_scanned >> pages_scanned_base;
4074+
4075+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4076+ pages_scanned_stored >>= 1;
4077+ delta >>= 1;
4078+ pages_scanned_base++;
4079+ }
4080+
4081+ pages_scanned_stored += delta;
4082+
4083+ uksm_pages_scanned = uksm_pages_scanned_last = 0;
4084+ }
4085+
4086+ uksm_pages_scanned++;
4087+}
4088+
4089+static inline int find_zero_page_hash(int strength, u32 hash)
4090+{
4091+ return (zero_hash_table[strength] == hash);
4092+}
4093+
4094+static
4095+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4096+{
4097+ struct page *zero_page = empty_uksm_zero_page;
4098+ struct mm_struct *mm = vma->vm_mm;
4099+ pte_t orig_pte = __pte(0);
4100+ int err = -EFAULT;
4101+
4102+ if (uksm_test_exit(mm))
4103+ goto out;
4104+
4105+ if (!trylock_page(page))
4106+ goto out;
4107+
4108+ if (!PageAnon(page))
4109+ goto out_unlock;
4110+
4111+ if (PageTransCompound(page)) {
4112+ err = split_huge_page(page);
4113+ if (err)
4114+ goto out_unlock;
4115+ }
4116+
4117+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4118+ if (is_page_full_zero(page))
4119+ err = replace_page(vma, page, zero_page, orig_pte);
4120+ }
4121+
4122+out_unlock:
4123+ unlock_page(page);
4124+out:
4125+ return err;
4126+}
4127+
4128+/*
4129+ * cmp_and_merge_page() - first see if page can be merged into the stable
4130+ * tree; if not, compare hash to previous and if it's the same, see if page
4131+ * tree; if not, see if it can be inserted into the unstable tree, or merged
4132+ * with a page already there and both then transferred to the stable tree.
4133+ *
4134+ * @rmap_item: reverse mapping item of the page we are searching an identical page for
4135+ * @hash: the hash value of this page
4136+ */
4137+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4138+{
4139+ struct rmap_item *tree_rmap_item;
4140+ struct page *page;
4141+ struct page *kpage = NULL;
4142+ u32 hash_max;
4143+ int err;
4144+ unsigned int success1, success2;
4145+ struct stable_node *snode;
4146+ int cmp;
4147+ struct rb_node *parent = NULL, **new;
4148+
4149+ remove_rmap_item_from_tree(rmap_item);
4150+ page = rmap_item->page;
4151+
4152+ /* We first start with searching the page inside the stable tree */
4153+ kpage = stable_tree_search(rmap_item, hash);
4154+ if (kpage) {
4155+ err = try_to_merge_with_uksm_page(rmap_item, kpage,
4156+ hash);
4157+ if (!err) {
4158+ /*
4159+ * The page was successfully merged, add
4160+ * its rmap_item to the stable tree.
4161+ * page lock is needed because it's
4162+ * racing with try_to_unmap_ksm(), etc.
4163+ */
4164+ lock_page(kpage);
4165+ snode = page_stable_node(kpage);
4166+ stable_tree_append(rmap_item, snode, 1);
4167+ unlock_page(kpage);
4168+ put_page(kpage);
4169+ return; /* success */
4170+ }
4171+ put_page(kpage);
4172+
4173+ /*
4174+	 * If it's a collision and it has already been searched in the
4175+	 * sub-rbtree (hash_max != 0), we want to abort, because if it
4176+	 * is successfully merged into the unstable tree, the collision
4177+	 * tends to happen again.
4178+ */
4179+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4180+ return;
4181+ }
4182+
4183+ tree_rmap_item =
4184+ unstable_tree_search_insert(rmap_item, hash);
4185+ if (tree_rmap_item) {
4186+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4187+ /*
4188+ * As soon as we merge this page, we want to remove the
4189+ * rmap_item of the page we have merged with from the unstable
4190+ * tree, and insert it instead as new node in the stable tree.
4191+ */
4192+ if (!err) {
4193+ kpage = page;
4194+ remove_rmap_item_from_tree(tree_rmap_item);
4195+ lock_page(kpage);
4196+ snode = stable_tree_insert(&kpage, hash,
4197+ rmap_item, tree_rmap_item,
4198+ &success1, &success2);
4199+
4200+ /*
4201+ * Do not log dedup for tree item, it's not counted as
4202+ * scanned in this round.
4203+ */
4204+ if (success2)
4205+ stable_tree_append(tree_rmap_item, snode, 0);
4206+
4207+ /*
4208+			 * The order of these two stable_tree_append() calls is
4209+			 * important: rmap_item is the one being scanned here.
4210+ */
4211+ if (success1)
4212+ stable_tree_append(rmap_item, snode, 1);
4213+
4214+ /*
4215+ * The original kpage may be unlocked inside
4216+ * stable_tree_insert() already. This page
4217+ * should be unlocked before doing
4218+ * break_cow().
4219+ */
4220+ unlock_page(kpage);
4221+
4222+ if (!success1)
4223+ break_cow(rmap_item);
4224+
4225+ if (!success2)
4226+ break_cow(tree_rmap_item);
4227+
4228+ } else if (err == MERGE_ERR_COLLI) {
4229+ BUG_ON(tree_rmap_item->tree_node->count > 1);
4230+
4231+ rmap_item_hash_max(tree_rmap_item,
4232+ tree_rmap_item->tree_node->hash);
4233+
4234+ hash_max = rmap_item_hash_max(rmap_item, hash);
4235+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4236+ parent = &tree_rmap_item->node;
4237+ if (cmp < 0)
4238+ new = &parent->rb_left;
4239+ else if (cmp > 0)
4240+ new = &parent->rb_right;
4241+ else
4242+ goto put_up_out;
4243+
4244+ rmap_item->tree_node = tree_rmap_item->tree_node;
4245+ rmap_item->address |= UNSTABLE_FLAG;
4246+ rmap_item->hash_round = uksm_hash_round;
4247+ rb_link_node(&rmap_item->node, parent, new);
4248+ rb_insert_color(&rmap_item->node,
4249+ &tree_rmap_item->tree_node->sub_root);
4250+ rmap_item->tree_node->count++;
4251+ } else {
4252+ /*
4253+			 * Either one of the pages has changed or they collide
4254+			 * at the max hash; consider them ill items.
4255+ */
4256+ remove_rmap_item_from_tree(tree_rmap_item);
4257+ }
4258+put_up_out:
4259+ put_page(tree_rmap_item->page);
4260+ mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
4261+ }
4262+}
4263+
4264+
4265+
4266+
4267+static inline unsigned long get_pool_index(struct vma_slot *slot,
4268+ unsigned long index)
4269+{
4270+ unsigned long pool_index;
4271+
4272+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4273+ if (pool_index >= slot->pool_size)
4274+ BUG();
4275+ return pool_index;
4276+}
4277+
4278+static inline unsigned long index_page_offset(unsigned long index)
4279+{
4280+ return offset_in_page(sizeof(struct rmap_list_entry *) * index);
4281+}
4282+
4283+static inline
4284+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4285+ unsigned long index, int need_alloc)
4286+{
4287+ unsigned long pool_index;
4288+ struct page *page;
4289+ void *addr;
4290+
4291+
4292+ pool_index = get_pool_index(slot, index);
4293+ if (!slot->rmap_list_pool[pool_index]) {
4294+ if (!need_alloc)
4295+ return NULL;
4296+
4297+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4298+ if (!page)
4299+ return NULL;
4300+
4301+ slot->rmap_list_pool[pool_index] = page;
4302+ }
4303+
4304+ addr = kmap(slot->rmap_list_pool[pool_index]);
4305+ addr += index_page_offset(index);
4306+
4307+ return addr;
4308+}
4309+
4310+static inline void put_rmap_list_entry(struct vma_slot *slot,
4311+ unsigned long index)
4312+{
4313+ unsigned long pool_index;
4314+
4315+ pool_index = get_pool_index(slot, index);
4316+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4317+ kunmap(slot->rmap_list_pool[pool_index]);
4318+}
4319+
4320+static inline int entry_is_new(struct rmap_list_entry *entry)
4321+{
4322+ return !entry->item;
4323+}
4324+
4325+static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4326+ unsigned long index)
4327+{
4328+ return slot->vma->vm_start + (index << PAGE_SHIFT);
4329+}
4330+
4331+static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4332+{
4333+ unsigned long addr;
4334+
4335+ if (is_addr(entry->addr))
4336+ addr = get_clean_addr(entry->addr);
4337+ else if (entry->item)
4338+ addr = get_rmap_addr(entry->item);
4339+ else
4340+ BUG();
4341+
4342+ return addr;
4343+}
4344+
4345+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4346+{
4347+ if (is_addr(entry->addr))
4348+ return NULL;
4349+
4350+ return entry->item;
4351+}
4352+
4353+static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4354+ unsigned long index)
4355+{
4356+ unsigned long pool_index;
4357+
4358+ pool_index = get_pool_index(slot, index);
4359+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4360+ slot->pool_counts[pool_index]++;
4361+}
4362+
4363+static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4364+ unsigned long index)
4365+{
4366+ unsigned long pool_index;
4367+
4368+ pool_index = get_pool_index(slot, index);
4369+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4370+ BUG_ON(!slot->pool_counts[pool_index]);
4371+ slot->pool_counts[pool_index]--;
4372+}
4373+
4374+static inline int entry_has_rmap(struct rmap_list_entry *entry)
4375+{
4376+ return !is_addr(entry->addr) && entry->item;
4377+}
4378+
4379+static inline void swap_entries(struct rmap_list_entry *entry1,
4380+ unsigned long index1,
4381+ struct rmap_list_entry *entry2,
4382+ unsigned long index2)
4383+{
4384+ struct rmap_list_entry tmp;
4385+
4386+ /* swapping two new entries is meaningless */
4387+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4388+
4389+ tmp = *entry1;
4390+ *entry1 = *entry2;
4391+ *entry2 = tmp;
4392+
4393+ if (entry_has_rmap(entry1))
4394+ entry1->item->entry_index = index1;
4395+
4396+ if (entry_has_rmap(entry2))
4397+ entry2->item->entry_index = index2;
4398+
4399+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4400+ inc_rmap_list_pool_count(entry1->item->slot, index1);
4401+ dec_rmap_list_pool_count(entry1->item->slot, index2);
4402+ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4403+ inc_rmap_list_pool_count(entry2->item->slot, index2);
4404+ dec_rmap_list_pool_count(entry2->item->slot, index1);
4405+ }
4406+}
4407+
4408+static inline void free_entry_item(struct rmap_list_entry *entry)
4409+{
4410+ unsigned long index;
4411+ struct rmap_item *item;
4412+
4413+ if (!is_addr(entry->addr)) {
4414+ BUG_ON(!entry->item);
4415+ item = entry->item;
4416+ entry->addr = get_rmap_addr(item);
4417+ set_is_addr(entry->addr);
4418+ index = item->entry_index;
4419+ remove_rmap_item_from_tree(item);
4420+ dec_rmap_list_pool_count(item->slot, index);
4421+ free_rmap_item(item);
4422+ }
4423+}
4424+
4425+static inline int pool_entry_boundary(unsigned long index)
4426+{
4427+ unsigned long linear_addr;
4428+
4429+ linear_addr = sizeof(struct rmap_list_entry *) * index;
4430+ return index && !offset_in_page(linear_addr);
4431+}
4432+
4433+static inline void try_free_last_pool(struct vma_slot *slot,
4434+ unsigned long index)
4435+{
4436+ unsigned long pool_index;
4437+
4438+ pool_index = get_pool_index(slot, index);
4439+ if (slot->rmap_list_pool[pool_index] &&
4440+ !slot->pool_counts[pool_index]) {
4441+ __free_page(slot->rmap_list_pool[pool_index]);
4442+ slot->rmap_list_pool[pool_index] = NULL;
4443+ slot->flags |= UKSM_SLOT_NEED_SORT;
4444+ }
4445+
4446+}
4447+
4448+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4449+ struct rmap_item *item)
4450+{
4451+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4452+}
4453+
4454+static int within_same_pool(struct vma_slot *slot,
4455+ unsigned long i, unsigned long j)
4456+{
4457+ unsigned long pool_i, pool_j;
4458+
4459+ pool_i = get_pool_index(slot, i);
4460+ pool_j = get_pool_index(slot, j);
4461+
4462+ return (pool_i == pool_j);
4463+}
4464+
4465+static void sort_rmap_entry_list(struct vma_slot *slot)
4466+{
4467+ unsigned long i, j;
4468+ struct rmap_list_entry *entry, *swap_entry;
4469+
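+	/*
+	 * Walk the entries in place and swap each rmap_item back to the pool
+	 * position matching its virtual address, clearing bare addresses and
+	 * skipping pools whose page was never allocated; afterwards, pool
+	 * pages left without any rmap_item are freed.
+	 */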
4470+ entry = get_rmap_list_entry(slot, 0, 0);
4471+ for (i = 0; i < slot->pages; ) {
4472+
4473+ if (!entry)
4474+ goto skip_whole_pool;
4475+
4476+ if (entry_is_new(entry))
4477+ goto next_entry;
4478+
4479+ if (is_addr(entry->addr)) {
4480+ entry->addr = 0;
4481+ goto next_entry;
4482+ }
4483+
4484+ j = vma_item_index(slot->vma, entry->item);
4485+ if (j == i)
4486+ goto next_entry;
4487+
4488+ if (within_same_pool(slot, i, j))
4489+ swap_entry = entry + j - i;
4490+ else
4491+ swap_entry = get_rmap_list_entry(slot, j, 1);
4492+
4493+ swap_entries(entry, i, swap_entry, j);
4494+ if (!within_same_pool(slot, i, j))
4495+ put_rmap_list_entry(slot, j);
4496+ continue;
4497+
4498+skip_whole_pool:
4499+ i += PAGE_SIZE / sizeof(*entry);
4500+ if (i < slot->pages)
4501+ entry = get_rmap_list_entry(slot, i, 0);
4502+ continue;
4503+
4504+next_entry:
4505+ if (i >= slot->pages - 1 ||
4506+ !within_same_pool(slot, i, i + 1)) {
4507+ put_rmap_list_entry(slot, i);
4508+ if (i + 1 < slot->pages)
4509+ entry = get_rmap_list_entry(slot, i + 1, 0);
4510+ } else
4511+ entry++;
4512+ i++;
4513+ continue;
4514+ }
4515+
4516+ /* free empty pool entries which contain no rmap_item */
4517+	/* Free pool pages which contain no rmap_item. */
4518+	/* This can be simplified to rely only on pool_counts once proven bug-free. */
4519+ unsigned char has_rmap;
4520+ void *addr;
4521+
4522+ if (!slot->rmap_list_pool[i])
4523+ continue;
4524+
4525+ has_rmap = 0;
4526+ addr = kmap(slot->rmap_list_pool[i]);
4527+ BUG_ON(!addr);
4528+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4529+ entry = (struct rmap_list_entry *)addr + j;
4530+ if (is_addr(entry->addr))
4531+ continue;
4532+ if (!entry->item)
4533+ continue;
4534+ has_rmap = 1;
4535+ }
4536+ kunmap(slot->rmap_list_pool[i]);
4537+ if (!has_rmap) {
4538+ BUG_ON(slot->pool_counts[i]);
4539+ __free_page(slot->rmap_list_pool[i]);
4540+ slot->rmap_list_pool[i] = NULL;
4541+ }
4542+ }
4543+
4544+ slot->flags &= ~UKSM_SLOT_NEED_SORT;
4545+}
4546+
4547+/*
4548+ * vma_fully_scanned() - if all the pages in this slot have been scanned.
4549+ */
4550+static inline int vma_fully_scanned(struct vma_slot *slot)
4551+{
4552+ return slot->pages_scanned == slot->pages;
4553+}
4554+
4555+/**
4556+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4557+ * its random permutation. The random permutation index management code is
4558+ * embedded in this function.
4559+ */
4560+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4561+{
4562+ unsigned long rand_range, addr, swap_index, scan_index;
4563+ struct rmap_item *item = NULL;
4564+ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4565+ struct page *page;
4566+
4567+ scan_index = swap_index = slot->pages_scanned % slot->pages;
4568+
4569+ if (pool_entry_boundary(scan_index))
4570+ try_free_last_pool(slot, scan_index - 1);
4571+
4572+ if (vma_fully_scanned(slot)) {
4573+ if (slot->flags & UKSM_SLOT_NEED_SORT)
4574+ slot->flags |= UKSM_SLOT_NEED_RERAND;
4575+ else
4576+ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4577+ if (slot->flags & UKSM_SLOT_NEED_SORT)
4578+ sort_rmap_entry_list(slot);
4579+ }
4580+
4581+ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4582+ if (!scan_entry)
4583+ return NULL;
4584+
4585+ if (entry_is_new(scan_entry)) {
4586+ scan_entry->addr = get_index_orig_addr(slot, scan_index);
4587+ set_is_addr(scan_entry->addr);
4588+ }
4589+
4590+ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
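+	/*
+	 * When re-randomization is needed, swap the entry at scan_index with
+	 * a uniformly chosen entry in [scan_index, slot->pages) -- in effect
+	 * an incremental Fisher-Yates shuffle, so each full pass visits the
+	 * slot's pages in a fresh random permutation.
+	 */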
4591+ rand_range = slot->pages - scan_index;
4592+ BUG_ON(!rand_range);
4593+ swap_index = scan_index + (prandom_u32() % rand_range);
4594+ }
4595+
4596+ if (swap_index != scan_index) {
4597+ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4598+
4599+ if (!swap_entry)
4600+ return NULL;
4601+
4602+ if (entry_is_new(swap_entry)) {
4603+ swap_entry->addr = get_index_orig_addr(slot,
4604+ swap_index);
4605+ set_is_addr(swap_entry->addr);
4606+ }
4607+ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4608+ }
4609+
4610+ addr = get_entry_address(scan_entry);
4611+ item = get_entry_item(scan_entry);
4612+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4613+
4614+ page = follow_page(slot->vma, addr, FOLL_GET);
4615+ if (IS_ERR_OR_NULL(page))
4616+ goto nopage;
4617+
4618+ if (!PageAnon(page))
4619+ goto putpage;
4620+
4621+	/* check whether it is the zero_page pfn or the uksm_zero_page */
4622+ if ((page_to_pfn(page) == zero_pfn)
4623+ || (page_to_pfn(page) == uksm_zero_pfn))
4624+ goto putpage;
4625+
4626+ flush_anon_page(slot->vma, page, addr);
4627+ flush_dcache_page(page);
4628+
4629+
4630+ *hash = page_hash(page, hash_strength, 1);
4631+ inc_uksm_pages_scanned();
4632+	/* if the page content is all zero, re-map it to the zero page */
4633+ if (find_zero_page_hash(hash_strength, *hash)) {
4634+ if (!cmp_and_merge_zero_page(slot->vma, page)) {
4635+ slot->pages_merged++;
4636+
4637+ /* For full-zero pages, no need to create rmap item */
4638+ goto putpage;
4639+ } else {
4640+ inc_rshash_neg(memcmp_cost / 2);
4641+ }
4642+ }
4643+
4644+ if (!item) {
4645+ item = alloc_rmap_item();
4646+ if (item) {
4647+ /* It has already been zeroed */
4648+ item->slot = slot;
4649+ item->address = addr;
4650+ item->entry_index = scan_index;
4651+ scan_entry->item = item;
4652+ inc_rmap_list_pool_count(slot, scan_index);
4653+ } else
4654+ goto putpage;
4655+ }
4656+
4657+ BUG_ON(item->slot != slot);
4658+ /* the page may have changed */
4659+ item->page = page;
4660+ put_rmap_list_entry(slot, scan_index);
4661+ if (swap_entry)
4662+ put_rmap_list_entry(slot, swap_index);
4663+ return item;
4664+
4665+putpage:
4666+ put_page(page);
4667+ page = NULL;
4668+nopage:
4669+ /* no page, store addr back and free rmap_item if possible */
4670+ free_entry_item(scan_entry);
4671+ put_rmap_list_entry(slot, scan_index);
4672+ if (swap_entry)
4673+ put_rmap_list_entry(slot, swap_index);
4674+ return NULL;
4675+}
4676+
4677+static inline int in_stable_tree(struct rmap_item *rmap_item)
4678+{
4679+ return rmap_item->address & STABLE_FLAG;
4680+}
4681+
4682+/**
4683+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4684+ * mmap_lock locked.
4685+ */
4686+static noinline void scan_vma_one_page(struct vma_slot *slot)
4687+{
4688+ u32 hash;
4689+ struct mm_struct *mm;
4690+ struct rmap_item *rmap_item = NULL;
4691+ struct vm_area_struct *vma = slot->vma;
4692+
4693+ mm = vma->vm_mm;
4694+ BUG_ON(!mm);
4695+ BUG_ON(!slot);
4696+
4697+ rmap_item = get_next_rmap_item(slot, &hash);
4698+ if (!rmap_item)
4699+ goto out1;
4700+
4701+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4702+ goto out2;
4703+
4704+ cmp_and_merge_page(rmap_item, hash);
4705+out2:
4706+ put_page(rmap_item->page);
4707+out1:
4708+ slot->pages_scanned++;
4709+ slot->this_sampled++;
4710+ if (slot->fully_scanned_round != fully_scanned_round)
4711+ scanned_virtual_pages++;
4712+
4713+ if (vma_fully_scanned(slot))
4714+ slot->fully_scanned_round = fully_scanned_round;
4715+}
4716+
4717+static inline unsigned long rung_get_pages(struct scan_rung *rung)
4718+{
4719+ struct slot_tree_node *node;
4720+
4721+ if (!rung->vma_root.rnode)
4722+ return 0;
4723+
4724+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4725+
4726+ return node->size;
4727+}
4728+
4729+#define RUNG_SAMPLED_MIN 3
4730+
4731+static inline
4732+void uksm_calc_rung_step(struct scan_rung *rung,
4733+ unsigned long page_time, unsigned long ratio)
4734+{
4735+ unsigned long sampled, pages;
4736+
4737+ /* will be fully scanned ? */
4738+ if (!rung->cover_msecs) {
4739+ rung->step = 1;
4740+ return;
4741+ }
4742+
4743+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4744+ * ratio / page_time;
4745+
4746+ /*
4747+	 * Before we finish a scan round and its expensive per-round jobs,
4748+	 * we need a chance to estimate the per-page time. So the sampled
4749+	 * number cannot be too small.
4750+ */
4751+ if (sampled < RUNG_SAMPLED_MIN)
4752+ sampled = RUNG_SAMPLED_MIN;
4753+
4754+ pages = rung_get_pages(rung);
4755+ if (likely(pages > sampled))
4756+ rung->step = pages / sampled;
4757+ else
4758+ rung->step = 1;
4759+}
4760+
4761+static inline int step_need_recalc(struct scan_rung *rung)
4762+{
4763+ unsigned long pages, stepmax;
4764+
4765+ pages = rung_get_pages(rung);
4766+ stepmax = pages / RUNG_SAMPLED_MIN;
4767+
4768+ return pages && (rung->step > pages ||
4769+ (stepmax && rung->step > stepmax));
4770+}
4771+
4772+static inline
4773+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4774+{
4775+ struct vma_slot *slot;
4776+
4777+ if (finished)
4778+ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4779+
4780+ if (step_recalc || step_need_recalc(rung)) {
4781+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4782+ BUG_ON(step_need_recalc(rung));
4783+ }
4784+
4785+ slot_iter_index = prandom_u32() % rung->step;
4786+ BUG_ON(!rung->vma_root.rnode);
4787+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4788+ BUG_ON(!slot);
4789+
4790+ rung->current_scan = slot;
4791+ rung->current_offset = slot_iter_index;
4792+}
4793+
4794+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4795+{
4796+ return &slot->rung->vma_root;
4797+}
4798+
4799+/*
4800+ * Return 1 if the scan position was reset, 0 otherwise.
4801+ */
4802+static int advance_current_scan(struct scan_rung *rung)
4803+{
4804+ unsigned short n;
4805+ struct vma_slot *slot, *next = NULL;
4806+
4807+ BUG_ON(!rung->vma_root.num);
4808+
4809+ slot = rung->current_scan;
4810+ n = (slot->pages - rung->current_offset) % rung->step;
4811+ slot_iter_index = rung->step - n;
4812+ next = sradix_tree_next(&rung->vma_root, slot->snode,
4813+ slot->sindex, slot_iter);
4814+
4815+ if (next) {
4816+ rung->current_offset = slot_iter_index;
4817+ rung->current_scan = next;
4818+ return 0;
4819+ } else {
4820+ reset_current_scan(rung, 1, 0);
4821+ return 1;
4822+ }
4823+}
4824+
4825+static inline void rung_rm_slot(struct vma_slot *slot)
4826+{
4827+ struct scan_rung *rung = slot->rung;
4828+ struct sradix_tree_root *root;
4829+
4830+ if (rung->current_scan == slot)
4831+ advance_current_scan(rung);
4832+
4833+ root = slot_get_root(slot);
4834+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4835+ slot->snode = NULL;
4836+ if (step_need_recalc(rung)) {
4837+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4838+ BUG_ON(step_need_recalc(rung));
4839+ }
4840+
4841+	/* In case advance_current_scan() looped back to this slot again */
4842+ if (rung->vma_root.num && rung->current_scan == slot)
4843+ reset_current_scan(slot->rung, 1, 0);
4844+}
4845+
4846+static inline void rung_add_new_slots(struct scan_rung *rung,
4847+ struct vma_slot **slots, unsigned long num)
4848+{
4849+ int err;
4850+ struct vma_slot *slot;
4851+ unsigned long i;
4852+ struct sradix_tree_root *root = &rung->vma_root;
4853+
4854+ err = sradix_tree_enter(root, (void **)slots, num);
4855+ BUG_ON(err);
4856+
4857+ for (i = 0; i < num; i++) {
4858+ slot = slots[i];
4859+ slot->rung = rung;
4860+ BUG_ON(vma_fully_scanned(slot));
4861+ }
4862+
4863+ if (rung->vma_root.num == num)
4864+ reset_current_scan(rung, 0, 1);
4865+}
4866+
4867+static inline int rung_add_one_slot(struct scan_rung *rung,
4868+ struct vma_slot *slot)
4869+{
4870+ int err;
4871+
4872+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4873+ if (err)
4874+ return err;
4875+
4876+ slot->rung = rung;
4877+ if (rung->vma_root.num == 1)
4878+ reset_current_scan(rung, 0, 1);
4879+
4880+ return 0;
4881+}
4882+
4883+/*
4884+ * Return true if the slot is deleted from its rung.
4885+ */
4886+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4887+{
4888+ struct scan_rung *old_rung = slot->rung;
4889+ int err;
4890+
4891+ if (old_rung == rung)
4892+ return 0;
4893+
4894+ rung_rm_slot(slot);
4895+ err = rung_add_one_slot(rung, slot);
4896+ if (err) {
4897+ err = rung_add_one_slot(old_rung, slot);
4898+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4899+ }
4900+
4901+ return 1;
4902+}
4903+
4904+static inline int vma_rung_up(struct vma_slot *slot)
4905+{
4906+ struct scan_rung *rung;
4907+
4908+ rung = slot->rung;
4909+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4910+ rung++;
4911+
4912+ return vma_rung_enter(slot, rung);
4913+}
4914+
4915+static inline int vma_rung_down(struct vma_slot *slot)
4916+{
4917+ struct scan_rung *rung;
4918+
4919+ rung = slot->rung;
4920+ if (slot->rung != &uksm_scan_ladder[0])
4921+ rung--;
4922+
4923+ return vma_rung_enter(slot, rung);
4924+}
4925+
4926+/**
4927+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4928+ */
4929+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4930+{
4931+ unsigned long ret;
4932+ unsigned long pages;
4933+
4934+ pages = slot->this_sampled;
4935+ if (!pages)
4936+ return 0;
4937+
4938+ BUG_ON(slot->pages_scanned == slot->last_scanned);
4939+
4940+ ret = slot->pages_merged;
4941+
4942+ /* Thrashing area filtering */
4943+ if (ret && uksm_thrash_threshold) {
4944+ if (slot->pages_cowed * 100 / slot->pages_merged
4945+ > uksm_thrash_threshold) {
4946+ ret = 0;
4947+ } else {
4948+ ret = slot->pages_merged - slot->pages_cowed;
4949+ }
4950+ }
4951+
4952+ return ret * 100 / pages;
4953+}
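+/*
+ * Illustrative example of the thrashing filter above (numbers are made up):
+ * with uksm_thrash_threshold == 50, a slot that merged 100 pages of which
+ * 60 were COW-broken again (60% > 50%) reports a ratio of 0, while one with
+ * only 20 COW-broken pages reports (100 - 20) * 100 / this_sampled.
+ */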
4954+
4955+/**
4956+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
4957+ */
4958+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
4959+{
4960+ unsigned long ret;
4961+ unsigned long pages;
4962+
4963+ pages = slot->pages;
4964+ if (!pages)
4965+ return 0;
4966+
4967+ ret = slot->pages_bemerged;
4968+
4969+ /* Thrashing area filtering */
4970+ if (ret && uksm_thrash_threshold) {
4971+ if (slot->pages_cowed * 100 / slot->pages_bemerged
4972+ > uksm_thrash_threshold) {
4973+ ret = 0;
4974+ } else {
4975+ ret = slot->pages_bemerged - slot->pages_cowed;
4976+ }
4977+ }
4978+
4979+ return ret * 100 / pages;
4980+}
4981+
4982+/**
4983+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
4984+ * stable tree need to be restructured, this is the function re-inserting the
4985+ * stable tree needs to be restructured; this is the function that re-inserts
4986+ * a stable node.
4987+static inline void stable_node_reinsert(struct stable_node *new_node,
4988+ struct page *page,
4989+ struct rb_root *root_treep,
4990+ struct list_head *tree_node_listp,
4991+ u32 hash)
4992+{
4993+ struct rb_node **new = &root_treep->rb_node;
4994+ struct rb_node *parent = NULL;
4995+ struct stable_node *stable_node;
4996+ struct tree_node *tree_node;
4997+ struct page *tree_page;
4998+ int cmp;
4999+
5000+ while (*new) {
5001+ int cmp;
5002+
5003+ tree_node = rb_entry(*new, struct tree_node, node);
5004+
5005+ cmp = hash_cmp(hash, tree_node->hash);
5006+
5007+ if (cmp < 0) {
5008+ parent = *new;
5009+ new = &parent->rb_left;
5010+ } else if (cmp > 0) {
5011+ parent = *new;
5012+ new = &parent->rb_right;
5013+ } else
5014+ break;
5015+ }
5016+
5017+ if (*new) {
5018+		/* found a stable tree node with the same first-level hash value */
5019+ stable_node_hash_max(new_node, page, hash);
5020+ if (tree_node->count == 1) {
5021+ stable_node = rb_entry(tree_node->sub_root.rb_node,
5022+ struct stable_node, node);
5023+ tree_page = get_uksm_page(stable_node, 1, 0);
5024+ if (tree_page) {
5025+ stable_node_hash_max(stable_node,
5026+ tree_page, hash);
5027+ put_page(tree_page);
5028+
5029+ /* prepare for stable node insertion */
5030+
5031+ cmp = hash_cmp(new_node->hash_max,
5032+ stable_node->hash_max);
5033+ parent = &stable_node->node;
5034+ if (cmp < 0)
5035+ new = &parent->rb_left;
5036+ else if (cmp > 0)
5037+ new = &parent->rb_right;
5038+ else
5039+ goto failed;
5040+
5041+ goto add_node;
5042+ } else {
5043+				/* the only stable_node was deleted, but the
5044+				 * tree_node was not deleted.
5045+ */
5046+ goto tree_node_reuse;
5047+ }
5048+ }
5049+
5050+ /* well, search the collision subtree */
5051+ new = &tree_node->sub_root.rb_node;
5052+ parent = NULL;
5053+ BUG_ON(!*new);
5054+ while (*new) {
5055+ int cmp;
5056+
5057+ stable_node = rb_entry(*new, struct stable_node, node);
5058+
5059+ cmp = hash_cmp(new_node->hash_max,
5060+ stable_node->hash_max);
5061+
5062+ if (cmp < 0) {
5063+ parent = *new;
5064+ new = &parent->rb_left;
5065+ } else if (cmp > 0) {
5066+ parent = *new;
5067+ new = &parent->rb_right;
5068+ } else {
5069+ /* oh, no, still a collision */
5070+ goto failed;
5071+ }
5072+ }
5073+
5074+ goto add_node;
5075+ }
5076+
5077+ /* no tree node found */
5078+ tree_node = alloc_tree_node(tree_node_listp);
5079+ if (!tree_node) {
5080+ pr_err("UKSM: memory allocation error!\n");
5081+ goto failed;
5082+ } else {
5083+ tree_node->hash = hash;
5084+ rb_link_node(&tree_node->node, parent, new);
5085+ rb_insert_color(&tree_node->node, root_treep);
5086+
5087+tree_node_reuse:
5088+ /* prepare for stable node insertion */
5089+ parent = NULL;
5090+ new = &tree_node->sub_root.rb_node;
5091+ }
5092+
5093+add_node:
5094+ rb_link_node(&new_node->node, parent, new);
5095+ rb_insert_color(&new_node->node, &tree_node->sub_root);
5096+ new_node->tree_node = tree_node;
5097+ tree_node->count++;
5098+ return;
5099+
5100+failed:
5101+ /* This can only happen when two nodes have collided
5102+	/* This can only happen when two nodes have collided
5103+	 * at both hash levels.
5104+ new_node->tree_node = NULL;
5105+ return;
5106+}
5107+
5108+static inline void free_all_tree_nodes(struct list_head *list)
5109+{
5110+ struct tree_node *node, *tmp;
5111+
5112+ list_for_each_entry_safe(node, tmp, list, all_list) {
5113+ free_tree_node(node);
5114+ }
5115+}
5116+
5117+/**
5118+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5119+ * strength to the current hash_strength. It re-structures the hole tree.
5120+ * strength to the current hash_strength. It re-structures the whole tree.
5121+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5122+{
5123+ struct stable_node *node, *tmp;
5124+ struct rb_root *root_new_treep;
5125+ struct list_head *new_tree_node_listp;
5126+
5127+ stable_tree_index = (stable_tree_index + 1) % 2;
5128+ root_new_treep = &root_stable_tree[stable_tree_index];
5129+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5130+ *root_new_treep = RB_ROOT;
5131+ BUG_ON(!list_empty(new_tree_node_listp));
5132+
5133+ /*
5134+ * we need to be safe, the node could be removed by get_uksm_page()
5135+ */
5136+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5137+ void *addr;
5138+ struct page *node_page;
5139+ u32 hash;
5140+
5141+ /*
5142+ * We are completely re-structuring the stable nodes to a new
5143+		 * stable tree, so we don't bother unlinking from the old tree
5144+		 * or touching old tree_nodes; those will be freed all at once.
5145+ */
5146+ node_page = get_uksm_page(node, 0, 0);
5147+ if (!node_page)
5148+ continue;
5149+
5150+ if (node->tree_node) {
5151+ hash = node->tree_node->hash;
5152+
5153+ addr = kmap_atomic(node_page);
5154+
5155+ hash = delta_hash(addr, prev_hash_strength,
5156+ hash_strength, hash);
5157+ kunmap_atomic(addr);
5158+ } else {
5159+ /*
5160+			 * It was not inserted into the rbtree due to a collision
5161+			 * in the last scan round.
5162+ */
5163+ hash = page_hash(node_page, hash_strength, 0);
5164+ }
5165+
5166+ stable_node_reinsert(node, node_page, root_new_treep,
5167+ new_tree_node_listp, hash);
5168+ put_page(node_page);
5169+ }
5170+
5171+ root_stable_treep = root_new_treep;
5172+ free_all_tree_nodes(stable_tree_node_listp);
5173+ BUG_ON(!list_empty(stable_tree_node_listp));
5174+ stable_tree_node_listp = new_tree_node_listp;
5175+}
5176+
5177+static inline void inc_hash_strength(unsigned long delta)
5178+{
5179+ hash_strength += 1 << delta;
5180+ if (hash_strength > HASH_STRENGTH_MAX)
5181+ hash_strength = HASH_STRENGTH_MAX;
5182+}
5183+
5184+static inline void dec_hash_strength(unsigned long delta)
5185+{
5186+ unsigned long change = 1 << delta;
5187+
5188+ if (hash_strength <= change + 1)
5189+ hash_strength = 1;
5190+ else
5191+ hash_strength -= change;
5192+}
5193+
5194+static inline void inc_hash_strength_delta(void)
5195+{
5196+ hash_strength_delta++;
5197+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5198+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5199+}
5200+
5201+static inline unsigned long get_current_neg_ratio(void)
5202+{
5203+ u64 pos = benefit.pos;
5204+ u64 neg = benefit.neg;
5205+
5206+ if (!neg)
5207+ return 0;
5208+
5209+ if (!pos || neg > pos)
5210+ return 100;
5211+
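+	/* compute neg * 100 / pos without overflowing the multiplication */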
5212+ if (neg > div64_u64(U64_MAX, 100))
5213+ pos = div64_u64(pos, 100);
5214+ else
5215+ neg *= 100;
5216+
5217+ return div64_u64(neg, pos);
5218+}
5219+
5220+static inline unsigned long get_current_benefit(void)
5221+{
5222+ u64 pos = benefit.pos;
5223+ u64 neg = benefit.neg;
5224+ u64 scanned = benefit.scanned;
5225+
5226+ if (neg > pos)
5227+ return 0;
5228+
5229+ return div64_u64((pos - neg), scanned);
5230+}
5231+
5232+static inline int judge_rshash_direction(void)
5233+{
5234+ u64 current_neg_ratio, stable_benefit;
5235+ u64 current_benefit, delta = 0;
5236+ int ret = STILL;
5237+
5238+ /*
5239+	 * Try to probe a value right after boot, and again in case the
5240+	 * system stays still for a long time.
5241+ */
5242+ if ((fully_scanned_round & 0xFFULL) == 10) {
5243+ ret = OBSCURE;
5244+ goto out;
5245+ }
5246+
5247+ current_neg_ratio = get_current_neg_ratio();
5248+
5249+ if (current_neg_ratio == 0) {
5250+ rshash_neg_cont_zero++;
5251+ if (rshash_neg_cont_zero > 2)
5252+ return GO_DOWN;
5253+ else
5254+ return STILL;
5255+ }
5256+ rshash_neg_cont_zero = 0;
5257+
5258+ if (current_neg_ratio > 90) {
5259+ ret = GO_UP;
5260+ goto out;
5261+ }
5262+
5263+ current_benefit = get_current_benefit();
5264+ stable_benefit = rshash_state.stable_benefit;
5265+
5266+ if (!stable_benefit) {
5267+ ret = OBSCURE;
5268+ goto out;
5269+ }
5270+
5271+ if (current_benefit > stable_benefit)
5272+ delta = current_benefit - stable_benefit;
5273+ else if (current_benefit < stable_benefit)
5274+ delta = stable_benefit - current_benefit;
5275+
5276+ delta = div64_u64(100 * delta, stable_benefit);
5277+
5278+ if (delta > 50) {
5279+ rshash_cont_obscure++;
5280+ if (rshash_cont_obscure > 2)
5281+ return OBSCURE;
5282+ else
5283+ return STILL;
5284+ }
5285+
5286+out:
5287+ rshash_cont_obscure = 0;
5288+ return ret;
5289+}
5290+
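+/*
+ * Rough sketch of the state machine implemented in rshash_adjust() below
+ * (derived from the code): RSHASH_STILL probes the benefit and nudges
+ * hash_strength up or down; when the benefit becomes OBSCURE it records the
+ * current point and enters RSHASH_TRYDOWN, which keeps lowering the strength
+ * while that pays off, then returns to the recorded point and enters
+ * RSHASH_TRYUP; RSHASH_TRYUP raises the strength likewise, then settles on
+ * whichever turning point had the better benefit and goes back to
+ * RSHASH_STILL via RSHASH_PRE_STILL.
+ */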
5291+/**
5292+ * rshash_adjust() - The main function to control the random sampling state
5293+ * machine for hash strength adaptation.
5294+ *
5295+ * return true if hash_strength has changed.
5296+ */
5297+static inline int rshash_adjust(void)
5298+{
5299+ unsigned long prev_hash_strength = hash_strength;
5300+
5301+ if (!encode_benefit())
5302+ return 0;
5303+
5304+ switch (rshash_state.state) {
5305+ case RSHASH_STILL:
5306+ switch (judge_rshash_direction()) {
5307+ case GO_UP:
5308+ if (rshash_state.pre_direct == GO_DOWN)
5309+ hash_strength_delta = 0;
5310+
5311+ inc_hash_strength(hash_strength_delta);
5312+ inc_hash_strength_delta();
5313+ rshash_state.stable_benefit = get_current_benefit();
5314+ rshash_state.pre_direct = GO_UP;
5315+ break;
5316+
5317+ case GO_DOWN:
5318+ if (rshash_state.pre_direct == GO_UP)
5319+ hash_strength_delta = 0;
5320+
5321+ dec_hash_strength(hash_strength_delta);
5322+ inc_hash_strength_delta();
5323+ rshash_state.stable_benefit = get_current_benefit();
5324+ rshash_state.pre_direct = GO_DOWN;
5325+ break;
5326+
5327+ case OBSCURE:
5328+ rshash_state.stable_point = hash_strength;
5329+ rshash_state.turn_point_down = hash_strength;
5330+ rshash_state.turn_point_up = hash_strength;
5331+ rshash_state.turn_benefit_down = get_current_benefit();
5332+ rshash_state.turn_benefit_up = get_current_benefit();
5333+ rshash_state.lookup_window_index = 0;
5334+ rshash_state.state = RSHASH_TRYDOWN;
5335+ dec_hash_strength(hash_strength_delta);
5336+ inc_hash_strength_delta();
5337+ break;
5338+
5339+ case STILL:
5340+ break;
5341+ default:
5342+ BUG();
5343+ }
5344+ break;
5345+
5346+ case RSHASH_TRYDOWN:
5347+ if (rshash_state.lookup_window_index++ % 5 == 0)
5348+ rshash_state.below_count = 0;
5349+
5350+ if (get_current_benefit() < rshash_state.stable_benefit)
5351+ rshash_state.below_count++;
5352+ else if (get_current_benefit() >
5353+ rshash_state.turn_benefit_down) {
5354+ rshash_state.turn_point_down = hash_strength;
5355+ rshash_state.turn_benefit_down = get_current_benefit();
5356+ }
5357+
5358+ if (rshash_state.below_count >= 3 ||
5359+ judge_rshash_direction() == GO_UP ||
5360+ hash_strength == 1) {
5361+ hash_strength = rshash_state.stable_point;
5362+ hash_strength_delta = 0;
5363+ inc_hash_strength(hash_strength_delta);
5364+ inc_hash_strength_delta();
5365+ rshash_state.lookup_window_index = 0;
5366+ rshash_state.state = RSHASH_TRYUP;
5367+ hash_strength_delta = 0;
5368+ } else {
5369+ dec_hash_strength(hash_strength_delta);
5370+ inc_hash_strength_delta();
5371+ }
5372+ break;
5373+
5374+ case RSHASH_TRYUP:
5375+ if (rshash_state.lookup_window_index++ % 5 == 0)
5376+ rshash_state.below_count = 0;
5377+
5378+ if (get_current_benefit() < rshash_state.turn_benefit_down)
5379+ rshash_state.below_count++;
5380+ else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5381+ rshash_state.turn_point_up = hash_strength;
5382+ rshash_state.turn_benefit_up = get_current_benefit();
5383+ }
5384+
5385+ if (rshash_state.below_count >= 3 ||
5386+ judge_rshash_direction() == GO_DOWN ||
5387+ hash_strength == HASH_STRENGTH_MAX) {
5388+ hash_strength = rshash_state.turn_benefit_up >
5389+ rshash_state.turn_benefit_down ?
5390+ rshash_state.turn_point_up :
5391+ rshash_state.turn_point_down;
5392+
5393+ rshash_state.state = RSHASH_PRE_STILL;
5394+ } else {
5395+ inc_hash_strength(hash_strength_delta);
5396+ inc_hash_strength_delta();
5397+ }
5398+
5399+ break;
5400+
5401+ case RSHASH_NEW:
5402+ case RSHASH_PRE_STILL:
5403+ rshash_state.stable_benefit = get_current_benefit();
5404+ rshash_state.state = RSHASH_STILL;
5405+ hash_strength_delta = 0;
5406+ break;
5407+ default:
5408+ BUG();
5409+ }
5410+
5411+ /* rshash_neg = rshash_pos = 0; */
5412+ reset_benefit();
5413+
5414+ if (prev_hash_strength != hash_strength)
5415+ stable_tree_delta_hash(prev_hash_strength);
5416+
5417+ return prev_hash_strength != hash_strength;
5418+}
5419+
5420+/**
5421+ * round_update_ladder() - The main function to do update of all the
5422+ * adjustments whenever a scan round is finished.
5423+ */
5424+static noinline void round_update_ladder(void)
5425+{
5426+ int i;
5427+ unsigned long dedup;
5428+ struct vma_slot *slot, *tmp_slot;
5429+
5430+ for (i = 0; i < SCAN_LADDER_SIZE; i++)
5431+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5432+
5433+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5434+
5435+		/* the slot may have been removed by rung_rm_slot() when its mm exited */
5436+ if (slot->snode) {
5437+ dedup = cal_dedup_ratio_old(slot);
5438+ if (dedup && dedup >= uksm_abundant_threshold)
5439+ vma_rung_up(slot);
5440+ }
5441+
5442+ slot->pages_bemerged = 0;
5443+ slot->pages_cowed = 0;
5444+
5445+ list_del_init(&slot->dedup_list);
5446+ }
5447+}
5448+
5449+static void uksm_del_vma_slot(struct vma_slot *slot)
5450+{
5451+ int i, j;
5452+ struct rmap_list_entry *entry;
5453+
5454+ if (slot->snode) {
5455+ /*
5456+		 * rung_rm_slot() is not needed if the slot failed when entering
5457+		 * the rung; slot->snode is NULL in that case.
5458+ */
5459+ rung_rm_slot(slot);
5460+ }
5461+
5462+ if (!list_empty(&slot->dedup_list))
5463+ list_del(&slot->dedup_list);
5464+
5465+ if (!slot->rmap_list_pool || !slot->pool_counts) {
5466+ /* In case it OOMed in uksm_vma_enter() */
5467+ goto out;
5468+ }
5469+
5470+ for (i = 0; i < slot->pool_size; i++) {
5471+ void *addr;
5472+
5473+ if (!slot->rmap_list_pool[i])
5474+ continue;
5475+
5476+ addr = kmap(slot->rmap_list_pool[i]);
5477+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5478+ entry = (struct rmap_list_entry *)addr + j;
5479+ if (is_addr(entry->addr))
5480+ continue;
5481+ if (!entry->item)
5482+ continue;
5483+
5484+ remove_rmap_item_from_tree(entry->item);
5485+ free_rmap_item(entry->item);
5486+ slot->pool_counts[i]--;
5487+ }
5488+ BUG_ON(slot->pool_counts[i]);
5489+ kunmap(slot->rmap_list_pool[i]);
5490+ __free_page(slot->rmap_list_pool[i]);
5491+ }
5492+ kfree(slot->rmap_list_pool);
5493+ kfree(slot->pool_counts);
5494+
5495+out:
5496+ slot->rung = NULL;
5497+ if (slot->flags & UKSM_SLOT_IN_UKSM) {
5498+ BUG_ON(uksm_pages_total < slot->pages);
5499+ uksm_pages_total -= slot->pages;
5500+ }
5501+
5502+ if (slot->fully_scanned_round == fully_scanned_round)
5503+ scanned_virtual_pages -= slot->pages;
5504+ else
5505+ scanned_virtual_pages -= slot->pages_scanned;
5506+ free_vma_slot(slot);
5507+}
5508+
5509+
5510+#define SPIN_LOCK_PERIOD 32
5511+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5512+static inline void cleanup_vma_slots(void)
5513+{
5514+ struct vma_slot *slot;
5515+ int i;
5516+
5517+ i = 0;
5518+ spin_lock(&vma_slot_list_lock);
5519+ while (!list_empty(&vma_slot_del)) {
5520+ slot = list_entry(vma_slot_del.next,
5521+ struct vma_slot, slot_list);
5522+ list_del(&slot->slot_list);
5523+ cleanup_slots[i++] = slot;
5524+ if (i == SPIN_LOCK_PERIOD) {
5525+ spin_unlock(&vma_slot_list_lock);
5526+ while (--i >= 0)
5527+ uksm_del_vma_slot(cleanup_slots[i]);
5528+ i = 0;
5529+ spin_lock(&vma_slot_list_lock);
5530+ }
5531+ }
5532+ spin_unlock(&vma_slot_list_lock);
5533+
5534+ while (--i >= 0)
5535+ uksm_del_vma_slot(cleanup_slots[i]);
5536+}
5537+
5538+/*
5539+ * Exponential moving average formula
5540+ */
5541+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5542+{
5543+ /*
5544+	 * For a very high burst, even the EMA cannot work well: a falsely
5545+	 * very high per-page time estimate feeds back into very high
5546+	 * context-switch and rung-update overhead, which then leads to an
5547+	 * even higher per-page time, so it may not converge.
5548+ *
5549+ * Instead, we try to approach this value in a binary manner.
5550+ */
5551+ if (curr > last_ema * 10)
5552+ return last_ema * 2;
5553+
5554+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
5555+}
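+/*
+ * Worked example for ema() above (EMA_ALPHA == 20 is assumed here purely
+ * for illustration): curr = 900 and last_ema = 1000 give
+ * (20 * 900 + 80 * 1000) / 100 = 980; a burst like curr = 50000 with
+ * last_ema = 1000 exceeds 10 * last_ema and is clamped to 2000 instead.
+ */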
5556+
5557+/*
5558+ * Convert a cpu ratio, in units of 1/TIME_RATIO_SCALE configured by the user,
5559+ * to a scan-time budget in nanoseconds based on current uksm_sleep_jiffies.
5560+ */
5561+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5562+{
5563+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5564+ (TIME_RATIO_SCALE - ratio) * ratio;
5565+}
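+/*
+ * Example (assuming, for illustration only, TIME_RATIO_SCALE == 100 and a
+ * 10 ms sleep period): ratio == 20 gives 10,000,000 ns / 80 * 20 =
+ * 2,500,000 ns of scan time per period, i.e. 2.5 / (2.5 + 10) = 20% cpu.
+ */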
5566+
5567+
5568+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5569+{
5570+ unsigned long ret;
5571+
5572+ BUG_ON(!cpu_time_ratio);
5573+
5574+ if (cpu_time_ratio > 0)
5575+ ret = cpu_time_ratio;
5576+ else
5577+ ret = (unsigned long)(-cpu_time_ratio) *
5578+ uksm_max_cpu_percentage / 100UL;
5579+
5580+ return ret ? ret : 1;
5581+}
5582+
5583+static noinline void uksm_calc_scan_pages(void)
5584+{
5585+ struct scan_rung *ladder = uksm_scan_ladder;
5586+ unsigned long sleep_usecs, nsecs;
5587+ unsigned long ratio;
5588+ int i;
5589+ unsigned long per_page;
5590+
5591+ if (uksm_ema_page_time > 100000 ||
5592+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5593+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5594+
5595+ per_page = uksm_ema_page_time;
5596+ BUG_ON(!per_page);
5597+
5598+ /*
5599+	 * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
5600+	 * based on the saved user input.
5601+ */
5602+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5603+ uksm_sleep_jiffies = uksm_sleep_saved;
5604+
5605+	/* We require a rung to scan at least 1 page in a period. */
5606+ nsecs = per_page;
5607+ ratio = rung_real_ratio(ladder[0].cpu_ratio);
5608+ if (cpu_ratio_to_nsec(ratio) < nsecs) {
5609+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5610+ / NSEC_PER_USEC;
5611+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5612+ }
5613+
5614+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5615+ ratio = rung_real_ratio(ladder[i].cpu_ratio);
5616+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5617+ per_page;
5618+ BUG_ON(!ladder[i].pages_to_scan);
5619+ uksm_calc_rung_step(&ladder[i], per_page, ratio);
5620+ }
5621+}
5622+
5623+/*
5624+ * Convert the scan time of this round (ns) to the next expected minimum
5625+ * sleep time (ms); be careful of possible overflows. ratio is taken from
5626+ * rung_real_ratio().
5627+ */
5628+static inline
5629+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5630+{
5631+ scan_time >>= 20; /* to msec level now */
5632+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5633+
5634+ return (unsigned int) ((unsigned long) scan_time *
5635+ (TIME_RATIO_SCALE - ratio) / ratio);
5636+}
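+/*
+ * Example (again assuming TIME_RATIO_SCALE == 100 for illustration): a scan
+ * round of ~4 ms (scan_time >> 20) with ratio == 20 yields a minimum sleep
+ * of 4 * 80 / 20 = 16 ms, keeping the cpu share near 20%.
+ */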
5637+
5638+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5639+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5640+
5641+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5642+{
5643+ struct scan_rung *rung;
5644+
5645+ rung = &uksm_scan_ladder[0];
5646+ rung_add_new_slots(rung, slots, num);
5647+}
5648+
5649+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5650+
5651+static void uksm_enter_all_slots(void)
5652+{
5653+ struct vma_slot *slot;
5654+ unsigned long index;
5655+ struct list_head empty_vma_list;
5656+ int i;
5657+
5658+ i = 0;
5659+ index = 0;
5660+ INIT_LIST_HEAD(&empty_vma_list);
5661+
5662+ spin_lock(&vma_slot_list_lock);
5663+ while (!list_empty(&vma_slot_new)) {
5664+ slot = list_entry(vma_slot_new.next,
5665+ struct vma_slot, slot_list);
5666+
5667+ if (!slot->vma->anon_vma) {
5668+ list_move(&slot->slot_list, &empty_vma_list);
5669+ } else if (vma_can_enter(slot->vma)) {
5670+ batch_slots[index++] = slot;
5671+ list_del_init(&slot->slot_list);
5672+ } else {
5673+ list_move(&slot->slot_list, &vma_slot_noadd);
5674+ }
5675+
5676+ if (++i == SPIN_LOCK_PERIOD ||
5677+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5678+ spin_unlock(&vma_slot_list_lock);
5679+
5680+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5681+ uksm_vma_enter(batch_slots, index);
5682+ index = 0;
5683+ }
5684+ i = 0;
5685+ cond_resched();
5686+ spin_lock(&vma_slot_list_lock);
5687+ }
5688+ }
5689+
5690+ list_splice(&empty_vma_list, &vma_slot_new);
5691+
5692+ spin_unlock(&vma_slot_list_lock);
5693+
5694+ if (index)
5695+ uksm_vma_enter(batch_slots, index);
5696+
5697+}
5698+
5699+static inline int rung_round_finished(struct scan_rung *rung)
5700+{
5701+ return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5702+}
5703+
5704+static inline void judge_slot(struct vma_slot *slot)
5705+{
5706+ struct scan_rung *rung = slot->rung;
5707+ unsigned long dedup;
5708+ int deleted;
5709+
5710+ dedup = cal_dedup_ratio(slot);
5711+ if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5712+ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5713+ else if (dedup && dedup >= uksm_abundant_threshold)
5714+ deleted = vma_rung_up(slot);
5715+ else
5716+ deleted = vma_rung_down(slot);
5717+
5718+ slot->pages_merged = 0;
5719+ slot->pages_cowed = 0;
5720+ slot->this_sampled = 0;
5721+
5722+ if (vma_fully_scanned(slot))
5723+ slot->pages_scanned = 0;
5724+
5725+ slot->last_scanned = slot->pages_scanned;
5726+
5727+	/* If it was deleted above, the rung was already advanced. */
5728+ if (!deleted)
5729+ advance_current_scan(rung);
5730+}
5731+
5732+
5733+static inline int hash_round_finished(void)
5734+{
5735+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5736+ scanned_virtual_pages = 0;
5737+ if (uksm_pages_scanned)
5738+ fully_scanned_round++;
5739+
5740+ return 1;
5741+ } else {
5742+ return 0;
5743+ }
5744+}
5745+
5746+#define UKSM_MMSEM_BATCH 5
5747+#define BUSY_RETRY 100
5748+
5749+/**
5750+ * uksm_do_scan() - the main worker function.
5751+ */
5752+static noinline void uksm_do_scan(void)
5753+{
5754+ struct vma_slot *slot, *iter;
5755+ struct mm_struct *busy_mm;
5756+	unsigned char round_finished, all_rungs_empty;
5757+ int i, err, mmsem_batch;
5758+ unsigned long pcost;
5759+ long long delta_exec;
5760+ unsigned long vpages, max_cpu_ratio;
5761+ unsigned long long start_time, end_time, scan_time;
5762+ unsigned int expected_jiffies;
5763+
5764+ might_sleep();
5765+
5766+ vpages = 0;
5767+
5768+ start_time = task_sched_runtime(current);
5769+ max_cpu_ratio = 0;
5770+ mmsem_batch = 0;
5771+
5772+ for (i = 0; i < SCAN_LADDER_SIZE;) {
5773+ struct scan_rung *rung = &uksm_scan_ladder[i];
5774+ unsigned long ratio;
5775+ int busy_retry;
5776+
5777+ if (!rung->pages_to_scan) {
5778+ i++;
5779+ continue;
5780+ }
5781+
5782+ if (!rung->vma_root.num) {
5783+ rung->pages_to_scan = 0;
5784+ i++;
5785+ continue;
5786+ }
5787+
5788+ ratio = rung_real_ratio(rung->cpu_ratio);
5789+ if (ratio > max_cpu_ratio)
5790+ max_cpu_ratio = ratio;
5791+
5792+ busy_retry = BUSY_RETRY;
5793+ /*
5794+		 * Do not consider rung_round_finished() here, just use up the
5795+ * rung->pages_to_scan quota.
5796+ */
5797+ while (rung->pages_to_scan && rung->vma_root.num &&
5798+ likely(!freezing(current))) {
5799+ int reset = 0;
5800+
5801+ slot = rung->current_scan;
5802+
5803+ BUG_ON(vma_fully_scanned(slot));
5804+
5805+ if (mmsem_batch)
5806+ err = 0;
5807+ else
5808+ err = try_down_read_slot_mmap_lock(slot);
5809+
5810+ if (err == -ENOENT) {
5811+rm_slot:
5812+ rung_rm_slot(slot);
5813+ continue;
5814+ }
5815+
5816+ busy_mm = slot->mm;
5817+
5818+ if (err == -EBUSY) {
5819+ /* skip other vmas on the same mm */
5820+ do {
5821+ reset = advance_current_scan(rung);
5822+ iter = rung->current_scan;
5823+ busy_retry--;
5824+ if (iter->vma->vm_mm != busy_mm ||
5825+ !busy_retry || reset)
5826+ break;
5827+ } while (1);
5828+
5829+ if (iter->vma->vm_mm != busy_mm) {
5830+ continue;
5831+ } else {
5832+					/* scan round finished */
5833+ break;
5834+ }
5835+ }
5836+
5837+ BUG_ON(!vma_can_enter(slot->vma));
5838+ if (uksm_test_exit(slot->vma->vm_mm)) {
5839+ mmsem_batch = 0;
5840+ mmap_read_unlock(slot->vma->vm_mm);
5841+ goto rm_slot;
5842+ }
5843+
5844+ if (mmsem_batch)
5845+ mmsem_batch--;
5846+ else
5847+ mmsem_batch = UKSM_MMSEM_BATCH;
5848+
5849+			/* OK, we have taken the mmap_lock, ready to scan */
5850+ scan_vma_one_page(slot);
5851+ rung->pages_to_scan--;
5852+ vpages++;
5853+
5854+ if (rung->current_offset + rung->step > slot->pages - 1
5855+ || vma_fully_scanned(slot)) {
5856+ mmap_read_unlock(slot->vma->vm_mm);
5857+ judge_slot(slot);
5858+ mmsem_batch = 0;
5859+ } else {
5860+ rung->current_offset += rung->step;
5861+ if (!mmsem_batch)
5862+ mmap_read_unlock(slot->vma->vm_mm);
5863+ }
5864+
5865+ busy_retry = BUSY_RETRY;
5866+ cond_resched();
5867+ }
5868+
5869+ if (mmsem_batch) {
5870+ mmap_read_unlock(slot->vma->vm_mm);
5871+ mmsem_batch = 0;
5872+ }
5873+
5874+ if (freezing(current))
5875+ break;
5876+
5877+ cond_resched();
5878+ }
5879+ end_time = task_sched_runtime(current);
5880+ delta_exec = end_time - start_time;
5881+
5882+ if (freezing(current))
5883+ return;
5884+
5885+ cleanup_vma_slots();
5886+ uksm_enter_all_slots();
5887+
5888+ round_finished = 1;
5889+	all_rungs_empty = 1;
5890+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5891+ struct scan_rung *rung = &uksm_scan_ladder[i];
5892+
5893+ if (rung->vma_root.num) {
5894+			all_rungs_empty = 0;
5895+ if (!rung_round_finished(rung))
5896+ round_finished = 0;
5897+ }
5898+ }
5899+
5900+	if (all_rungs_empty)
5901+ round_finished = 0;
5902+
5903+ if (round_finished) {
5904+ round_update_ladder();
5905+ uksm_eval_round++;
5906+
5907+ if (hash_round_finished() && rshash_adjust()) {
5908+ /* Reset the unstable root iff hash strength changed */
5909+ uksm_hash_round++;
5910+ root_unstable_tree = RB_ROOT;
5911+ free_all_tree_nodes(&unstable_tree_node_list);
5912+ }
5913+
5914+ /*
5915+ * A number of pages can hang around indefinitely on per-cpu
5916+ * pagevecs, raised page count preventing write_protect_page
5917+ * from merging them. Though it doesn't really matter much,
5918+ * it is puzzling to see some stuck in pages_volatile until
5919+ * other activity jostles them out, and they also prevented
5920+ * LTP's KSM test from succeeding deterministically; so drain
5921+ * them here (here rather than on entry to uksm_do_scan(),
5922+ * so we don't IPI too often when pages_to_scan is set low).
5923+ */
5924+ lru_add_drain_all();
5925+ }
5926+
5927+
5928+ if (vpages && delta_exec > 0) {
5929+ pcost = (unsigned long) delta_exec / vpages;
5930+ if (likely(uksm_ema_page_time))
5931+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5932+ else
5933+ uksm_ema_page_time = pcost;
5934+ }
5935+
5936+ uksm_calc_scan_pages();
5937+ uksm_sleep_real = uksm_sleep_jiffies;
5938+	/* in case of sudden CPU bursts, apply the upper bound */
5939+ end_time = task_sched_runtime(current);
5940+ if (max_cpu_ratio && end_time > start_time) {
5941+ scan_time = end_time - start_time;
5942+ expected_jiffies = msecs_to_jiffies(
5943+ scan_time_to_sleep(scan_time, max_cpu_ratio));
5944+
5945+ if (expected_jiffies > uksm_sleep_real)
5946+ uksm_sleep_real = expected_jiffies;
5947+
5948+		/* We have a 1 second upper bound for responsiveness. */
5949+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
5950+ uksm_sleep_real = msecs_to_jiffies(1000);
5951+ }
5952+
5953+ return;
5954+}
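The bookkeeping above divides the scheduler time spent in this pass (delta_exec) by the pages visited (vpages) and folds the resulting per-page cost into uksm_ema_page_time via ema(). The sketch below shows the kind of exponential moving average that implies; ema() itself is defined elsewhere in this patch, so EMA_ALPHA and the 100-point scale here are illustrative assumptions, not the patch's actual constants.

/* Illustrative only -- not part of the patch. */
#include <stdio.h>

#define EMA_ALPHA 20	/* assumed weight (percent) given to the newest sample */

static unsigned long ema_sketch(unsigned long sample, unsigned long old)
{
	/* blend the newest per-page cost into the running average */
	return (EMA_ALPHA * sample + (100 - EMA_ALPHA) * old) / 100;
}

int main(void)
{
	unsigned long ema = 0;
	unsigned long delta_exec = 5000000;	/* example: ns of CPU time spent in one pass */
	unsigned long vpages = 2500;		/* example: pages visited in that pass */
	unsigned long pcost = delta_exec / vpages;

	/* the first sample seeds the average, as in uksm_do_scan() */
	ema = ema ? ema_sketch(pcost, ema) : pcost;
	printf("per-page cost: %lu ns, ema: %lu ns\n", pcost, ema);
	return 0;
}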
5955+
5956+static int ksmd_should_run(void)
5957+{
5958+ return uksm_run & UKSM_RUN_MERGE;
5959+}
5960+
5961+static int uksm_scan_thread(void *nothing)
5962+{
5963+ set_freezable();
5964+ set_user_nice(current, 5);
5965+
5966+ while (!kthread_should_stop()) {
5967+ mutex_lock(&uksm_thread_mutex);
5968+ if (ksmd_should_run())
5969+ uksm_do_scan();
5970+ mutex_unlock(&uksm_thread_mutex);
5971+
5972+ try_to_freeze();
5973+
5974+ if (ksmd_should_run()) {
5975+ schedule_timeout_interruptible(uksm_sleep_real);
5976+ uksm_sleep_times++;
5977+ } else {
5978+ wait_event_freezable(uksm_thread_wait,
5979+ ksmd_should_run() || kthread_should_stop());
5980+ }
5981+ }
5982+ return 0;
5983+}
5984+
5985+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
5986+{
5987+ struct stable_node *stable_node;
5988+ struct node_vma *node_vma;
5989+ struct rmap_item *rmap_item;
5990+ int search_new_forks = 0;
5991+ unsigned long address;
5992+
5993+ VM_BUG_ON_PAGE(!PageKsm(page), page);
5994+ VM_BUG_ON_PAGE(!PageLocked(page), page);
5995+
5996+ stable_node = page_stable_node(page);
5997+ if (!stable_node)
5998+ return;
5999+again:
6000+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6001+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6002+ struct anon_vma *anon_vma = rmap_item->anon_vma;
6003+ struct anon_vma_chain *vmac;
6004+ struct vm_area_struct *vma;
6005+
6006+ cond_resched();
6007+ anon_vma_lock_read(anon_vma);
6008+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6009+ 0, ULONG_MAX) {
6010+ cond_resched();
6011+ vma = vmac->vma;
6012+ address = get_rmap_addr(rmap_item);
6013+
6014+ if (address < vma->vm_start ||
6015+ address >= vma->vm_end)
6016+ continue;
6017+
6018+ if ((rmap_item->slot->vma == vma) ==
6019+ search_new_forks)
6020+ continue;
6021+
6022+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6023+ continue;
6024+
6025+ if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
6026+ anon_vma_unlock_read(anon_vma);
6027+ return;
6028+ }
6029+
6030+ if (rwc->done && rwc->done(page)) {
6031+ anon_vma_unlock_read(anon_vma);
6032+ return;
6033+ }
6034+ }
6035+ anon_vma_unlock_read(anon_vma);
6036+ }
6037+ }
6038+ if (!search_new_forks++)
6039+ goto again;
6040+}
6041+
6042+#ifdef CONFIG_MIGRATION
6043+/* Common ksm interface, but the implementation may be specific to uksm */
6044+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6045+{
6046+ struct stable_node *stable_node;
6047+
6048+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6049+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6050+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
6051+
6052+ stable_node = page_stable_node(newpage);
6053+ if (stable_node) {
6054+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6055+ stable_node->kpfn = page_to_pfn(newpage);
6056+ /*
6057+ * newpage->mapping was set in advance; now we need smp_wmb()
6058+ * to make sure that the new stable_node->kpfn is visible
6059+ * to get_ksm_page() before it can see that oldpage->mapping
6060+ * has gone stale (or that PageSwapCache has been cleared).
6061+ */
6062+ smp_wmb();
6063+ set_page_stable_node(oldpage, NULL);
6064+ }
6065+}
6066+#endif /* CONFIG_MIGRATION */
6067+
6068+#ifdef CONFIG_MEMORY_HOTREMOVE
6069+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6070+ unsigned long end_pfn)
6071+{
6072+ struct rb_node *node;
6073+
6074+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6075+ struct stable_node *stable_node;
6076+
6077+ stable_node = rb_entry(node, struct stable_node, node);
6078+ if (stable_node->kpfn >= start_pfn &&
6079+ stable_node->kpfn < end_pfn)
6080+ return stable_node;
6081+ }
6082+ return NULL;
6083+}
6084+
6085+static int uksm_memory_callback(struct notifier_block *self,
6086+ unsigned long action, void *arg)
6087+{
6088+ struct memory_notify *mn = arg;
6089+ struct stable_node *stable_node;
6090+
6091+ switch (action) {
6092+ case MEM_GOING_OFFLINE:
6093+ /*
6094+ * Keep it very simple for now: just lock out ksmd and
6095+ * MADV_UNMERGEABLE while any memory is going offline.
6096+ * mutex_lock_nested() is necessary because lockdep was alarmed
6097+ * that here we take uksm_thread_mutex inside notifier chain
6098+ * mutex, and later take notifier chain mutex inside
6099+ * uksm_thread_mutex to unlock it. But that's safe because both
6100+ * are inside mem_hotplug_mutex.
6101+ */
6102+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6103+ break;
6104+
6105+ case MEM_OFFLINE:
6106+ /*
6107+ * Most of the work is done by page migration; but there might
6108+ * be a few stable_nodes left over, still pointing to struct
6109+ * pages which have been offlined: prune those from the tree.
6110+ */
6111+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6112+ mn->start_pfn + mn->nr_pages)) != NULL)
6113+ remove_node_from_stable_tree(stable_node, 1, 1);
6114+ /* fallthrough */
6115+
6116+ case MEM_CANCEL_OFFLINE:
6117+ mutex_unlock(&uksm_thread_mutex);
6118+ break;
6119+ }
6120+ return NOTIFY_OK;
6121+}
6122+#endif /* CONFIG_MEMORY_HOTREMOVE */
6123+
6124+#ifdef CONFIG_SYSFS
6125+/*
6126+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
6127+ */
6128+
6129+#define UKSM_ATTR_RO(_name) \
6130+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6131+#define UKSM_ATTR(_name) \
6132+ static struct kobj_attribute _name##_attr = \
6133+ __ATTR(_name, 0644, _name##_show, _name##_store)
6134+
6135+static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6136+ struct kobj_attribute *attr, char *buf)
6137+{
6138+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6139+}
6140+
6141+static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6142+ struct kobj_attribute *attr,
6143+ const char *buf, size_t count)
6144+{
6145+ unsigned long max_cpu_percentage;
6146+ int err;
6147+
6148+ err = kstrtoul(buf, 10, &max_cpu_percentage);
6149+ if (err || max_cpu_percentage > 100)
6150+ return -EINVAL;
6151+
6152+ if (max_cpu_percentage == 100)
6153+ max_cpu_percentage = 99;
6154+ else if (max_cpu_percentage < 10)
6155+ max_cpu_percentage = 10;
6156+
6157+ uksm_max_cpu_percentage = max_cpu_percentage;
6158+
6159+ return count;
6160+}
6161+UKSM_ATTR(max_cpu_percentage);
6162+
6163+static ssize_t sleep_millisecs_show(struct kobject *kobj,
6164+ struct kobj_attribute *attr, char *buf)
6165+{
6166+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6167+}
6168+
6169+static ssize_t sleep_millisecs_store(struct kobject *kobj,
6170+ struct kobj_attribute *attr,
6171+ const char *buf, size_t count)
6172+{
6173+ unsigned long msecs;
6174+ int err;
6175+
6176+ err = kstrtoul(buf, 10, &msecs);
6177+ if (err || msecs > MSEC_PER_SEC)
6178+ return -EINVAL;
6179+
6180+ uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6181+ uksm_sleep_saved = uksm_sleep_jiffies;
6182+
6183+ return count;
6184+}
6185+UKSM_ATTR(sleep_millisecs);
6186+
6187+
6188+static ssize_t cpu_governor_show(struct kobject *kobj,
6189+ struct kobj_attribute *attr, char *buf)
6190+{
6191+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6192+ int i;
6193+
6194+ buf[0] = '\0';
6195+ for (i = 0; i < n ; i++) {
6196+ if (uksm_cpu_governor == i)
6197+ strcat(buf, "[");
6198+
6199+ strcat(buf, uksm_cpu_governor_str[i]);
6200+
6201+ if (uksm_cpu_governor == i)
6202+ strcat(buf, "]");
6203+
6204+ strcat(buf, " ");
6205+ }
6206+ strcat(buf, "\n");
6207+
6208+ return strlen(buf);
6209+}
6210+
6211+static inline void init_performance_values(void)
6212+{
6213+ int i;
6214+ struct scan_rung *rung;
6215+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6216+
6217+
6218+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6219+ rung = uksm_scan_ladder + i;
6220+ rung->cpu_ratio = preset->cpu_ratio[i];
6221+ rung->cover_msecs = preset->cover_msecs[i];
6222+ }
6223+
6224+ uksm_max_cpu_percentage = preset->max_cpu;
6225+}
6226+
6227+static ssize_t cpu_governor_store(struct kobject *kobj,
6228+ struct kobj_attribute *attr,
6229+ const char *buf, size_t count)
6230+{
6231+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6232+
6233+ for (n--; n >= 0 ; n--) {
6234+ if (!strncmp(buf, uksm_cpu_governor_str[n],
6235+ strlen(uksm_cpu_governor_str[n])))
6236+ break;
6237+ }
6238+
6239+ if (n < 0)
6240+ return -EINVAL;
6241+ else
6242+ uksm_cpu_governor = n;
6243+
6244+ init_performance_values();
6245+
6246+ return count;
6247+}
6248+UKSM_ATTR(cpu_governor);
6249+
6250+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6251+ char *buf)
6252+{
6253+ return sprintf(buf, "%u\n", uksm_run);
6254+}
6255+
6256+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6257+ const char *buf, size_t count)
6258+{
6259+ int err;
6260+ unsigned long flags;
6261+
6262+ err = kstrtoul(buf, 10, &flags);
6263+ if (err || flags > UINT_MAX)
6264+ return -EINVAL;
6265+ if (flags > UKSM_RUN_MERGE)
6266+ return -EINVAL;
6267+
6268+ mutex_lock(&uksm_thread_mutex);
6269+ if (uksm_run != flags)
6270+ uksm_run = flags;
6271+ mutex_unlock(&uksm_thread_mutex);
6272+
6273+ if (flags & UKSM_RUN_MERGE)
6274+ wake_up_interruptible(&uksm_thread_wait);
6275+
6276+ return count;
6277+}
6278+UKSM_ATTR(run);
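The attribute group these knobs belong to is registered under mm_kobj with the name "uksm" (see uksm_init() below), so they are expected to appear as /sys/kernel/mm/uksm/*. A minimal userspace sketch of starting the scanner through the run attribute, assuming that path and assuming the value 1 corresponds to UKSM_RUN_MERGE:

/* Illustrative only -- toggle uksmd scanning via the run knob. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* path assumed from the "uksm" attribute group registered on mm_kobj */
	FILE *f = fopen("/sys/kernel/mm/uksm/run", "w");

	if (!f) {
		perror("open /sys/kernel/mm/uksm/run");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);	/* assumed: 1 == UKSM_RUN_MERGE; writing 0 stops scanning */
	fclose(f);
	return 0;
}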
6279+
6280+static ssize_t abundant_threshold_show(struct kobject *kobj,
6281+ struct kobj_attribute *attr, char *buf)
6282+{
6283+ return sprintf(buf, "%u\n", uksm_abundant_threshold);
6284+}
6285+
6286+static ssize_t abundant_threshold_store(struct kobject *kobj,
6287+ struct kobj_attribute *attr,
6288+ const char *buf, size_t count)
6289+{
6290+ int err;
6291+ unsigned long flags;
6292+
6293+ err = kstrtoul(buf, 10, &flags);
6294+ if (err || flags > 99)
6295+ return -EINVAL;
6296+
6297+ uksm_abundant_threshold = flags;
6298+
6299+ return count;
6300+}
6301+UKSM_ATTR(abundant_threshold);
6302+
6303+static ssize_t thrash_threshold_show(struct kobject *kobj,
6304+ struct kobj_attribute *attr, char *buf)
6305+{
6306+ return sprintf(buf, "%u\n", uksm_thrash_threshold);
6307+}
6308+
6309+static ssize_t thrash_threshold_store(struct kobject *kobj,
6310+ struct kobj_attribute *attr,
6311+ const char *buf, size_t count)
6312+{
6313+ int err;
6314+ unsigned long flags;
6315+
6316+ err = kstrtoul(buf, 10, &flags);
6317+ if (err || flags > 99)
6318+ return -EINVAL;
6319+
6320+ uksm_thrash_threshold = flags;
6321+
6322+ return count;
6323+}
6324+UKSM_ATTR(thrash_threshold);
6325+
6326+static ssize_t cpu_ratios_show(struct kobject *kobj,
6327+ struct kobj_attribute *attr, char *buf)
6328+{
6329+ int i, size;
6330+ struct scan_rung *rung;
6331+ char *p = buf;
6332+
6333+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6334+ rung = &uksm_scan_ladder[i];
6335+
6336+ if (rung->cpu_ratio > 0)
6337+ size = sprintf(p, "%d ", rung->cpu_ratio);
6338+ else
6339+ size = sprintf(p, "MAX/%d ",
6340+ TIME_RATIO_SCALE / -rung->cpu_ratio);
6341+
6342+ p += size;
6343+ }
6344+
6345+ *p++ = '\n';
6346+ *p = '\0';
6347+
6348+ return p - buf;
6349+}
6350+
6351+static ssize_t cpu_ratios_store(struct kobject *kobj,
6352+ struct kobj_attribute *attr,
6353+ const char *buf, size_t count)
6354+{
6355+ int i, cpuratios[SCAN_LADDER_SIZE], err;
6356+ unsigned long value;
6357+ struct scan_rung *rung;
6358+ char *p, *end = NULL;
6359+
6360+ p = kzalloc(count, GFP_KERNEL);
6361+ if (!p)
6362+ return -ENOMEM;
6363+
6364+ memcpy(p, buf, count);
6365+
6366+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6367+ if (i != SCAN_LADDER_SIZE - 1) {
6368+ end = strchr(p, ' ');
6369+ if (!end)
6370+ return -EINVAL;
6371+
6372+ *end = '\0';
6373+ }
6374+
6375+ if (strstr(p, "MAX/")) {
6376+ p = strchr(p, '/') + 1;
6377+ err = kstrtoul(p, 10, &value);
6378+ if (err || value > TIME_RATIO_SCALE || !value)
6379+ return -EINVAL;
6380+
6381+ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
6382+ } else {
6383+ err = kstrtoul(p, 10, &value);
6384+ if (err || value > TIME_RATIO_SCALE || !value)
6385+ return -EINVAL;
6386+
6387+ cpuratios[i] = value;
6388+ }
6389+
6390+ p = end + 1;
6391+ }
6392+
6393+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6394+ rung = &uksm_scan_ladder[i];
6395+
6396+ rung->cpu_ratio = cpuratios[i];
6397+ }
6398+
6399+ return count;
6400+}
6401+UKSM_ATTR(cpu_ratios);
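cpu_ratios_store() accepts one space-separated token per scan rung: either a plain ratio or "MAX/<n>", which is stored as -(TIME_RATIO_SCALE / n). A standalone sketch of that token conversion follows; TIME_RATIO_SCALE is defined elsewhere in this patch, so the value used here is only a placeholder.

/* Illustrative only -- mirrors the "MAX/<n>" vs plain-ratio conversion above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TIME_RATIO_SCALE 1000	/* placeholder; the real constant is defined in uksm.c */

static int parse_ratio_token(const char *tok)
{
	if (!strncmp(tok, "MAX/", 4)) {
		unsigned long div = strtoul(tok + 4, NULL, 10);

		if (!div)
			div = 1;	/* the kernel handler rejects 0 outright */
		/* a negative value encodes "a fraction of the maximum scan speed" */
		return -(int)(TIME_RATIO_SCALE / div);
	}
	return (int)strtoul(tok, NULL, 10);
}

int main(void)
{
	printf("%d %d\n", parse_ratio_token("20"), parse_ratio_token("MAX/4"));
	return 0;
}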
6402+
6403+static ssize_t eval_intervals_show(struct kobject *kobj,
6404+ struct kobj_attribute *attr, char *buf)
6405+{
6406+ int i, size;
6407+ struct scan_rung *rung;
6408+ char *p = buf;
6409+
6410+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6411+ rung = &uksm_scan_ladder[i];
6412+ size = sprintf(p, "%u ", rung->cover_msecs);
6413+ p += size;
6414+ }
6415+
6416+ *p++ = '\n';
6417+ *p = '\0';
6418+
6419+ return p - buf;
6420+}
6421+
6422+static ssize_t eval_intervals_store(struct kobject *kobj,
6423+ struct kobj_attribute *attr,
6424+ const char *buf, size_t count)
6425+{
6426+ int i, err;
6427+ unsigned long values[SCAN_LADDER_SIZE];
6428+ struct scan_rung *rung;
6429+ char *p, *end = NULL;
6430+ ssize_t ret = count;
6431+
6432+ p = kzalloc(count + 2, GFP_KERNEL);
6433+ if (!p)
6434+ return -ENOMEM;
6435+
6436+ memcpy(p, buf, count);
6437+
6438+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6439+ if (i != SCAN_LADDER_SIZE - 1) {
6440+ end = strchr(p, ' ');
6441+ if (!end) {
6442+ ret = -EINVAL;
6443+ goto out;
6444+ }
6445+
6446+ *end = '\0';
6447+ }
6448+
6449+ err = kstrtoul(p, 10, &values[i]);
6450+ if (err) {
6451+ ret = -EINVAL;
6452+ goto out;
6453+ }
6454+
6455+ p = end + 1;
6456+ }
6457+
6458+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6459+ rung = &uksm_scan_ladder[i];
6460+
6461+ rung->cover_msecs = values[i];
6462+ }
6463+
6464+out:
6465+ kfree(p);
6466+ return ret;
6467+}
6468+UKSM_ATTR(eval_intervals);
6469+
6470+static ssize_t ema_per_page_time_show(struct kobject *kobj,
6471+ struct kobj_attribute *attr, char *buf)
6472+{
6473+ return sprintf(buf, "%lu\n", uksm_ema_page_time);
6474+}
6475+UKSM_ATTR_RO(ema_per_page_time);
6476+
6477+static ssize_t pages_shared_show(struct kobject *kobj,
6478+ struct kobj_attribute *attr, char *buf)
6479+{
6480+ return sprintf(buf, "%lu\n", uksm_pages_shared);
6481+}
6482+UKSM_ATTR_RO(pages_shared);
6483+
6484+static ssize_t pages_sharing_show(struct kobject *kobj,
6485+ struct kobj_attribute *attr, char *buf)
6486+{
6487+ return sprintf(buf, "%lu\n", uksm_pages_sharing);
6488+}
6489+UKSM_ATTR_RO(pages_sharing);
6490+
6491+static ssize_t pages_unshared_show(struct kobject *kobj,
6492+ struct kobj_attribute *attr, char *buf)
6493+{
6494+ return sprintf(buf, "%lu\n", uksm_pages_unshared);
6495+}
6496+UKSM_ATTR_RO(pages_unshared);
6497+
6498+static ssize_t full_scans_show(struct kobject *kobj,
6499+ struct kobj_attribute *attr, char *buf)
6500+{
6501+ return sprintf(buf, "%llu\n", fully_scanned_round);
6502+}
6503+UKSM_ATTR_RO(full_scans);
6504+
6505+static ssize_t pages_scanned_show(struct kobject *kobj,
6506+ struct kobj_attribute *attr, char *buf)
6507+{
6508+ unsigned long base = 0;
6509+ u64 delta, ret;
6510+
6511+ if (pages_scanned_stored) {
6512+ base = pages_scanned_base;
6513+ ret = pages_scanned_stored;
6514+ delta = uksm_pages_scanned >> base;
6515+ if (CAN_OVERFLOW_U64(ret, delta)) {
6516+ ret >>= 1;
6517+ delta >>= 1;
6518+ base++;
6519+ ret += delta;
6520+ }
6521+ } else {
6522+ ret = uksm_pages_scanned;
6523+ }
6524+
6525+ while (ret > ULONG_MAX) {
6526+ ret >>= 1;
6527+ base++;
6528+ }
6529+
6530+ if (base)
6531+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6532+ else
6533+ return sprintf(buf, "%lu\n", (unsigned long)ret);
6534+}
6535+UKSM_ATTR_RO(pages_scanned);
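Once the 64-bit total no longer fits in an unsigned long, pages_scanned is reported as "<value> * 2^<base>" rather than a plain count. A short sketch of recovering the approximate total from such a reading (the numbers below are examples, not real sysfs output):

/* Illustrative only -- reconstruct the approximate total from "N * 2^B". */
#include <stdio.h>

int main(void)
{
	unsigned long long value = 3000000000ULL;	/* example "N" read from sysfs */
	unsigned long base = 3;				/* example "B" read from sysfs */
	unsigned long long approx_total = value << base;

	printf("approximately %llu pages scanned\n", approx_total);
	return 0;
}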
6536+
6537+static ssize_t hash_strength_show(struct kobject *kobj,
6538+ struct kobj_attribute *attr, char *buf)
6539+{
6540+ return sprintf(buf, "%lu\n", hash_strength);
6541+}
6542+UKSM_ATTR_RO(hash_strength);
6543+
6544+static ssize_t sleep_times_show(struct kobject *kobj,
6545+ struct kobj_attribute *attr, char *buf)
6546+{
6547+ return sprintf(buf, "%llu\n", uksm_sleep_times);
6548+}
6549+UKSM_ATTR_RO(sleep_times);
6550+
6551+
6552+static struct attribute *uksm_attrs[] = {
6553+ &max_cpu_percentage_attr.attr,
6554+ &sleep_millisecs_attr.attr,
6555+ &cpu_governor_attr.attr,
6556+ &run_attr.attr,
6557+ &ema_per_page_time_attr.attr,
6558+ &pages_shared_attr.attr,
6559+ &pages_sharing_attr.attr,
6560+ &pages_unshared_attr.attr,
6561+ &full_scans_attr.attr,
6562+ &pages_scanned_attr.attr,
6563+ &hash_strength_attr.attr,
6564+ &sleep_times_attr.attr,
6565+ &thrash_threshold_attr.attr,
6566+ &abundant_threshold_attr.attr,
6567+ &cpu_ratios_attr.attr,
6568+ &eval_intervals_attr.attr,
6569+ NULL,
6570+};
6571+
6572+static struct attribute_group uksm_attr_group = {
6573+ .attrs = uksm_attrs,
6574+ .name = "uksm",
6575+};
6576+#endif /* CONFIG_SYSFS */
6577+
6578+static inline void init_scan_ladder(void)
6579+{
6580+ int i;
6581+ struct scan_rung *rung;
6582+
6583+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6584+ rung = uksm_scan_ladder + i;
6585+ slot_tree_init_root(&rung->vma_root);
6586+ }
6587+
6588+ init_performance_values();
6589+ uksm_calc_scan_pages();
6590+}
6591+
6592+static inline int cal_positive_negative_costs(void)
6593+{
6594+ struct page *p1, *p2;
6595+ unsigned char *addr1, *addr2;
6596+ unsigned long i, time_start, hash_cost;
6597+ unsigned long loopnum = 0;
6598+
6599+	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6600+ volatile u32 hash;
6601+ volatile int ret;
6602+
6603+ p1 = alloc_page(GFP_KERNEL);
6604+ if (!p1)
6605+ return -ENOMEM;
6606+
6607+ p2 = alloc_page(GFP_KERNEL);
6608+ if (!p2)
6609+ return -ENOMEM;
6610+
6611+ addr1 = kmap_atomic(p1);
6612+ addr2 = kmap_atomic(p2);
6613+ memset(addr1, prandom_u32(), PAGE_SIZE);
6614+ memcpy(addr2, addr1, PAGE_SIZE);
6615+
6616+	/* make sure that the two pages differ in the last byte */
6617+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6618+ kunmap_atomic(addr2);
6619+ kunmap_atomic(addr1);
6620+
6621+ time_start = jiffies;
6622+ while (jiffies - time_start < 100) {
6623+ for (i = 0; i < 100; i++)
6624+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6625+ loopnum += 100;
6626+ }
6627+ hash_cost = (jiffies - time_start);
6628+
6629+ time_start = jiffies;
6630+ for (i = 0; i < loopnum; i++)
6631+ ret = pages_identical_with_cost(p1, p2);
6632+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6633+ memcmp_cost /= hash_cost;
6634+ pr_info("UKSM: relative memcmp_cost = %lu "
6635+ "hash=%u cmp_ret=%d.\n",
6636+ memcmp_cost, hash, ret);
6637+
6638+ __free_page(p1);
6639+ __free_page(p2);
6640+ return 0;
6641+}
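The calibration above times page_hash() at full strength against a byte-wise comparison of two nearly identical pages, and expresses the comparison cost in hash-strength units: memcmp_cost = HASH_STRENGTH_FULL * memcmp_time / hash_time. A worked example with made-up timings and a placeholder constant (the real HASH_STRENGTH_FULL is defined elsewhere in this patch):

/* Illustrative only -- the relative-cost arithmetic with invented timings. */
#include <stdio.h>

#define HASH_STRENGTH_FULL_EXAMPLE 1024	/* placeholder for the real constant in uksm.c */

int main(void)
{
	unsigned long hash_jiffies = 100;	/* time spent hashing the calibration loops */
	unsigned long memcmp_jiffies = 25;	/* time spent comparing the same loops */
	unsigned long memcmp_cost;

	memcmp_cost = HASH_STRENGTH_FULL_EXAMPLE * memcmp_jiffies / hash_jiffies;
	printf("relative memcmp_cost = %lu\n", memcmp_cost);
	return 0;
}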
6642+
6643+static int init_zeropage_hash_table(void)
6644+{
6645+ struct page *page;
6646+ char *addr;
6647+ int i;
6648+
6649+ page = alloc_page(GFP_KERNEL);
6650+ if (!page)
6651+ return -ENOMEM;
6652+
6653+ addr = kmap_atomic(page);
6654+ memset(addr, 0, PAGE_SIZE);
6655+ kunmap_atomic(addr);
6656+
6657+ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
6658+ GFP_KERNEL);
6659+ if (!zero_hash_table)
6660+ return -ENOMEM;
6661+
6662+ for (i = 0; i < HASH_STRENGTH_MAX; i++)
6663+ zero_hash_table[i] = page_hash(page, i, 0);
6664+
6665+ __free_page(page);
6666+
6667+ return 0;
6668+}
6669+
6670+static inline int init_random_sampling(void)
6671+{
6672+ unsigned long i;
6673+
6674+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6675+ if (!random_nums)
6676+ return -ENOMEM;
6677+
6678+ for (i = 0; i < HASH_STRENGTH_FULL; i++)
6679+ random_nums[i] = i;
6680+
6681+ for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6682+ unsigned long rand_range, swap_index, tmp;
6683+
6684+ rand_range = HASH_STRENGTH_FULL - i;
6685+ swap_index = i + prandom_u32() % rand_range;
6686+ tmp = random_nums[i];
6687+ random_nums[i] = random_nums[swap_index];
6688+ random_nums[swap_index] = tmp;
6689+ }
6690+
6691+ rshash_state.state = RSHASH_NEW;
6692+ rshash_state.below_count = 0;
6693+ rshash_state.lookup_window_index = 0;
6694+
6695+ return cal_positive_negative_costs();
6696+}
6697+
6698+static int __init uksm_slab_init(void)
6699+{
6700+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6701+ if (!rmap_item_cache)
6702+ goto out;
6703+
6704+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6705+ if (!stable_node_cache)
6706+ goto out_free1;
6707+
6708+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6709+ if (!node_vma_cache)
6710+ goto out_free2;
6711+
6712+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6713+ if (!vma_slot_cache)
6714+ goto out_free3;
6715+
6716+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6717+ if (!tree_node_cache)
6718+ goto out_free4;
6719+
6720+ return 0;
6721+
6722+out_free4:
6723+ kmem_cache_destroy(vma_slot_cache);
6724+out_free3:
6725+ kmem_cache_destroy(node_vma_cache);
6726+out_free2:
6727+ kmem_cache_destroy(stable_node_cache);
6728+out_free1:
6729+ kmem_cache_destroy(rmap_item_cache);
6730+out:
6731+ return -ENOMEM;
6732+}
6733+
6734+static void __init uksm_slab_free(void)
6735+{
6736+ kmem_cache_destroy(stable_node_cache);
6737+ kmem_cache_destroy(rmap_item_cache);
6738+ kmem_cache_destroy(node_vma_cache);
6739+ kmem_cache_destroy(vma_slot_cache);
6740+ kmem_cache_destroy(tree_node_cache);
6741+}
6742+
6743+/* Common interface to ksm, but the behavior differs. */
6744+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6745+ unsigned long end, int advice, unsigned long *vm_flags)
6746+{
6747+ int err;
6748+
6749+ switch (advice) {
6750+ case MADV_MERGEABLE:
6751+ return 0; /* just ignore the advice */
6752+
6753+ case MADV_UNMERGEABLE:
6754+ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
6755+ return 0; /* just ignore the advice */
6756+
6757+ if (vma->anon_vma) {
6758+ err = unmerge_uksm_pages(vma, start, end);
6759+ if (err)
6760+ return err;
6761+ }
6762+
6763+ uksm_remove_vma(vma);
6764+ *vm_flags &= ~VM_MERGEABLE;
6765+ break;
6766+ }
6767+
6768+ return 0;
6769+}
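ksm_madvise() above accepts MADV_MERGEABLE but ignores it, so the only advice that changes behavior here is MADV_UNMERGEABLE, which unmerges the range and drops the VMA from scanning. A minimal userspace sketch of opting a mapping out:

/* Illustrative only -- opt an anonymous mapping out of merging. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	memset(p, 0x5a, len);	/* duplicated content that could otherwise be merged */

	if (madvise(p, len, MADV_UNMERGEABLE))
		perror("madvise(MADV_UNMERGEABLE)");

	munmap(p, len);
	return 0;
}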
6770+
6771+/* Common interface to ksm; the implementation is the same. */
6772+struct page *ksm_might_need_to_copy(struct page *page,
6773+ struct vm_area_struct *vma, unsigned long address)
6774+{
6775+ struct anon_vma *anon_vma = page_anon_vma(page);
6776+ struct page *new_page;
6777+
6778+ if (PageKsm(page)) {
6779+ if (page_stable_node(page))
6780+ return page; /* no need to copy it */
6781+ } else if (!anon_vma) {
6782+ return page; /* no need to copy it */
6783+ } else if (anon_vma->root == vma->anon_vma->root &&
6784+ page->index == linear_page_index(vma, address)) {
6785+ return page; /* still no need to copy it */
6786+ }
6787+ if (!PageUptodate(page))
6788+ return page; /* let do_swap_page report the error */
6789+
6790+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6791+ if (new_page) {
6792+ copy_user_highpage(new_page, page, address, vma);
6793+
6794+ SetPageDirty(new_page);
6795+ __SetPageUptodate(new_page);
6796+ __SetPageLocked(new_page);
6797+ }
6798+
6799+ return new_page;
6800+}
6801+
6802+/* Copied from mm/ksm.c; required since kernel 5.1 */
6803+bool reuse_ksm_page(struct page *page,
6804+ struct vm_area_struct *vma,
6805+ unsigned long address)
6806+{
6807+#ifdef CONFIG_DEBUG_VM
6808+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
6809+ WARN_ON(!page_mapped(page)) ||
6810+ WARN_ON(!PageLocked(page))) {
6811+ dump_page(page, "reuse_ksm_page");
6812+ return false;
6813+ }
6814+#endif
6815+
6816+ if (PageSwapCache(page) || !page_stable_node(page))
6817+ return false;
6818+ /* Prohibit parallel get_ksm_page() */
6819+ if (!page_ref_freeze(page, 1))
6820+ return false;
6821+
6822+ page_move_anon_rmap(page, vma);
6823+ page->index = linear_page_index(vma, address);
6824+ page_ref_unfreeze(page, 1);
6825+
6826+ return true;
6827+}
6828+
6829+static int __init uksm_init(void)
6830+{
6831+ struct task_struct *uksm_thread;
6832+ int err;
6833+
6834+ uksm_sleep_jiffies = msecs_to_jiffies(100);
6835+ uksm_sleep_saved = uksm_sleep_jiffies;
6836+
6837+ slot_tree_init();
6838+ init_scan_ladder();
6839+
6840+
6841+ err = init_random_sampling();
6842+ if (err)
6843+ goto out_free2;
6844+
6845+ err = uksm_slab_init();
6846+ if (err)
6847+ goto out_free1;
6848+
6849+ err = init_zeropage_hash_table();
6850+ if (err)
6851+ goto out_free0;
6852+
6853+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6854+ if (IS_ERR(uksm_thread)) {
6855+ pr_err("uksm: creating kthread failed\n");
6856+ err = PTR_ERR(uksm_thread);
6857+ goto out_free;
6858+ }
6859+
6860+#ifdef CONFIG_SYSFS
6861+ err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6862+ if (err) {
6863+ pr_err("uksm: register sysfs failed\n");
6864+ kthread_stop(uksm_thread);
6865+ goto out_free;
6866+ }
6867+#else
6868+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
6869+
6870+#endif /* CONFIG_SYSFS */
6871+
6872+#ifdef CONFIG_MEMORY_HOTREMOVE
6873+ /*
6874+ * Choose a high priority since the callback takes uksm_thread_mutex:
6875+ * later callbacks could only be taking locks which nest within that.
6876+ */
6877+ hotplug_memory_notifier(uksm_memory_callback, 100);
6878+#endif
6879+ return 0;
6880+
6881+out_free:
6882+ kfree(zero_hash_table);
6883+out_free0:
6884+ uksm_slab_free();
6885+out_free1:
6886+ kfree(random_nums);
6887+out_free2:
6888+ kfree(uksm_scan_ladder);
6889+ return err;
6890+}
6891+
6892+#ifdef MODULE
6893+subsys_initcall(uksm_init);
6894+#else
6895+late_initcall(uksm_init);
6896+#endif
6897+
6898diff -Naur a/mm/vmstat.c b/mm/vmstat.c
6899--- a/mm/vmstat.c 2020-06-14 21:45:04.000000000 +0200
6900+++ b/mm/vmstat.c 2020-06-15 10:34:30.892359059 +0200
6901@@ -1173,6 +1173,9 @@
6902 "nr_foll_pin_acquired",
6903 "nr_foll_pin_released",
6904
6905+#ifdef CONFIG_UKSM
6906+ "nr_uksm_zero_pages",
6907+#endif
6908 /* enum writeback_stat_item counters */
6909 "nr_dirty_threshold",
6910 "nr_dirty_background_threshold",