1diff -Naur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
2--- a/Documentation/vm/uksm.txt 1970-01-01 01:00:00.000000000 +0100
3+++ b/Documentation/vm/uksm.txt 2020-06-15 10:34:30.890359050 +0200
4@@ -0,0 +1,61 @@
5+The Ultra Kernel Samepage Merging feature
6+----------------------------------------------
7+/*
8+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
9+ *
10+ * This is an improvement upon KSM. Some basic data structures and routines
11+ * are borrowed from ksm.c .
12+ *
13+ * Its new features:
14+ * 1. Full system scan:
15+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
16+ * interaction to submit a memory area to KSM is no longer needed.
17+ *
18+ * 2. Rich area detection:
19+ * It automatically detects rich areas containing abundant duplicated
20+ * pages. Rich areas are given full scan speed. Poor areas are
21+ * sampled at a reasonable speed with very low CPU consumption.
22+ *
23+ * 3. Ultra Per-page scan speed improvement:
24+ * A new hash algorithm is proposed. As a result, on a machine with
25+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
26+ * can scan memory areas that do not contain duplicated pages at a speed of
27+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
28+ * 477MB/sec ~ 923MB/sec.
29+ *
30+ * 4. Thrashing area avoidance:
31+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be
32+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
33+ * hash-value-based volatile page detection.
34+ *
35+ *
36+ * 5. Misc changes upon KSM:
37+ * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
38+ * comparison. It's much faster than the default C version on x86.
39+ * * rmap_item now has a struct page * member to loosely cache an
40+ * address-->page mapping, which avoids many time-costly calls to
41+ * follow_page().
42+ * * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
43+ * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
44+ * ksm is needed in this case.
45+ *
46+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
47+ * Now uksmd considers full zero pages as special pages and merges them into a
48+ * special unswappable uksm zero page.
49+ */
50+
51+ChangeLog:
52+
53+2012-05-05 The creation of this Doc
54+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
55+2012-05-28 UKSM 0.1.1.2 bug fix release
56+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
57+2012-07-02 UKSM 0.1.2-beta2
58+2012-07-10 UKSM 0.1.2-beta3
59+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
60+2012-10-13 UKSM 0.1.2.1 Bug fixes.
61+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
62+2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug.
63+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
64+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
65+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
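
A minimal sketch of the sampled-hash idea behind feature 3 above: hash only a random sample of a page's 32-bit words (UKSM calls the sample size the "hash strength"), so scan cost scales with the sample rather than with PAGE_SIZE. This is an illustration only, not the UKSM routine; the seed, the mixing steps and the names sample_hash/PAGE_WORDS are assumptions of this example.

/* Hedged sketch (userspace, not kernel code): hash a random sample of words. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_WORDS (4096 / sizeof(uint32_t))    /* assumes 4 KiB pages */

static uint32_t sample_hash(const uint32_t *page, const size_t *offsets,
                            size_t strength)
{
        uint32_t h = 0x9747b28c;                /* arbitrary seed */

        for (size_t i = 0; i < strength; i++) { /* hash only the sampled words */
                h ^= page[offsets[i]];
                h *= 0x85ebca6b;                /* simple multiplicative mix */
                h ^= h >> 13;
        }
        return h;
}

int main(void)
{
        static uint32_t page[PAGE_WORDS];       /* an all-zero "page" */
        size_t offsets[64];

        for (size_t i = 0; i < 64; i++)         /* random word offsets to sample */
                offsets[i] = (size_t)rand() % PAGE_WORDS;

        printf("sampled hash: %08" PRIx32 "\n", sample_hash(page, offsets, 64));
        return 0;
}

Raising or lowering the sample size trades hash collisions against per-page scan cost, which is what the hash-strength heuristics later in mm/uksm.c adjust at run time.
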
66diff -Naur a/fs/exec.c b/fs/exec.c
67--- a/fs/exec.c 2020-06-14 21:45:04.000000000 +0200
68+++ b/fs/exec.c 2020-06-15 10:34:30.890359050 +0200
69@@ -62,6 +62,7 @@
70 #include <linux/oom.h>
71 #include <linux/compat.h>
72 #include <linux/vmalloc.h>
73+#include <linux/ksm.h>
74
75 #include <linux/uaccess.h>
76 #include <asm/mmu_context.h>
77diff -Naur a/fs/proc/meminfo.c b/fs/proc/meminfo.c
78--- a/fs/proc/meminfo.c 2020-06-14 21:45:04.000000000 +0200
79+++ b/fs/proc/meminfo.c 2020-06-15 10:34:30.890359050 +0200
80@@ -108,7 +108,10 @@
81 #endif
82 show_val_kb(m, "PageTables: ",
83 global_zone_page_state(NR_PAGETABLE));
84-
85+#ifdef CONFIG_UKSM
86+ show_val_kb(m, "KsmZeroPages: ",
87+ global_zone_page_state(NR_UKSM_ZERO_PAGES));
88+#endif
89 show_val_kb(m, "NFS_Unstable: ", 0);
90 show_val_kb(m, "Bounce: ",
91 global_zone_page_state(NR_BOUNCE));
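
With CONFIG_UKSM enabled, the hunk above adds a KsmZeroPages line to /proc/meminfo. A minimal userspace sketch for reading it; the program below is illustrative and not part of the patch.

/* Hedged sketch: print the KsmZeroPages value exported by this patch. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f) {
                perror("/proc/meminfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "KsmZeroPages:", 13)) {
                        fputs(line, stdout);    /* value is printed in kB */
                        break;
                }
        }
        fclose(f);
        return 0;
}
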
92diff -Naur a/include/linux/ksm.h b/include/linux/ksm.h
93--- a/include/linux/ksm.h 2020-06-14 21:45:04.000000000 +0200
94+++ b/include/linux/ksm.h 2020-06-15 10:34:30.890359050 +0200
95@@ -21,20 +21,16 @@
96 #ifdef CONFIG_KSM
97 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
98 unsigned long end, int advice, unsigned long *vm_flags);
99-int __ksm_enter(struct mm_struct *mm);
100-void __ksm_exit(struct mm_struct *mm);
101
102-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
103+static inline struct stable_node *page_stable_node(struct page *page)
104 {
105- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
106- return __ksm_enter(mm);
107- return 0;
108+ return PageKsm(page) ? page_rmapping(page) : NULL;
109 }
110
111-static inline void ksm_exit(struct mm_struct *mm)
112+static inline void set_page_stable_node(struct page *page,
113+ struct stable_node *stable_node)
114 {
115- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
116- __ksm_exit(mm);
117+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
118 }
119
120 /*
121@@ -56,6 +52,33 @@
122 bool reuse_ksm_page(struct page *page,
123 struct vm_area_struct *vma, unsigned long address);
124
125+#ifdef CONFIG_KSM_LEGACY
126+int __ksm_enter(struct mm_struct *mm);
127+void __ksm_exit(struct mm_struct *mm);
128+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
129+{
130+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
131+ return __ksm_enter(mm);
132+ return 0;
133+}
134+
135+static inline void ksm_exit(struct mm_struct *mm)
136+{
137+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
138+ __ksm_exit(mm);
139+}
140+
141+#elif defined(CONFIG_UKSM)
142+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
143+{
144+ return 0;
145+}
146+
147+static inline void ksm_exit(struct mm_struct *mm)
148+{
149+}
150+#endif /* !CONFIG_UKSM */
151+
152 #else /* !CONFIG_KSM */
153
154 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
155@@ -96,4 +119,6 @@
156 #endif /* CONFIG_MMU */
157 #endif /* !CONFIG_KSM */
158
159+#include <linux/uksm.h>
160+
161 #endif /* __LINUX_KSM_H */
162diff -Naur a/include/linux/mm_types.h b/include/linux/mm_types.h
163--- a/include/linux/mm_types.h 2020-06-14 21:45:04.000000000 +0200
164+++ b/include/linux/mm_types.h 2020-06-15 10:34:30.890359050 +0200
165@@ -367,6 +367,9 @@
166 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
167 #endif
168 struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
169+#ifdef CONFIG_UKSM
170+ struct vma_slot *uksm_vma_slot;
171+#endif
172 } __randomize_layout;
173
174 struct core_thread {
175diff -Naur a/include/linux/mmzone.h b/include/linux/mmzone.h
176--- a/include/linux/mmzone.h 2020-06-14 21:45:04.000000000 +0200
177+++ b/include/linux/mmzone.h 2020-06-15 10:34:30.890359050 +0200
178@@ -165,6 +165,9 @@
179 NR_ZSPAGES, /* allocated in zsmalloc */
180 #endif
181 NR_FREE_CMA_PAGES,
182+#ifdef CONFIG_UKSM
183+ NR_UKSM_ZERO_PAGES,
184+#endif
185 NR_VM_ZONE_STAT_ITEMS };
186
187 enum node_stat_item {
188diff -Naur a/include/linux/pgtable.h b/include/linux/pgtable.h
189--- a/include/linux/pgtable.h 2020-06-14 21:45:04.000000000 +0200
190+++ b/include/linux/pgtable.h 2020-06-15 10:34:30.890359050 +0200
191@@ -1020,12 +1020,25 @@
192 extern void untrack_pfn_moved(struct vm_area_struct *vma);
193 #endif
194
195+#ifdef CONFIG_UKSM
196+static inline int is_uksm_zero_pfn(unsigned long pfn)
197+{
198+ extern unsigned long uksm_zero_pfn;
199+ return pfn == uksm_zero_pfn;
200+}
201+#else
202+static inline int is_uksm_zero_pfn(unsigned long pfn)
203+{
204+ return 0;
205+}
206+#endif
207+
208 #ifdef __HAVE_COLOR_ZERO_PAGE
209 static inline int is_zero_pfn(unsigned long pfn)
210 {
211 extern unsigned long zero_pfn;
212 unsigned long offset_from_zero_pfn = pfn - zero_pfn;
213- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
214+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
215 }
216
217 #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
218@@ -1034,7 +1047,7 @@
219 static inline int is_zero_pfn(unsigned long pfn)
220 {
221 extern unsigned long zero_pfn;
222- return pfn == zero_pfn;
223+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
224 }
225
226 static inline unsigned long my_zero_pfn(unsigned long addr)
227diff -Naur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
228--- a/include/linux/sradix-tree.h 1970-01-01 01:00:00.000000000 +0100
229+++ b/include/linux/sradix-tree.h 2020-06-15 10:34:30.890359050 +0200
230@@ -0,0 +1,77 @@
231+#ifndef _LINUX_SRADIX_TREE_H
232+#define _LINUX_SRADIX_TREE_H
233+
234+
235+#define INIT_SRADIX_TREE(root, mask) \
236+do { \
237+ (root)->height = 0; \
238+ (root)->gfp_mask = (mask); \
239+ (root)->rnode = NULL; \
240+} while (0)
241+
242+#define ULONG_BITS (sizeof(unsigned long) * 8)
243+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
244+//#define SRADIX_TREE_MAP_SHIFT 6
245+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
246+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
247+
248+struct sradix_tree_node {
249+ unsigned int height; /* Height from the bottom */
250+ unsigned int count;
251+ unsigned int fulls; /* Number of full sublevel trees */
252+ struct sradix_tree_node *parent;
253+ void *stores[0];
254+};
255+
256+/* A simple radix tree implementation */
257+struct sradix_tree_root {
258+ unsigned int height;
259+ struct sradix_tree_node *rnode;
260+
261+ /* The last node found to have available empty stores in its sublevels */
262+ struct sradix_tree_node *enter_node;
263+ unsigned int shift;
264+ unsigned int stores_size;
265+ unsigned int mask;
266+ unsigned long min; /* The first hole index */
267+ unsigned long num;
268+ //unsigned long *height_to_maxindex;
269+
270+ /* How the node is allocated and freed. */
271+ struct sradix_tree_node *(*alloc)(void);
272+ void (*free)(struct sradix_tree_node *node);
273+
274+ /* When a new node is added and removed */
275+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
276+ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
277+ void (*rm)(struct sradix_tree_node *node, unsigned int offset);
278+};
279+
280+struct sradix_tree_path {
281+ struct sradix_tree_node *node;
282+ int offset;
283+};
284+
285+static inline
286+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
287+{
288+ root->height = 0;
289+ root->rnode = NULL;
290+ root->shift = shift;
291+ root->stores_size = 1UL << shift;
292+ root->mask = root->stores_size - 1;
293+}
294+
295+
296+extern void *sradix_tree_next(struct sradix_tree_root *root,
297+ struct sradix_tree_node *node, unsigned long index,
298+ int (*iter)(void *, unsigned long));
299+
300+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
301+
302+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
303+ struct sradix_tree_node *node, unsigned long index);
304+
305+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
306+
307+#endif /* _LINUX_SRADIX_TREE_H */
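
The header above only declares the tree; the alloc/free (and optional extend/assign/rm) callbacks are supplied by the user, and each node's stores[] flexible array must be backed by the allocation, as mm/uksm.c later does with slot_tree_node. A minimal kernel-style sketch of such a user follows; MY_SHIFT, my_node and the GFP flags are assumptions of this example.

/* Hedged sketch: a minimal sradix tree user with 2^6-wide nodes. */
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/sradix-tree.h>

#define MY_SHIFT	6
#define MY_STORES	(1UL << MY_SHIFT)

struct my_node {
	struct sradix_tree_node snode;
	void *stores[MY_STORES];	/* backs snode.stores[] */
};

static struct sradix_tree_node *my_alloc(void)
{
	struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	return n ? &n->snode : NULL;
}

static void my_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct my_node, snode));
}

static struct sradix_tree_root my_root;

static int my_tree_demo(void)
{
	void *item = (void *)0xdeadbeefUL;	/* any non-NULL cookie */

	init_sradix_tree_root(&my_root, MY_SHIFT);
	my_root.alloc = my_alloc;
	my_root.free  = my_free;
	/* extend/assign/rm callbacks are optional and left NULL here */

	if (sradix_tree_enter(&my_root, &item, 1))	/* store at the first hole */
		return -ENOMEM;

	WARN_ON(sradix_tree_lookup(&my_root, 0) != item);
	return 0;
}
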
308diff -Naur a/include/linux/uksm.h b/include/linux/uksm.h
309--- a/include/linux/uksm.h 1970-01-01 01:00:00.000000000 +0100
310+++ b/include/linux/uksm.h 2020-06-15 10:34:30.890359050 +0200
311@@ -0,0 +1,149 @@
312+#ifndef __LINUX_UKSM_H
313+#define __LINUX_UKSM_H
314+/*
315+ * Memory merging support.
316+ *
317+ * This code enables dynamic sharing of identical pages found in different
318+ * memory areas, even if they are not shared by fork().
319+ */
320+
321+/* if !CONFIG_UKSM this file should not be compiled at all. */
322+#ifdef CONFIG_UKSM
323+
324+#include <linux/bitops.h>
325+#include <linux/mm.h>
326+#include <linux/pagemap.h>
327+#include <linux/rmap.h>
328+#include <linux/sched.h>
329+
330+extern unsigned long zero_pfn __read_mostly;
331+extern unsigned long uksm_zero_pfn __read_mostly;
332+extern struct page *empty_uksm_zero_page;
333+
334+/* must be called before the VMA is linked into the mm */
335+extern void uksm_vma_add_new(struct vm_area_struct *vma);
336+extern void uksm_remove_vma(struct vm_area_struct *vma);
337+
338+#define UKSM_SLOT_NEED_SORT (1 << 0)
339+#define UKSM_SLOT_NEED_RERAND (1 << 1)
340+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
341+#define UKSM_SLOT_FUL_SCANNED (1 << 3)
342+#define UKSM_SLOT_IN_UKSM (1 << 4)
343+
344+struct vma_slot {
345+ struct sradix_tree_node *snode;
346+ unsigned long sindex;
347+
348+ struct list_head slot_list;
349+ unsigned long fully_scanned_round;
350+ unsigned long dedup_num;
351+ unsigned long pages_scanned;
352+ unsigned long this_sampled;
353+ unsigned long last_scanned;
354+ unsigned long pages_to_scan;
355+ struct scan_rung *rung;
356+ struct page **rmap_list_pool;
357+ unsigned int *pool_counts;
358+ unsigned long pool_size;
359+ struct vm_area_struct *vma;
360+ struct mm_struct *mm;
361+ unsigned long ctime_j;
362+ unsigned long pages;
363+ unsigned long flags;
364+ unsigned long pages_cowed; /* pages cowed this round */
365+ unsigned long pages_merged; /* pages merged this round */
366+ unsigned long pages_bemerged;
367+
368+ /* when it has page merged in this eval round */
369+ struct list_head dedup_list;
370+};
371+
372+static inline void uksm_unmap_zero_page(pte_t pte)
373+{
374+ if (pte_pfn(pte) == uksm_zero_pfn)
375+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
376+}
377+
378+static inline void uksm_map_zero_page(pte_t pte)
379+{
380+ if (pte_pfn(pte) == uksm_zero_pfn)
381+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
382+}
383+
384+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
385+{
386+ if (vma->uksm_vma_slot && PageKsm(page))
387+ vma->uksm_vma_slot->pages_cowed++;
388+}
389+
390+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
391+{
392+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
393+ vma->uksm_vma_slot->pages_cowed++;
394+}
395+
396+static inline int uksm_flags_can_scan(unsigned long vm_flags)
397+{
398+#ifdef VM_SAO
399+ if (vm_flags & VM_SAO)
400+ return 0;
401+#endif
402+
403+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
404+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
405+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
406+}
407+
408+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
409+{
410+ if (uksm_flags_can_scan(*vm_flags_p))
411+ *vm_flags_p |= VM_MERGEABLE;
412+}
413+
414+/*
415+ * A wrapper around BUG_ON for places where the uksm zero page must not appear.
416+ * TODO: it will be removed when the uksm zero page patch is stable enough.
417+ */
418+static inline void uksm_bugon_zeropage(pte_t pte)
419+{
420+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
421+}
422+#else
423+static inline void uksm_vma_add_new(struct vm_area_struct *vma)
424+{
425+}
426+
427+static inline void uksm_remove_vma(struct vm_area_struct *vma)
428+{
429+}
430+
431+static inline void uksm_unmap_zero_page(pte_t pte)
432+{
433+}
434+
435+static inline void uksm_map_zero_page(pte_t pte)
436+{
437+}
438+
439+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
440+{
441+}
442+
443+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
444+{
445+}
446+
447+static inline int uksm_flags_can_scan(unsigned long vm_flags)
448+{
449+ return 0;
450+}
451+
452+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
453+{
454+}
455+
456+static inline void uksm_bugon_zeropage(pte_t pte)
457+{
458+}
459+#endif /* !CONFIG_UKSM */
460+#endif /* __LINUX_UKSM_H */
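
To illustrate the uksm_flags_can_scan() filter defined above (assuming CONFIG_UKSM=y): only mappings without the shared/IO/hugetlb/grows-up/grows-down bits are eligible, which is why the mm/mmap.c hunks below add VM_MERGEABLE only through uksm_vm_flags_mod(). A small kernel-style sketch; the flag combinations and the helper name check_flags are invented for this example.

/* Hedged sketch: which vm_flags survive the UKSM eligibility filter. */
#include <linux/mm.h>
#include <linux/uksm.h>

static void check_flags(void)
{
	unsigned long anon_private = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
	unsigned long shared_map   = anon_private | VM_SHARED | VM_MAYSHARE;
	unsigned long io_map       = anon_private | VM_IO | VM_PFNMAP;

	WARN_ON(!uksm_flags_can_scan(anon_private));	/* eligible: may get VM_MERGEABLE */
	WARN_ON(uksm_flags_can_scan(shared_map));	/* shared mappings are skipped */
	WARN_ON(uksm_flags_can_scan(io_map));		/* PFN/IO mappings are skipped */

	uksm_vm_flags_mod(&anon_private);		/* sets VM_MERGEABLE on the eligible one */
}
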
461diff -Naur a/kernel/fork.c b/kernel/fork.c
462--- a/kernel/fork.c 2020-06-14 21:45:04.000000000 +0200
463+++ b/kernel/fork.c 2020-06-15 10:34:30.890359050 +0200
464@@ -603,7 +603,7 @@
465 __vma_link_rb(mm, tmp, rb_link, rb_parent);
466 rb_link = &tmp->vm_rb.rb_right;
467 rb_parent = &tmp->vm_rb;
468-
469+ uksm_vma_add_new(tmp);
470 mm->map_count++;
471 if (!(tmp->vm_flags & VM_WIPEONFORK))
472 retval = copy_page_range(mm, oldmm, mpnt);
473diff -Naur a/lib/Makefile b/lib/Makefile
474--- a/lib/Makefile 2020-06-14 21:45:04.000000000 +0200
475+++ b/lib/Makefile 2020-06-15 10:34:30.890359050 +0200
476@@ -29,7 +29,7 @@
477 KCSAN_SANITIZE_random32.o := n
478
479 lib-y := ctype.o string.o vsprintf.o cmdline.o \
480- rbtree.o radix-tree.o timerqueue.o xarray.o \
481+ rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
482 idr.o extable.o sha1.o irq_regs.o argv_split.o \
483 flex_proportions.o ratelimit.o show_mem.o \
484 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
485diff -Naur a/lib/sradix-tree.c b/lib/sradix-tree.c
486--- a/lib/sradix-tree.c 1970-01-01 01:00:00.000000000 +0100
487+++ b/lib/sradix-tree.c 2020-06-15 10:34:30.890359050 +0200
488@@ -0,0 +1,476 @@
489+#include <linux/errno.h>
490+#include <linux/mm.h>
491+#include <linux/mman.h>
492+#include <linux/spinlock.h>
493+#include <linux/slab.h>
494+#include <linux/gcd.h>
495+#include <linux/sradix-tree.h>
496+
497+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
498+{
499+ return node->fulls == root->stores_size ||
500+ (node->height == 1 && node->count == root->stores_size);
501+}
502+
503+/*
504+ * Extend a sradix tree so it can store key @index.
505+ */
506+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
507+{
508+ struct sradix_tree_node *node;
509+ unsigned int height;
510+
511+ if (unlikely(root->rnode == NULL)) {
512+ if (!(node = root->alloc()))
513+ return -ENOMEM;
514+
515+ node->height = 1;
516+ root->rnode = node;
517+ root->height = 1;
518+ }
519+
520+ /* Figure out what the height should be. */
521+ height = root->height;
522+ index >>= root->shift * height;
523+
524+ while (index) {
525+ index >>= root->shift;
526+ height++;
527+ }
528+
529+ while (height > root->height) {
530+ unsigned int newheight;
531+
532+ if (!(node = root->alloc()))
533+ return -ENOMEM;
534+
535+ /* Increase the height. */
536+ node->stores[0] = root->rnode;
537+ root->rnode->parent = node;
538+ if (root->extend)
539+ root->extend(node, root->rnode);
540+
541+ newheight = root->height + 1;
542+ node->height = newheight;
543+ node->count = 1;
544+ if (sradix_node_full(root, root->rnode))
545+ node->fulls = 1;
546+
547+ root->rnode = node;
548+ root->height = newheight;
549+ }
550+
551+ return 0;
552+}
553+
554+/*
555+ * Search the next item from the current node, that is not NULL
556+ * and can satify root->iter().
557+ */
558+void *sradix_tree_next(struct sradix_tree_root *root,
559+ struct sradix_tree_node *node, unsigned long index,
560+ int (*iter)(void *item, unsigned long height))
561+{
562+ unsigned long offset;
563+ void *item;
564+
565+ if (unlikely(node == NULL)) {
566+ node = root->rnode;
567+ for (offset = 0; offset < root->stores_size; offset++) {
568+ item = node->stores[offset];
569+ if (item && (!iter || iter(item, node->height)))
570+ break;
571+ }
572+
573+ if (unlikely(offset >= root->stores_size))
574+ return NULL;
575+
576+ if (node->height == 1)
577+ return item;
578+ else
579+ goto go_down;
580+ }
581+
582+ while (node) {
583+ offset = (index & root->mask) + 1;
584+ for (; offset < root->stores_size; offset++) {
585+ item = node->stores[offset];
586+ if (item && (!iter || iter(item, node->height)))
587+ break;
588+ }
589+
590+ if (offset < root->stores_size)
591+ break;
592+
593+ node = node->parent;
594+ index >>= root->shift;
595+ }
596+
597+ if (!node)
598+ return NULL;
599+
600+ while (node->height > 1) {
601+go_down:
602+ node = item;
603+ for (offset = 0; offset < root->stores_size; offset++) {
604+ item = node->stores[offset];
605+ if (item && (!iter || iter(item, node->height)))
606+ break;
607+ }
608+
609+ if (unlikely(offset >= root->stores_size))
610+ return NULL;
611+ }
612+
613+ BUG_ON(offset > root->stores_size);
614+
615+ return item;
616+}
617+
618+/*
618+ * Blindly insert the items into the tree. Typically, we reuse the
619+ * first empty stores.
621+ */
622+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
623+{
624+ unsigned long index;
625+ unsigned int height;
626+ struct sradix_tree_node *node, *tmp = NULL;
627+ int offset, offset_saved;
628+ void **store = NULL;
629+ int error, i, j, shift;
630+
631+go_on:
632+ index = root->min;
633+
634+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
635+ node = root->enter_node;
636+ BUG_ON((index >> (root->shift * root->height)));
637+ } else {
638+ node = root->rnode;
639+ if (node == NULL || (index >> (root->shift * root->height))
640+ || sradix_node_full(root, node)) {
641+ error = sradix_tree_extend(root, index);
642+ if (error)
643+ return error;
644+
645+ node = root->rnode;
646+ }
647+ }
648+
649+
650+ height = node->height;
651+ shift = (height - 1) * root->shift;
652+ offset = (index >> shift) & root->mask;
653+ while (shift > 0) {
654+ offset_saved = offset;
655+ for (; offset < root->stores_size; offset++) {
656+ store = &node->stores[offset];
657+ tmp = *store;
658+
659+ if (!tmp || !sradix_node_full(root, tmp))
660+ break;
661+ }
662+ BUG_ON(offset >= root->stores_size);
663+
664+ if (offset != offset_saved) {
665+ index += (offset - offset_saved) << shift;
666+ index &= ~((1UL << shift) - 1);
667+ }
668+
669+ if (!tmp) {
670+ if (!(tmp = root->alloc()))
671+ return -ENOMEM;
672+
673+ tmp->height = shift / root->shift;
674+ *store = tmp;
675+ tmp->parent = node;
676+ node->count++;
677+// if (root->extend)
678+// root->extend(node, tmp);
679+ }
680+
681+ node = tmp;
682+ shift -= root->shift;
683+ offset = (index >> shift) & root->mask;
684+ }
685+
686+ BUG_ON(node->height != 1);
687+
688+
689+ store = &node->stores[offset];
690+ for (i = 0, j = 0;
691+ j < root->stores_size - node->count &&
692+ i < root->stores_size - offset && j < num; i++) {
693+ if (!store[i]) {
694+ store[i] = item[j];
695+ if (root->assign)
696+ root->assign(node, index + i, item[j]);
697+ j++;
698+ }
699+ }
700+
701+ node->count += j;
702+ root->num += j;
703+ num -= j;
704+
705+ while (sradix_node_full(root, node)) {
706+ node = node->parent;
707+ if (!node)
708+ break;
709+
710+ node->fulls++;
711+ }
712+
713+ if (unlikely(!node)) {
714+ /* All nodes are full */
715+ root->min = 1 << (root->height * root->shift);
716+ root->enter_node = NULL;
717+ } else {
718+ root->min = index + i - 1;
719+ root->min |= (1UL << (node->height - 1)) - 1;
720+ root->min++;
721+ root->enter_node = node;
722+ }
723+
724+ if (num) {
725+ item += j;
726+ goto go_on;
727+ }
728+
729+ return 0;
730+}
731+
732+
733+/**
734+ * sradix_tree_shrink - shrink height of a sradix tree to minimal
735+ * @root sradix tree root
736+ *
737+ */
738+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
739+{
740+ /* try to shrink tree height */
741+ while (root->height > 1) {
742+ struct sradix_tree_node *to_free = root->rnode;
743+
744+ /*
744+ * If the candidate node has more than one child, or its child
745+ * is not at the leftmost store, we cannot shrink.
747+ */
748+ if (to_free->count != 1 || !to_free->stores[0])
749+ break;
750+
751+ root->rnode = to_free->stores[0];
752+ root->rnode->parent = NULL;
753+ root->height--;
754+ if (unlikely(root->enter_node == to_free))
755+ root->enter_node = NULL;
756+ root->free(to_free);
757+ }
758+}
759+
760+/*
760+ * Delete the item at the known leaf node and index
762+ */
763+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
764+ struct sradix_tree_node *node, unsigned long index)
765+{
766+ unsigned int offset;
767+ struct sradix_tree_node *start, *end;
768+
769+ BUG_ON(node->height != 1);
770+
771+ start = node;
772+ while (node && !(--node->count))
773+ node = node->parent;
774+
775+ end = node;
776+ if (!node) {
777+ root->rnode = NULL;
778+ root->height = 0;
779+ root->min = 0;
780+ root->num = 0;
781+ root->enter_node = NULL;
782+ } else {
783+ offset = (index >> (root->shift * (node->height - 1))) & root->mask;
784+ if (root->rm)
785+ root->rm(node, offset);
786+ node->stores[offset] = NULL;
787+ root->num--;
788+ if (root->min > index) {
789+ root->min = index;
790+ root->enter_node = node;
791+ }
792+ }
793+
794+ if (start != end) {
795+ do {
796+ node = start;
797+ start = start->parent;
798+ if (unlikely(root->enter_node == node))
799+ root->enter_node = end;
800+ root->free(node);
801+ } while (start != end);
802+
803+ /*
803+ * Note that shrink may free "end", so enter_node still needs to
804+ * be checked inside.
806+ */
807+ sradix_tree_shrink(root);
808+ } else if (node->count == root->stores_size - 1) {
809+ /* It WAS a full leaf node. Update the ancestors */
810+ node = node->parent;
811+ while (node) {
812+ node->fulls--;
813+ if (node->fulls != root->stores_size - 1)
814+ break;
815+
816+ node = node->parent;
817+ }
818+ }
819+}
820+
821+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
822+{
823+ unsigned int height, offset;
824+ struct sradix_tree_node *node;
825+ int shift;
826+
827+ node = root->rnode;
828+ if (node == NULL || (index >> (root->shift * root->height)))
829+ return NULL;
830+
831+ height = root->height;
832+ shift = (height - 1) * root->shift;
833+
834+ do {
835+ offset = (index >> shift) & root->mask;
836+ node = node->stores[offset];
837+ if (!node)
838+ return NULL;
839+
840+ shift -= root->shift;
841+ } while (shift >= 0);
842+
843+ return node;
844+}
845+
846+/*
847+ * Return the item if it exists, otherwise create it in place
848+ * and return the created item.
849+ */
850+void *sradix_tree_lookup_create(struct sradix_tree_root *root,
851+ unsigned long index, void *(*item_alloc)(void))
852+{
853+ unsigned int height, offset;
854+ struct sradix_tree_node *node, *tmp;
855+ void *item;
856+ int shift, error;
857+
858+ if (root->rnode == NULL || (index >> (root->shift * root->height))) {
859+ if (item_alloc) {
860+ error = sradix_tree_extend(root, index);
861+ if (error)
862+ return NULL;
863+ } else {
864+ return NULL;
865+ }
866+ }
867+
868+ node = root->rnode;
869+ height = root->height;
870+ shift = (height - 1) * root->shift;
871+
872+ do {
873+ offset = (index >> shift) & root->mask;
874+ if (!node->stores[offset]) {
875+ if (!(tmp = root->alloc()))
876+ return NULL;
877+
878+ tmp->height = shift / root->shift;
879+ node->stores[offset] = tmp;
880+ tmp->parent = node;
881+ node->count++;
882+ node = tmp;
883+ } else {
884+ node = node->stores[offset];
885+ }
886+
887+ shift -= root->shift;
888+ } while (shift > 0);
889+
890+ BUG_ON(node->height != 1);
891+ offset = index & root->mask;
892+ if (node->stores[offset]) {
893+ return node->stores[offset];
894+ } else if (item_alloc) {
895+ if (!(item = item_alloc()))
896+ return NULL;
897+
898+ node->stores[offset] = item;
899+
900+ /*
901+ * NOTE: we do NOT call root->assign here, since this item is
902+ * newly created by us and carries no meaning yet. The caller can
903+ * invoke root->assign afterwards if necessary.
904+ */
905+
906+ node->count++;
907+ root->num++;
908+
909+ while (sradix_node_full(root, node)) {
910+ node = node->parent;
911+ if (!node)
912+ break;
913+
914+ node->fulls++;
915+ }
916+
917+ if (unlikely(!node)) {
918+ /* All nodes are full */
919+ root->min = 1 << (root->height * root->shift);
920+ } else {
921+ if (root->min == index) {
922+ root->min |= (1UL << (node->height - 1)) - 1;
923+ root->min++;
924+ root->enter_node = node;
925+ }
926+ }
927+
928+ return item;
929+ } else {
930+ return NULL;
931+ }
932+
933+}
934+
935+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
936+{
937+ unsigned int height, offset;
938+ struct sradix_tree_node *node;
939+ int shift;
940+
941+ node = root->rnode;
942+ if (node == NULL || (index >> (root->shift * root->height)))
943+ return -ENOENT;
944+
945+ height = root->height;
946+ shift = (height - 1) * root->shift;
947+
948+ do {
949+ offset = (index >> shift) & root->mask;
950+ node = node->stores[offset];
951+ if (!node)
952+ return -ENOENT;
953+
954+ shift -= root->shift;
955+ } while (shift > 0);
956+
957+ offset = index & root->mask;
958+ if (!node->stores[offset])
959+ return -ENOENT;
960+
961+ sradix_tree_delete_from_leaf(root, node, index);
962+
963+ return 0;
964+}
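
sradix_tree_lookup_create() above performs a get-or-create in one step, building intermediate nodes as needed; note it is defined here but not declared in include/linux/sradix-tree.h. A minimal sketch of a caller; my_item_alloc and get_or_create are names invented for this example.

/* Hedged sketch: get-or-create an item with sradix_tree_lookup_create(). */
#include <linux/slab.h>
#include <linux/sradix-tree.h>

/* Defined in lib/sradix-tree.c but not declared in the header above. */
extern void *sradix_tree_lookup_create(struct sradix_tree_root *root,
		unsigned long index, void *(*item_alloc)(void));

static void *my_item_alloc(void)
{
	return kzalloc(sizeof(unsigned long), GFP_KERNEL);
}

static void *get_or_create(struct sradix_tree_root *root, unsigned long index)
{
	void *item = sradix_tree_lookup_create(root, index, my_item_alloc);

	/*
	 * On success this is either the pre-existing item or a fresh one.
	 * Note that root->assign is deliberately NOT called for new items,
	 * so any per-item bookkeeping must be done by the caller.
	 */
	return item;
}
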
965diff -Naur a/mm/Kconfig b/mm/Kconfig
966--- a/mm/Kconfig 2020-06-14 21:45:04.000000000 +0200
967+++ b/mm/Kconfig 2020-06-15 10:34:30.890359050 +0200
968@@ -321,6 +321,32 @@
969 See Documentation/vm/ksm.rst for more information: KSM is inactive
970 until a program has madvised that an area is MADV_MERGEABLE, and
971 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
972+choice
973+ prompt "Choose UKSM/KSM strategy"
974+ default UKSM
975+ depends on KSM
976+ help
977+	  This option allows selecting a UKSM/KSM strategy.
978+
979+config UKSM
980+ bool "Ultra-KSM for page merging"
981+ depends on KSM
982+ help
983+	  UKSM is inspired by the Linux kernel's KSM (Kernel Samepage
984+	  Merging), but with a fundamentally rewritten core algorithm. With
985+	  an advanced algorithm, UKSM can transparently scan all anonymously
986+	  mapped user space memory with significantly improved scan speed
987+	  and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
988+	  from UKSM. UKSM now has its first stable release and its first
989+	  real-world enterprise user. For more information, please see its project page:
990+ (github.com/dolohow/uksm)
991+
992+config KSM_LEGACY
993+ bool "Legacy KSM implementation"
994+ depends on KSM
995+ help
996+ The legacy KSM implementation from Red Hat.
997+endchoice
998
999 config DEFAULT_MMAP_MIN_ADDR
1000 int "Low address space to protect from user allocation"
1001diff -Naur a/mm/ksm.c b/mm/ksm.c
1002--- a/mm/ksm.c 2020-06-14 21:45:04.000000000 +0200
1003+++ b/mm/ksm.c 2020-06-15 10:34:30.892359059 +0200
1004@@ -857,17 +857,6 @@
1005 return err;
1006 }
1007
1008-static inline struct stable_node *page_stable_node(struct page *page)
1009-{
1010- return PageKsm(page) ? page_rmapping(page) : NULL;
1011-}
1012-
1013-static inline void set_page_stable_node(struct page *page,
1014- struct stable_node *stable_node)
1015-{
1016- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
1017-}
1018-
1019 #ifdef CONFIG_SYSFS
1020 /*
1021 * Only called through the sysfs control interface:
1022diff -Naur a/mm/Makefile b/mm/Makefile
1023--- a/mm/Makefile 2020-06-14 21:45:04.000000000 +0200
1024+++ b/mm/Makefile 2020-06-15 10:34:30.892359059 +0200
1025@@ -76,7 +76,8 @@
1026 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
1027 obj-$(CONFIG_SLOB) += slob.o
1028 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
1029-obj-$(CONFIG_KSM) += ksm.o
1030+obj-$(CONFIG_KSM_LEGACY) += ksm.o
1031+obj-$(CONFIG_UKSM) += uksm.o
1032 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
1033 obj-$(CONFIG_SLAB) += slab.o
1034 obj-$(CONFIG_SLUB) += slub.o
1035diff -Naur a/mm/memory.c b/mm/memory.c
1036--- a/mm/memory.c 2020-06-14 21:45:04.000000000 +0200
1037+++ b/mm/memory.c 2020-06-15 10:34:30.892359059 +0200
1038@@ -143,6 +143,25 @@
1039
1040 unsigned long highest_memmap_pfn __read_mostly;
1041
1042+#ifdef CONFIG_UKSM
1043+unsigned long uksm_zero_pfn __read_mostly;
1044+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
1045+struct page *empty_uksm_zero_page;
1046+
1047+static int __init setup_uksm_zero_page(void)
1048+{
1049+ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
1050+ if (!empty_uksm_zero_page)
1051+ panic("Oh boy, that early out of memory?");
1052+
1053+ SetPageReserved(empty_uksm_zero_page);
1054+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
1055+
1056+ return 0;
1057+}
1058+core_initcall(setup_uksm_zero_page);
1059+#endif
1060+
1061 /*
1062 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
1063 */
1064@@ -158,6 +177,7 @@
1065 trace_rss_stat(mm, member, count);
1066 }
1067
1068+
1069 #if defined(SPLIT_RSS_COUNTING)
1070
1071 void sync_mm_rss(struct mm_struct *mm)
1072@@ -801,6 +821,11 @@
1073 get_page(page);
1074 page_dup_rmap(page, false);
1075 rss[mm_counter(page)]++;
1076+
1077+ /* Should return NULL in vm_normal_page() */
1078+ uksm_bugon_zeropage(pte);
1079+ } else {
1080+ uksm_map_zero_page(pte);
1081 }
1082
1083 out_set_pte:
1084@@ -1073,8 +1098,10 @@
1085 ptent = ptep_get_and_clear_full(mm, addr, pte,
1086 tlb->fullmm);
1087 tlb_remove_tlb_entry(tlb, pte, addr);
1088- if (unlikely(!page))
1089+ if (unlikely(!page)) {
1090+ uksm_unmap_zero_page(ptent);
1091 continue;
1092+ }
1093
1094 if (!PageAnon(page)) {
1095 if (pte_dirty(ptent)) {
1096@@ -2409,6 +2436,7 @@
1097
1098 if (likely(src)) {
1099 copy_user_highpage(dst, src, addr, vma);
1100+ uksm_cow_page(vma, src);
1101 return true;
1102 }
1103
1104@@ -2654,6 +2682,7 @@
1105 vmf->address);
1106 if (!new_page)
1107 goto oom;
1108+ uksm_cow_pte(vma, vmf->orig_pte);
1109 } else {
1110 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
1111 vmf->address);
1112@@ -2696,7 +2725,9 @@
1113 mm_counter_file(old_page));
1114 inc_mm_counter_fast(mm, MM_ANONPAGES);
1115 }
1116+ uksm_bugon_zeropage(vmf->orig_pte);
1117 } else {
1118+ uksm_unmap_zero_page(vmf->orig_pte);
1119 inc_mm_counter_fast(mm, MM_ANONPAGES);
1120 }
1121 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
1122diff -Naur a/mm/mmap.c b/mm/mmap.c
1123--- a/mm/mmap.c 2020-06-14 21:45:04.000000000 +0200
1124+++ b/mm/mmap.c 2020-06-15 10:56:23.801940631 +0200
1125@@ -46,6 +46,7 @@
1126 #include <linux/moduleparam.h>
1127 #include <linux/pkeys.h>
1128 #include <linux/oom.h>
1129+#include <linux/ksm.h>
1130 #include <linux/sched/mm.h>
1131
1132 #include <linux/uaccess.h>
1133@@ -181,6 +182,7 @@
1134 if (vma->vm_file)
1135 fput(vma->vm_file);
1136 mpol_put(vma_policy(vma));
1137+ uksm_remove_vma(vma);
1138 vm_area_free(vma);
1139 return next;
1140 }
1141@@ -708,9 +710,16 @@
1142 long adjust_next = 0;
1143 int remove_next = 0;
1144
1145+/*
1146+ * To avoid deadlock, uksm_remove_vma() must be called before any spinlock
1147+ * is acquired.
1148+ */
1149+ uksm_remove_vma(vma);
1150+
1151 if (next && !insert) {
1152 struct vm_area_struct *exporter = NULL, *importer = NULL;
1153
1154+ uksm_remove_vma(next);
1155 if (end >= next->vm_end) {
1156 /*
1157 * vma expands, overlapping all the next, and
1158@@ -841,6 +850,7 @@
1159 end_changed = true;
1160 }
1161 vma->vm_pgoff = pgoff;
1162+
1163 if (adjust_next) {
1164 next->vm_start += adjust_next << PAGE_SHIFT;
1165 next->vm_pgoff += adjust_next;
1166@@ -946,6 +956,7 @@
1167 if (remove_next == 2) {
1168 remove_next = 1;
1169 end = next->vm_end;
1170+ uksm_remove_vma(next);
1171 goto again;
1172 }
1173 else if (next)
1174@@ -972,10 +983,14 @@
1175 */
1176 VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
1177 }
1178+ } else {
1179+ if (next && !insert)
1180+ uksm_vma_add_new(next);
1181 }
1182 if (insert && file)
1183 uprobe_mmap(insert);
1184
1185+ uksm_vma_add_new(vma);
1186 validate_mm(mm);
1187
1188 return 0;
1189@@ -1434,6 +1449,9 @@
1190 vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1191 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1192
1193+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
1194+ uksm_vm_flags_mod(&vm_flags);
1195+
1196 if (flags & MAP_LOCKED)
1197 if (!can_do_mlock())
1198 return -EPERM;
1199@@ -1801,6 +1819,7 @@
1200 allow_write_access(file);
1201 }
1202 file = vma->vm_file;
1203+ uksm_vma_add_new(vma);
1204 out:
1205 perf_event_mmap(vma);
1206
1207@@ -1843,6 +1862,7 @@
1208 if (vm_flags & VM_DENYWRITE)
1209 allow_write_access(file);
1210 free_vma:
1211+ uksm_remove_vma(vma);
1212 vm_area_free(vma);
1213 unacct_error:
1214 if (charged)
1215@@ -2694,6 +2714,8 @@
1216 else
1217 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1218
1219+ uksm_vma_add_new(new);
1220+
1221 /* Success. */
1222 if (!err)
1223 return 0;
1224@@ -3000,6 +3022,7 @@
1225 if ((flags & (~VM_EXEC)) != 0)
1226 return -EINVAL;
1227 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1228+ uksm_vm_flags_mod(&flags);
1229
1230 mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
1231 if (IS_ERR_VALUE(mapped_addr))
1232@@ -3050,6 +3073,7 @@
1233 vma->vm_flags = flags;
1234 vma->vm_page_prot = vm_get_page_prot(flags);
1235 vma_link(mm, vma, prev, rb_link, rb_parent);
1236+ uksm_vma_add_new(vma);
1237 out:
1238 perf_event_mmap(vma);
1239 mm->total_vm += len >> PAGE_SHIFT;
1240@@ -3127,6 +3151,12 @@
1241 mmap_write_unlock(mm);
1242 }
1243
1244+ /*
1245+ * Taking write lock on mmap_lock does not harm others,
1246+ * but it's crucial for uksm to avoid races.
1247+ */
1248+ mmap_write_lock(mm);
1249+
1250 if (mm->locked_vm) {
1251 vma = mm->mmap;
1252 while (vma) {
1253@@ -3161,6 +3191,11 @@
1254 vma = remove_vma(vma);
1255 }
1256 vm_unacct_memory(nr_accounted);
1257+
1258+ mm->mmap = NULL;
1259+ mm->mm_rb = RB_ROOT;
1260+ vmacache_invalidate(mm);
1261+ mmap_write_unlock(mm);
1262 }
1263
1264 /* Insert vm structure into process list sorted by address
1265@@ -3268,6 +3303,7 @@
1266 new_vma->vm_ops->open(new_vma);
1267 vma_link(mm, new_vma, prev, rb_link, rb_parent);
1268 *need_rmap_locks = false;
1269+ uksm_vma_add_new(new_vma);
1270 }
1271 return new_vma;
1272
1273@@ -3420,6 +3456,7 @@
1274 vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
1275
1276 perf_event_mmap(vma);
1277+ uksm_vma_add_new(vma);
1278
1279 return vma;
1280
1281diff -Naur a/mm/uksm.c b/mm/uksm.c
1282--- a/mm/uksm.c 1970-01-01 01:00:00.000000000 +0100
1283+++ b/mm/uksm.c 2020-06-15 10:34:30.892359059 +0200
1284@@ -0,0 +1,5613 @@
1285+/*
1286+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
1287+ *
1288+ * This is an improvement upon KSM. Some basic data structures and routines
1289+ * are borrowed from ksm.c .
1290+ *
1291+ * Its new features:
1292+ * 1. Full system scan:
1293+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
1294+ * interaction to submit a memory area to KSM is no longer needed.
1295+ *
1296+ * 2. Rich area detection:
1297+ * It automatically detects rich areas containing abundant duplicated
1298+ * pages. Rich areas are given full scan speed. Poor areas are
1299+ * sampled at a reasonable speed with very low CPU consumption.
1300+ *
1301+ * 3. Ultra Per-page scan speed improvement:
1302+ * A new hash algorithm is proposed. As a result, on a machine with
1303+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
1304+ * can scan memory areas that do not contain duplicated pages at a speed of
1305+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
1306+ * 477MB/sec ~ 923MB/sec.
1307+ *
1308+ * 4. Thrashing area avoidance:
1309+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be
1310+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
1311+ * hash-value-based volatile page detection.
1312+ *
1313+ *
1314+ * 5. Misc changes upon KSM:
1315+ * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
1316+ * comparison. It's much faster than the default C version on x86.
1317+ * * rmap_item now has a struct page * member to loosely cache an
1318+ * address-->page mapping, which avoids many time-costly calls to
1319+ * follow_page().
1320+ * * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
1321+ * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
1322+ * ksm is needed in this case.
1323+ *
1324+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
1325+ * Now uksmd considers full zero pages as special pages and merges them into a
1326+ * special unswappable uksm zero page.
1327+ */
1328+
1329+#include <linux/errno.h>
1330+#include <linux/mm.h>
1331+#include <linux/fs.h>
1332+#include <linux/mman.h>
1333+#include <linux/sched.h>
1334+#include <linux/sched/mm.h>
1335+#include <linux/sched/coredump.h>
1336+#include <linux/sched/cputime.h>
1337+#include <linux/rwsem.h>
1338+#include <linux/pagemap.h>
1339+#include <linux/rmap.h>
1340+#include <linux/spinlock.h>
1341+#include <linux/jhash.h>
1342+#include <linux/delay.h>
1343+#include <linux/kthread.h>
1344+#include <linux/wait.h>
1345+#include <linux/slab.h>
1346+#include <linux/rbtree.h>
1347+#include <linux/memory.h>
1348+#include <linux/mmu_notifier.h>
1349+#include <linux/swap.h>
1350+#include <linux/ksm.h>
1351+#include <linux/crypto.h>
1352+#include <linux/scatterlist.h>
1353+#include <crypto/hash.h>
1354+#include <linux/random.h>
1355+#include <linux/math64.h>
1356+#include <linux/gcd.h>
1357+#include <linux/freezer.h>
1358+#include <linux/oom.h>
1359+#include <linux/numa.h>
1360+#include <linux/sradix-tree.h>
1361+
1362+#include <asm/tlbflush.h>
1363+#include "internal.h"
1364+
1365+#ifdef CONFIG_X86
1366+#undef memcmp
1367+
1368+#ifdef CONFIG_X86_32
1369+#define memcmp memcmpx86_32
1370+/*
1371+ * Compare the 4-byte-aligned addresses s1 and s2, of length n
1372+ */
1373+int memcmpx86_32(void *s1, void *s2, size_t n)
1374+{
1375+ size_t num = n / 4;
1376+ register int res;
1377+
1378+ __asm__ __volatile__
1379+ (
1380+ "testl %3,%3\n\t"
1381+ "repe; cmpsd\n\t"
1382+ "je 1f\n\t"
1383+ "sbbl %0,%0\n\t"
1384+ "orl $1,%0\n"
1385+ "1:"
1386+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1387+ : "0" (0)
1388+ : "cc");
1389+
1390+ return res;
1391+}
1392+
1393+/*
1394+ * Check whether the page is all zeros.
1395+ */
1396+static int is_full_zero(const void *s1, size_t len)
1397+{
1398+ unsigned char same;
1399+
1400+ len /= 4;
1401+
1402+ __asm__ __volatile__
1403+ ("repe; scasl;"
1404+ "sete %0"
1405+ : "=qm" (same), "+D" (s1), "+c" (len)
1406+ : "a" (0)
1407+ : "cc");
1408+
1409+ return same;
1410+}
1411+
1412+
1413+#elif defined(CONFIG_X86_64)
1414+#define memcmp memcmpx86_64
1415+/*
1416+ * Compare the 8-byte-aligned addresses s1 and s2, of length n
1417+ */
1418+int memcmpx86_64(void *s1, void *s2, size_t n)
1419+{
1420+ size_t num = n / 8;
1421+ register int res;
1422+
1423+ __asm__ __volatile__
1424+ (
1425+ "testq %q3,%q3\n\t"
1426+ "repe; cmpsq\n\t"
1427+ "je 1f\n\t"
1428+ "sbbq %q0,%q0\n\t"
1429+ "orq $1,%q0\n"
1430+ "1:"
1431+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
1432+ : "0" (0)
1433+ : "cc");
1434+
1435+ return res;
1436+}
1437+
1438+static int is_full_zero(const void *s1, size_t len)
1439+{
1440+ unsigned char same;
1441+
1442+ len /= 8;
1443+
1444+ __asm__ __volatile__
1445+ ("repe; scasq;"
1446+ "sete %0"
1447+ : "=qm" (same), "+D" (s1), "+c" (len)
1448+ : "a" (0)
1449+ : "cc");
1450+
1451+ return same;
1452+}
1453+
1454+#endif
1455+#else
1456+static int is_full_zero(const void *s1, size_t len)
1457+{
1458+ unsigned long *src = s1;
1459+ int i;
1460+
1461+ len /= sizeof(*src);
1462+
1463+ for (i = 0; i < len; i++) {
1464+ if (src[i])
1465+ return 0;
1466+ }
1467+
1468+ return 1;
1469+}
1470+#endif
1471+
1472+#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
1473+#define TIME_RATIO_SCALE 10000
1474+
1475+#define SLOT_TREE_NODE_SHIFT 8
1476+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
1477+struct slot_tree_node {
1478+ unsigned long size;
1479+ struct sradix_tree_node snode;
1480+ void *stores[SLOT_TREE_NODE_STORE_SIZE];
1481+};
1482+
1483+static struct kmem_cache *slot_tree_node_cachep;
1484+
1485+static struct sradix_tree_node *slot_tree_node_alloc(void)
1486+{
1487+ struct slot_tree_node *p;
1488+
1489+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
1490+ __GFP_NORETRY | __GFP_NOWARN);
1491+ if (!p)
1492+ return NULL;
1493+
1494+ return &p->snode;
1495+}
1496+
1497+static void slot_tree_node_free(struct sradix_tree_node *node)
1498+{
1499+ struct slot_tree_node *p;
1500+
1501+ p = container_of(node, struct slot_tree_node, snode);
1502+ kmem_cache_free(slot_tree_node_cachep, p);
1503+}
1504+
1505+static void slot_tree_node_extend(struct sradix_tree_node *parent,
1506+ struct sradix_tree_node *child)
1507+{
1508+ struct slot_tree_node *p, *c;
1509+
1510+ p = container_of(parent, struct slot_tree_node, snode);
1511+ c = container_of(child, struct slot_tree_node, snode);
1512+
1513+ p->size += c->size;
1514+}
1515+
1516+void slot_tree_node_assign(struct sradix_tree_node *node,
1517+ unsigned int index, void *item)
1518+{
1519+ struct vma_slot *slot = item;
1520+ struct slot_tree_node *cur;
1521+
1522+ slot->snode = node;
1523+ slot->sindex = index;
1524+
1525+ while (node) {
1526+ cur = container_of(node, struct slot_tree_node, snode);
1527+ cur->size += slot->pages;
1528+ node = node->parent;
1529+ }
1530+}
1531+
1532+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
1533+{
1534+ struct vma_slot *slot;
1535+ struct slot_tree_node *cur;
1536+ unsigned long pages;
1537+
1538+ if (node->height == 1) {
1539+ slot = node->stores[offset];
1540+ pages = slot->pages;
1541+ } else {
1542+ cur = container_of(node->stores[offset],
1543+ struct slot_tree_node, snode);
1544+ pages = cur->size;
1545+ }
1546+
1547+ while (node) {
1548+ cur = container_of(node, struct slot_tree_node, snode);
1549+ cur->size -= pages;
1550+ node = node->parent;
1551+ }
1552+}
1553+
1554+unsigned long slot_iter_index;
1555+int slot_iter(void *item, unsigned long height)
1556+{
1557+ struct slot_tree_node *node;
1558+ struct vma_slot *slot;
1559+
1560+ if (height == 1) {
1561+ slot = item;
1562+ if (slot_iter_index < slot->pages) {
1563+ /*in this one*/
1564+ return 1;
1565+ } else {
1566+ slot_iter_index -= slot->pages;
1567+ return 0;
1568+ }
1569+
1570+ } else {
1571+ node = container_of(item, struct slot_tree_node, snode);
1572+ if (slot_iter_index < node->size) {
1573+ /*in this one*/
1574+ return 1;
1575+ } else {
1576+ slot_iter_index -= node->size;
1577+ return 0;
1578+ }
1579+ }
1580+}
1581+
1582+
1583+static inline void slot_tree_init_root(struct sradix_tree_root *root)
1584+{
1585+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
1586+ root->alloc = slot_tree_node_alloc;
1587+ root->free = slot_tree_node_free;
1588+ root->extend = slot_tree_node_extend;
1589+ root->assign = slot_tree_node_assign;
1590+ root->rm = slot_tree_node_rm;
1591+}
1592+
1593+void slot_tree_init(void)
1594+{
1595+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
1596+ sizeof(struct slot_tree_node), 0,
1597+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1598+ NULL);
1599+}
1600+
1601+
1602+/* Each rung of this ladder is a list of VMAs having the same scan ratio */
1603+struct scan_rung {
1604+ //struct list_head scanned_list;
1605+ struct sradix_tree_root vma_root;
1606+ struct sradix_tree_root vma_root2;
1607+
1608+ struct vma_slot *current_scan;
1609+ unsigned long current_offset;
1610+
1611+ /*
1612+ * The initial value for current_offset; it should loop over
1613+ * [0, step - 1] so that every slot gets a chance to be scanned.
1614+ */
1615+ unsigned long offset_init;
1616+ unsigned long step; /* dynamic step for current_offset */
1617+ unsigned int flags;
1618+ unsigned long pages_to_scan;
1619+ //unsigned long fully_scanned_slots;
1620+ /*
1621+ * A little bit tricky - if cpu_time_ratio > 0, then the value is the
1622+ * cpu time ratio it can spend in rung_i for every scan
1623+ * period. If < 0, then it is the cpu time ratio relative to the
1624+ * max cpu percentage the user specified. Both in units of
1625+ * 1/TIME_RATIO_SCALE
1626+ */
1627+ int cpu_ratio;
1628+
1629+ /*
1630+ * How long will it take for all slots in this rung to be fully
1631+ * scanned? If it's zero, we don't care about the cover time:
1632+ * the rung is already fully scanned.
1633+ */
1634+ unsigned int cover_msecs;
1635+ //unsigned long vma_num;
1636+ //unsigned long pages; /* Sum of all slot's pages in rung */
1637+};
1638+
1639+/**
1640+ * node of either the stable or unstable rbtree
1641+ *
1642+ */
1643+struct tree_node {
1644+ struct rb_node node; /* link in the main (un)stable rbtree */
1645+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
1646+ u32 hash;
1647+ unsigned long count; /* TODO: merged with sub_root */
1648+ struct list_head all_list; /* all tree nodes in stable/unstable tree */
1649+};
1650+
1651+/**
1652+ * struct stable_node - node of the stable rbtree
1653+ * @node: rb node of this ksm page in the stable tree
1654+ * @hlist: hlist head of rmap_items using this ksm page
1655+ * @kpfn: page frame number of this ksm page
1656+ */
1657+struct stable_node {
1658+ struct rb_node node; /* link in sub-rbtree */
1659+	struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
1660+ struct hlist_head hlist;
1661+ unsigned long kpfn;
1662+ u32 hash_max; /* if ==0 then it's not been calculated yet */
1663+ struct list_head all_list; /* in a list for all stable nodes */
1664+};
1665+
1666+/**
1667+ * struct node_vma - group rmap_items linked in a same stable
1668+ * node together.
1669+ */
1670+struct node_vma {
1671+ union {
1672+ struct vma_slot *slot;
1673+ unsigned long key; /* slot is used as key sorted on hlist */
1674+ };
1675+ struct hlist_node hlist;
1676+ struct hlist_head rmap_hlist;
1677+ struct stable_node *head;
1678+};
1679+
1680+/**
1681+ * struct rmap_item - reverse mapping item for virtual addresses
1682+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
1683+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
1684+ * @mm: the memory structure this rmap_item is pointing into
1685+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1686+ * @node: rb node of this rmap_item in the unstable tree
1687+ * @head: pointer to stable_node heading this list in the stable tree
1688+ * @hlist: link into hlist of rmap_items hanging off that stable_node
1689+ */
1690+struct rmap_item {
1691+ struct vma_slot *slot;
1692+ struct page *page;
1693+ unsigned long address; /* + low bits used for flags below */
1694+ unsigned long hash_round;
1695+ unsigned long entry_index;
1696+ union {
1697+ struct {/* when in unstable tree */
1698+ struct rb_node node;
1699+ struct tree_node *tree_node;
1700+ u32 hash_max;
1701+ };
1702+ struct { /* when in stable tree */
1703+ struct node_vma *head;
1704+ struct hlist_node hlist;
1705+ struct anon_vma *anon_vma;
1706+ };
1707+ };
1708+} __aligned(4);
1709+
1710+struct rmap_list_entry {
1711+ union {
1712+ struct rmap_item *item;
1713+ unsigned long addr;
1714+ };
1715+ /* lowest bit is used for is_addr tag */
1716+} __aligned(4); /* 4-aligned to fit into pages */
1717+
1718+
1719+/* Basic data structure definition ends */
1720+
1721+
1722+/*
1723+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
1724+ * The flags use the low bits of rmap_item.address
1725+ */
1726+#define UNSTABLE_FLAG 0x1
1727+#define STABLE_FLAG 0x2
1728+#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
1729+
1730+/*
1731+ * rmap_list_entry helpers
1732+ */
1733+#define IS_ADDR_FLAG 1
1734+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
1735+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
1736+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
1737+
1738+
1739+/*
1740+ * High speed caches for frequently allocated and freed structs
1741+ */
1742+static struct kmem_cache *rmap_item_cache;
1743+static struct kmem_cache *stable_node_cache;
1744+static struct kmem_cache *node_vma_cache;
1745+static struct kmem_cache *vma_slot_cache;
1746+static struct kmem_cache *tree_node_cache;
1747+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
1748+ sizeof(struct __struct), __alignof__(struct __struct),\
1749+ (__flags), NULL)
1750+
1751+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
1752+#define SCAN_LADDER_SIZE 4
1753+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
1754+
1755+/* The evaluation rounds uksmd has finished */
1756+static unsigned long long uksm_eval_round = 1;
1757+
1758+/*
1759+ * We add 1 to this variable when we decide that the whole
1760+ * unstable tree should be rebuilt.
1761+ */
1762+static unsigned long uksm_hash_round = 1;
1763+
1764+/*
1765+ * How many times the whole memory is scanned.
1766+ */
1767+static unsigned long long fully_scanned_round = 1;
1768+
1769+/* The total number of virtual pages of all vma slots */
1770+static u64 uksm_pages_total;
1771+
1772+/* The number of pages scanned since startup */
1773+static u64 uksm_pages_scanned;
1774+
1775+static u64 scanned_virtual_pages;
1776+
1777+/* The number of pages scanned since the last encode_benefit call */
1778+static u64 uksm_pages_scanned_last;
1779+
1780+/* If the scanned number is too large, we encode it here */
1781+static u64 pages_scanned_stored;
1782+
1783+static unsigned long pages_scanned_base;
1784+
1785+/* The number of nodes in the stable tree */
1786+static unsigned long uksm_pages_shared;
1787+
1788+/* The number of page slots additionally sharing those nodes */
1789+static unsigned long uksm_pages_sharing;
1790+
1791+/* The number of nodes in the unstable tree */
1792+static unsigned long uksm_pages_unshared;
1793+
1794+/*
1795+ * Milliseconds ksmd should sleep between scans,
1796+ * >= 100ms to be consistent with
1797+ * scan_time_to_sleep_msec()
1798+ */
1799+static unsigned int uksm_sleep_jiffies;
1800+
1801+/* The real value for the uksmd next sleep */
1802+static unsigned int uksm_sleep_real;
1803+
1804+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
1805+static unsigned int uksm_sleep_saved;
1806+
1807+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
1808+static unsigned int uksm_max_cpu_percentage;
1809+
1810+static int uksm_cpu_governor;
1811+
1812+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
1813+
1814+struct uksm_cpu_preset_s {
1815+ int cpu_ratio[SCAN_LADDER_SIZE];
1816+ unsigned int cover_msecs[SCAN_LADDER_SIZE];
1817+ unsigned int max_cpu; /* percentage */
1818+};
1819+
1820+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
1821+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
1822+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
1823+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
1824+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
1825+};
1826+
1827+/* The default value for uksm_ema_page_time if it's not initialized */
1828+#define UKSM_PAGE_TIME_DEFAULT 500
1829+
1830+/* Cost to scan one page, as an exponential moving average, in nsecs */
1831+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
1832+
1833+/* The exponential moving average alpha weight, in percentage. */
1834+#define EMA_ALPHA 20
1835+
1836+/*
1837+ * The threshold used to filter out thrashing areas.
1838+ * If it == 0, filtering is disabled; otherwise it's the percentage upper bound
1839+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
1840+ * will be considered as having a zero duplication ratio.
1841+ */
1842+static unsigned int uksm_thrash_threshold = 50;
1843+
1844+/* How much dedup ratio is considered to be abundant */
1845+static unsigned int uksm_abundant_threshold = 10;
1846+
1847+/* All slots having merged pages in this eval round. */
1848+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
1849+
1850+/* How many times the ksmd has slept since startup */
1851+static unsigned long long uksm_sleep_times;
1852+
1853+#define UKSM_RUN_STOP 0
1854+#define UKSM_RUN_MERGE 1
1855+static unsigned int uksm_run = 1;
1856+
1857+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
1858+static DEFINE_MUTEX(uksm_thread_mutex);
1859+
1860+/*
1861+ * List vma_slot_new is for newly created vma_slot waiting to be added by
1862+ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
1863+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
1864+ * VMA has been removed/freed.
1865+ */
1866+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
1867+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
1868+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
1869+static DEFINE_SPINLOCK(vma_slot_list_lock);
1870+
1871+/* The unstable tree heads */
1872+static struct rb_root root_unstable_tree = RB_ROOT;
1873+
1874+/*
1875+ * All tree_nodes are in a list to be freed at once when unstable tree is
1876+ * freed after each scan round.
1877+ */
1878+static struct list_head unstable_tree_node_list =
1879+ LIST_HEAD_INIT(unstable_tree_node_list);
1880+
1881+/* List contains all stable nodes */
1882+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
1883+
1884+/*
1885+ * When the hash strength is changed, the stable tree must be delta_hashed and
1886+ * re-structured. We use the two sets of structs below to speed up the
1887+ * re-structuring of the stable tree.
1888+ */
1889+static struct list_head
1890+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
1891+ LIST_HEAD_INIT(stable_tree_node_list[1])};
1892+
1893+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
1894+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
1895+static struct rb_root *root_stable_treep = &root_stable_tree[0];
1896+static unsigned long stable_tree_index;
1897+
1898+/* The hash strength needed to hash a full page */
1899+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
1900+
1901+/* The hash strength needed for loop-back hashing */
1902+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
1903+
1904+/* The random offsets in a page */
1905+static u32 *random_nums;
1906+
1907+/* The hash strength */
1908+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
1909+
1910+/* The delta value each time the hash strength increases or decreases */
1911+static unsigned long hash_strength_delta;
1912+#define HASH_STRENGTH_DELTA_MAX 5
1913+
1914+/* The time we have saved due to random_sample_hash */
1915+static u64 rshash_pos;
1916+
1917+/* The time we have wasted due to hash collision */
1918+static u64 rshash_neg;
1919+
1920+struct uksm_benefit {
1921+ u64 pos;
1922+ u64 neg;
1923+ u64 scanned;
1924+ unsigned long base;
1925+} benefit;
1926+
1927+/*
1928+ * The relative cost of memcmp, compared to 1 time unit of random sample
1929+ * hash. This value is measured when the ksm module is initialized.
1930+ */
1931+static unsigned long memcmp_cost;
1932+
1933+static unsigned long rshash_neg_cont_zero;
1934+static unsigned long rshash_cont_obscure;
1935+
1936+/* The possible states of hash strength adjustment heuristic */
1937+enum rshash_states {
1938+ RSHASH_STILL,
1939+ RSHASH_TRYUP,
1940+ RSHASH_TRYDOWN,
1941+ RSHASH_NEW,
1942+ RSHASH_PRE_STILL,
1943+};
1944+
1945+/* The possible direction we are about to adjust hash strength */
1946+enum rshash_direct {
1947+ GO_UP,
1948+ GO_DOWN,
1949+ OBSCURE,
1950+ STILL,
1951+};
1952+
1953+/* random sampling hash state machine */
1954+static struct {
1955+ enum rshash_states state;
1956+ enum rshash_direct pre_direct;
1957+ u8 below_count;
1958+ /* Keep a lookup window of size 5; if above_count/below_count > 3
1959+ * within this window, we stop trying.
1960+ */
1961+ u8 lookup_window_index;
1962+ u64 stable_benefit;
1963+ unsigned long turn_point_down;
1964+ unsigned long turn_benefit_down;
1965+ unsigned long turn_point_up;
1966+ unsigned long turn_benefit_up;
1967+ unsigned long stable_point;
1968+} rshash_state;
1969+
1970+/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
1971+static u32 *zero_hash_table;
1972+
1973+static inline struct node_vma *alloc_node_vma(void)
1974+{
1975+ struct node_vma *node_vma;
1976+
1977+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
1978+ __GFP_NORETRY | __GFP_NOWARN);
1979+ if (node_vma) {
1980+ INIT_HLIST_HEAD(&node_vma->rmap_hlist);
1981+ INIT_HLIST_NODE(&node_vma->hlist);
1982+ }
1983+ return node_vma;
1984+}
1985+
1986+static inline void free_node_vma(struct node_vma *node_vma)
1987+{
1988+ kmem_cache_free(node_vma_cache, node_vma);
1989+}
1990+
1991+
1992+static inline struct vma_slot *alloc_vma_slot(void)
1993+{
1994+ struct vma_slot *slot;
1995+
1996+ /*
1997+ * In case ksm has not been initialized by now.
1998+ * The call site of uksm_init() may need to be reconsidered in the future.
1999+ */
2000+ if (!vma_slot_cache)
2001+ return NULL;
2002+
2003+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
2004+ __GFP_NORETRY | __GFP_NOWARN);
2005+ if (slot) {
2006+ INIT_LIST_HEAD(&slot->slot_list);
2007+ INIT_LIST_HEAD(&slot->dedup_list);
2008+ slot->flags |= UKSM_SLOT_NEED_RERAND;
2009+ }
2010+ return slot;
2011+}
2012+
2013+static inline void free_vma_slot(struct vma_slot *vma_slot)
2014+{
2015+ kmem_cache_free(vma_slot_cache, vma_slot);
2016+}
2017+
2018+
2019+
2020+static inline struct rmap_item *alloc_rmap_item(void)
2021+{
2022+ struct rmap_item *rmap_item;
2023+
2024+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
2025+ __GFP_NORETRY | __GFP_NOWARN);
2026+ if (rmap_item) {
2027+ /* bug on lowest bit is not clear for flag use */
2028+ BUG_ON(is_addr(rmap_item));
2029+ }
2030+ return rmap_item;
2031+}
2032+
2033+static inline void free_rmap_item(struct rmap_item *rmap_item)
2034+{
2035+ rmap_item->slot = NULL; /* debug safety */
2036+ kmem_cache_free(rmap_item_cache, rmap_item);
2037+}
2038+
2039+static inline struct stable_node *alloc_stable_node(void)
2040+{
2041+ struct stable_node *node;
2042+
2043+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
2044+ __GFP_NORETRY | __GFP_NOWARN);
2045+ if (!node)
2046+ return NULL;
2047+
2048+ INIT_HLIST_HEAD(&node->hlist);
2049+ list_add(&node->all_list, &stable_node_list);
2050+ return node;
2051+}
2052+
2053+static inline void free_stable_node(struct stable_node *stable_node)
2054+{
2055+ list_del(&stable_node->all_list);
2056+ kmem_cache_free(stable_node_cache, stable_node);
2057+}
2058+
2059+static inline struct tree_node *alloc_tree_node(struct list_head *list)
2060+{
2061+ struct tree_node *node;
2062+
2063+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
2064+ __GFP_NORETRY | __GFP_NOWARN);
2065+ if (!node)
2066+ return NULL;
2067+
2068+ list_add(&node->all_list, list);
2069+ return node;
2070+}
2071+
2072+static inline void free_tree_node(struct tree_node *node)
2073+{
2074+ list_del(&node->all_list);
2075+ kmem_cache_free(tree_node_cache, node);
2076+}
2077+
2078+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
2079+{
2080+ struct anon_vma *anon_vma = rmap_item->anon_vma;
2081+
2082+ put_anon_vma(anon_vma);
2083+}
2084+
2085+
2086+/**
2087+ * Remove a stable node from the stable tree. It may be unlinked from its
2088+ * tree_node, and that tree_node may be removed if no other stable node remains.
2089+ *
2090+ * @stable_node The node to be removed
2091+ * @unlink_rb Will this node be unlinked from the rbtree?
2092+ * @remove_tree_node Will its tree_node be removed if empty?
2093+ */
2094+static void remove_node_from_stable_tree(struct stable_node *stable_node,
2095+ int unlink_rb, int remove_tree_node)
2096+{
2097+ struct node_vma *node_vma;
2098+ struct rmap_item *rmap_item;
2099+ struct hlist_node *n;
2100+
2101+ if (!hlist_empty(&stable_node->hlist)) {
2102+ hlist_for_each_entry_safe(node_vma, n,
2103+ &stable_node->hlist, hlist) {
2104+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
2105+ uksm_pages_sharing--;
2106+
2107+ uksm_drop_anon_vma(rmap_item);
2108+ rmap_item->address &= PAGE_MASK;
2109+ }
2110+ free_node_vma(node_vma);
2111+ cond_resched();
2112+ }
2113+
2114+ /* the last one is counted as shared */
2115+ uksm_pages_shared--;
2116+ uksm_pages_sharing++;
2117+ }
2118+
2119+ if (stable_node->tree_node && unlink_rb) {
2120+ rb_erase(&stable_node->node,
2121+ &stable_node->tree_node->sub_root);
2122+
2123+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
2124+ remove_tree_node) {
2125+ rb_erase(&stable_node->tree_node->node,
2126+ root_stable_treep);
2127+ free_tree_node(stable_node->tree_node);
2128+ } else {
2129+ stable_node->tree_node->count--;
2130+ }
2131+ }
2132+
2133+ free_stable_node(stable_node);
2134+}
2135+
2136+
2137+/*
2138+ * get_uksm_page: checks if the page indicated by the stable node
2139+ * is still its ksm page, despite having held no reference to it.
2140+ * In which case we can trust the content of the page, and it
2141+ * returns the gotten page; but if the page has now been zapped,
2142+ * remove the stale node from the stable tree and return NULL.
2143+ *
2144+ * You would expect the stable_node to hold a reference to the ksm page.
2145+ * But if it increments the page's count, swapping out has to wait for
2146+ * ksmd to come around again before it can free the page, which may take
2147+ * seconds or even minutes: much too unresponsive. So instead we use a
2148+ * "keyhole reference": access to the ksm page from the stable node peeps
2149+ * out through its keyhole to see if that page still holds the right key,
2150+ * pointing back to this stable node. This relies on freeing a PageAnon
2151+ * page to reset its page->mapping to NULL, and relies on no other use of
2152+ * a page to put something that might look like our key in page->mapping.
2153+ *
2154+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
2155+ * but this is different - made simpler by uksm_thread_mutex being held, but
2156+ * interesting for assuming that no other use of the struct page could ever
2157+ * put our expected_mapping into page->mapping (or a field of the union which
2158+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
2159+ * to keep the page_count protocol described with page_cache_get_speculative.
2160+ *
2161+ * Note: it is possible that get_uksm_page() will return NULL one moment,
2162+ * then page the next, if the page is in between page_freeze_refs() and
2163+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
2164+ * is on its way to being freed; but it is an anomaly to bear in mind.
2165+ *
2166+ * @unlink_rb: whether removing this node should first unlink it from
2167+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
2168+ * node from its old tree.
2169+ *
2170+ * @remove_tree_node: if this is the last one in its tree_node, should the
2171+ * tree_node be freed? If we are inserting a stable node, this tree_node may
2172+ * be reused, so don't free it.
2173+ */
2174+static struct page *get_uksm_page(struct stable_node *stable_node,
2175+ int unlink_rb, int remove_tree_node)
2176+{
2177+ struct page *page;
2178+ void *expected_mapping;
2179+ unsigned long kpfn;
2180+
2181+ expected_mapping = (void *)((unsigned long)stable_node |
2182+ PAGE_MAPPING_KSM);
2183+again:
2184+ kpfn = READ_ONCE(stable_node->kpfn);
2185+ page = pfn_to_page(kpfn);
2186+
2187+ /*
2188+ * page is computed from kpfn, so on most architectures reading
2189+ * page->mapping is naturally ordered after reading node->kpfn,
2190+ * but on Alpha we need to be more careful.
2191+ */
2192+ smp_read_barrier_depends();
2193+
2194+ if (READ_ONCE(page->mapping) != expected_mapping)
2195+ goto stale;
2196+
2197+ /*
2198+ * We cannot do anything with the page while its refcount is 0.
2199+ * Usually 0 means free, or tail of a higher-order page: in which
2200+ * case this node is no longer referenced, and should be freed;
2201+ * however, it might mean that the page is under page_freeze_refs().
2202+ * The __remove_mapping() case is easy, again the node is now stale;
2203+ * but if page is swapcache in migrate_page_move_mapping(), it might
2204+ * still be our page, in which case it's essential to keep the node.
2205+ */
2206+ while (!get_page_unless_zero(page)) {
2207+ /*
2208+ * Another check for page->mapping != expected_mapping would
2209+ * work here too. We have chosen the !PageSwapCache test to
2210+ * optimize the common case, when the page is or is about to
2211+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
2212+ * in the freeze_refs section of __remove_mapping(); but Anon
2213+ * page->mapping reset to NULL later, in free_pages_prepare().
2214+ */
2215+ if (!PageSwapCache(page))
2216+ goto stale;
2217+ cpu_relax();
2218+ }
2219+
2220+ if (READ_ONCE(page->mapping) != expected_mapping) {
2221+ put_page(page);
2222+ goto stale;
2223+ }
2224+
2225+ lock_page(page);
2226+ if (READ_ONCE(page->mapping) != expected_mapping) {
2227+ unlock_page(page);
2228+ put_page(page);
2229+ goto stale;
2230+ }
2231+ unlock_page(page);
2232+ return page;
2233+stale:
2234+ /*
2235+ * We come here from above when page->mapping or !PageSwapCache
2236+ * suggests that the node is stale; but it might be under migration.
2237+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
2238+ * before checking whether node->kpfn has been changed.
2239+ */
2240+ smp_rmb();
2241+ if (stable_node->kpfn != kpfn)
2242+ goto again;
2243+
2244+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
2245+
2246+ return NULL;
2247+}
2248+
2249+/*
2250+ * Removing rmap_item from stable or unstable tree.
2251+ * This function will clean the information from the stable/unstable tree.
2252+ */
2253+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
2254+{
2255+ if (rmap_item->address & STABLE_FLAG) {
2256+ struct stable_node *stable_node;
2257+ struct node_vma *node_vma;
2258+ struct page *page;
2259+
2260+ node_vma = rmap_item->head;
2261+ stable_node = node_vma->head;
2262+ page = get_uksm_page(stable_node, 1, 1);
2263+ if (!page)
2264+ goto out;
2265+
2266+ /*
2267+ * page lock is needed because it's racing with
2268+ * try_to_unmap_ksm(), etc.
2269+ */
2270+ lock_page(page);
2271+ hlist_del(&rmap_item->hlist);
2272+
2273+ if (hlist_empty(&node_vma->rmap_hlist)) {
2274+ hlist_del(&node_vma->hlist);
2275+ free_node_vma(node_vma);
2276+ }
2277+ unlock_page(page);
2278+
2279+ put_page(page);
2280+ if (hlist_empty(&stable_node->hlist)) {
2281+ /* do NOT call remove_node_from_stable_tree() here,
2282+ * it's possible for a forked rmap_item to be absent
2283+ * from the stable tree while the in-tree rmap_items
2284+ * have been deleted.
2285+ */
2286+ uksm_pages_shared--;
2287+ } else
2288+ uksm_pages_sharing--;
2289+
2290+
2291+ uksm_drop_anon_vma(rmap_item);
2292+ } else if (rmap_item->address & UNSTABLE_FLAG) {
2293+ if (rmap_item->hash_round == uksm_hash_round) {
2294+
2295+ rb_erase(&rmap_item->node,
2296+ &rmap_item->tree_node->sub_root);
2297+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
2298+ rb_erase(&rmap_item->tree_node->node,
2299+ &root_unstable_tree);
2300+
2301+ free_tree_node(rmap_item->tree_node);
2302+ } else
2303+ rmap_item->tree_node->count--;
2304+ }
2305+ uksm_pages_unshared--;
2306+ }
2307+
2308+ rmap_item->address &= PAGE_MASK;
2309+ rmap_item->hash_max = 0;
2310+
2311+out:
2312+ cond_resched(); /* we're called from many long loops */
2313+}
2314+
2315+static inline int slot_in_uksm(struct vma_slot *slot)
2316+{
2317+ return list_empty(&slot->slot_list);
2318+}
2319+
2320+/*
2321+ * Test if the mm is exiting
2322+ */
2323+static inline bool uksm_test_exit(struct mm_struct *mm)
2324+{
2325+ return atomic_read(&mm->mm_users) == 0;
2326+}
2327+
2328+static inline unsigned long vma_pool_size(struct vma_slot *slot)
2329+{
2330+ return round_up(sizeof(struct rmap_list_entry) * slot->pages,
2331+ PAGE_SIZE) >> PAGE_SHIFT;
2332+}
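
A quick worked example of the arithmetic in vma_pool_size(): the pool holds one page pointer per PAGE_SIZE worth of rmap_list_entry records. A userspace sketch, assuming a 16-byte rmap_list_entry (an assumption; the real size may differ) and 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_SHIFT 12

    /* userspace stand-in for the kernel's round_up() */
    static unsigned long round_up_ul(unsigned long x, unsigned long align)
    {
        return (x + align - 1) / align * align;
    }

    int main(void)
    {
        unsigned long entry_size = 16;   /* assumed sizeof(struct rmap_list_entry) */
        unsigned long vma_pages = 1000;

        unsigned long pool_pages =
            round_up_ul(entry_size * vma_pages, PAGE_SIZE) >> PAGE_SHIFT;

        printf("pool pages needed: %lu\n", pool_pages); /* 4 */
        return 0;
    }
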
2333+
2334+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
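
CAN_OVERFLOW_U64(x, delta) is true exactly when x + delta would wrap around 64 bits; it is used below to decide when the benefit accumulators must be rescaled. A trivial userspace check (UINT64_MAX standing in for the kernel's U64_MAX):

    #include <stdint.h>
    #include <stdio.h>

    /* userspace copy of the macro above */
    #define CAN_OVERFLOW_U64(x, delta) (UINT64_MAX - (x) < (delta))

    int main(void)
    {
        uint64_t x = UINT64_MAX - 5;

        printf("%d\n", CAN_OVERFLOW_U64(x, (uint64_t)5)); /* 0: x + 5 still fits */
        printf("%d\n", CAN_OVERFLOW_U64(x, (uint64_t)6)); /* 1: x + 6 would wrap */
        return 0;
    }
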
2335+
2336+/* must be done with sem locked */
2337+static int slot_pool_alloc(struct vma_slot *slot)
2338+{
2339+ unsigned long pool_size;
2340+
2341+ if (slot->rmap_list_pool)
2342+ return 0;
2343+
2344+ pool_size = vma_pool_size(slot);
2345+ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
2346+ GFP_KERNEL);
2347+ if (!slot->rmap_list_pool)
2348+ return -ENOMEM;
2349+
2350+ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
2351+ GFP_KERNEL);
2352+ if (!slot->pool_counts) {
2353+ kfree(slot->rmap_list_pool);
2354+ return -ENOMEM;
2355+ }
2356+
2357+ slot->pool_size = pool_size;
2358+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
2359+ slot->flags |= UKSM_SLOT_IN_UKSM;
2360+ uksm_pages_total += slot->pages;
2361+
2362+ return 0;
2363+}
2364+
2365+/*
2366+ * Called after vma is unlinked from its mm
2367+ */
2368+void uksm_remove_vma(struct vm_area_struct *vma)
2369+{
2370+ struct vma_slot *slot;
2371+
2372+ if (!vma->uksm_vma_slot)
2373+ return;
2374+
2375+ spin_lock(&vma_slot_list_lock);
2376+ slot = vma->uksm_vma_slot;
2377+ if (!slot)
2378+ goto out;
2379+
2380+ if (slot_in_uksm(slot)) {
2381+ /**
2382+ * This slot has been added by ksmd, so move it to the del list
2383+ * and wait for ksmd to free it.
2384+ */
2385+ list_add_tail(&slot->slot_list, &vma_slot_del);
2386+ } else {
2387+ /**
2388+ * It's still on the new list, so it's OK to free the slot directly.
2389+ */
2390+ list_del(&slot->slot_list);
2391+ free_vma_slot(slot);
2392+ }
2393+out:
2394+ vma->uksm_vma_slot = NULL;
2395+ spin_unlock(&vma_slot_list_lock);
2396+}
2397+
2398+/**
2399+ * Need to do two things:
2400+ * 1. check if the slot was moved to the del list
2401+ * 2. make sure the mmap_lock is manipulated under a valid vma.
2402+ *
2403+ * My concern here is that in some cases this may cause
2404+ * vma_slot_list_lock waiters to be further serialized by some
2405+ * sem->wait_lock; can this really be expensive?
2406+ *
2407+ *
2408+ * @return
2409+ * 0: if successfully locked mmap_lock
2410+ * -ENOENT: this slot was moved to del list
2411+ * -EBUSY: vma lock failed
2412+ */
2413+static int try_down_read_slot_mmap_lock(struct vma_slot *slot)
2414+{
2415+ struct vm_area_struct *vma;
2416+ struct mm_struct *mm;
2417+ struct rw_semaphore *sem;
2418+
2419+ spin_lock(&vma_slot_list_lock);
2420+
2421+ /* the slot_list was removed and re-inited from the new list when the slot
2422+ * entered uksm. If it is not empty now, it must have been moved to the del list.
2423+ */
2424+ if (!slot_in_uksm(slot)) {
2425+ spin_unlock(&vma_slot_list_lock);
2426+ return -ENOENT;
2427+ }
2428+
2429+ BUG_ON(slot->pages != vma_pages(slot->vma));
2430+ /* Ok, vma still valid */
2431+ vma = slot->vma;
2432+ mm = vma->vm_mm;
2433+
2434+
2435+ if (uksm_test_exit(mm)) {
2436+ spin_unlock(&vma_slot_list_lock);
2437+ return -ENOENT;
2438+ }
2439+
2440+ if (mmap_read_trylock(mm)) {
2441+ spin_unlock(&vma_slot_list_lock);
2442+ if (slot_pool_alloc(slot)) {
2443+ uksm_remove_vma(vma);
2444+ mmap_read_unlock(mm);
2445+ return -ENOENT;
2446+ }
2447+ return 0;
2448+ }
2449+
2450+ spin_unlock(&vma_slot_list_lock);
2451+ return -EBUSY;
2452+}
2453+
2454+static inline unsigned long
2455+vma_page_address(struct page *page, struct vm_area_struct *vma)
2456+{
2457+ pgoff_t pgoff = page->index;
2458+ unsigned long address;
2459+
2460+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
2461+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
2462+ /* page should be within @vma mapping range */
2463+ return -EFAULT;
2464+ }
2465+ return address;
2466+}
2467+
2468+
2469+/* return 0 on success with the item's mmap_lock locked */
2470+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
2471+{
2472+ struct mm_struct *mm;
2473+ struct vma_slot *slot = item->slot;
2474+ int err = -EINVAL;
2475+
2476+ struct page *page;
2477+
2478+ /*
2479+ * try_down_read_slot_mmap_lock() returns non-zero if the slot
2480+ * has been removed by uksm_remove_vma().
2481+ */
2482+ if (try_down_read_slot_mmap_lock(slot))
2483+ return -EBUSY;
2484+
2485+ mm = slot->vma->vm_mm;
2486+
2487+ if (uksm_test_exit(mm))
2488+ goto failout_up;
2489+
2490+ page = item->page;
2491+ rcu_read_lock();
2492+ if (!get_page_unless_zero(page)) {
2493+ rcu_read_unlock();
2494+ goto failout_up;
2495+ }
2496+
2497+ /* No need to consider huge page here. */
2498+ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
2499+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
2500+ /*
2501+ * TODO:
2502+ * should we release this item because of its stale page
2503+ * mapping?
2504+ */
2505+ put_page(page);
2506+ rcu_read_unlock();
2507+ goto failout_up;
2508+ }
2509+ rcu_read_unlock();
2510+ return 0;
2511+
2512+failout_up:
2513+ mmap_read_unlock(mm);
2514+ return err;
2515+}
2516+
2517+/*
2518+ * What kind of VMA is considered?
2519+ */
2520+static inline int vma_can_enter(struct vm_area_struct *vma)
2521+{
2522+ return uksm_flags_can_scan(vma->vm_flags);
2523+}
2524+
2525+/*
2526+ * Called whenever a fresh new vma is created. A new vma_slot
2527+ * is created and inserted into a global list. Must be called
2528+ * after the vma is inserted into its mm.
2529+ */
2530+void uksm_vma_add_new(struct vm_area_struct *vma)
2531+{
2532+ struct vma_slot *slot;
2533+
2534+ if (!vma_can_enter(vma)) {
2535+ vma->uksm_vma_slot = NULL;
2536+ return;
2537+ }
2538+
2539+ slot = alloc_vma_slot();
2540+ if (!slot) {
2541+ vma->uksm_vma_slot = NULL;
2542+ return;
2543+ }
2544+
2545+ vma->uksm_vma_slot = slot;
2546+ vma->vm_flags |= VM_MERGEABLE;
2547+ slot->vma = vma;
2548+ slot->mm = vma->vm_mm;
2549+ slot->ctime_j = jiffies;
2550+ slot->pages = vma_pages(vma);
2551+ spin_lock(&vma_slot_list_lock);
2552+ list_add_tail(&slot->slot_list, &vma_slot_new);
2553+ spin_unlock(&vma_slot_list_lock);
2554+}
2555+
2556+/* 32/3 < they < 32/2 */
2557+#define shiftl 8
2558+#define shiftr 12
2559+
2560+#define HASH_FROM_TO(from, to) \
2561+for (index = from; index < to; index++) { \
2562+ pos = random_nums[index]; \
2563+ hash += key[pos]; \
2564+ hash += (hash << shiftl); \
2565+ hash ^= (hash >> shiftr); \
2566+}
2567+
2568+
2569+#define HASH_FROM_DOWN_TO(from, to) \
2570+for (index = from - 1; index >= to; index--) { \
2571+ hash ^= (hash >> shiftr); \
2572+ hash ^= (hash >> (shiftr*2)); \
2573+ hash -= (hash << shiftl); \
2574+ hash += (hash << (shiftl*2)); \
2575+ pos = random_nums[index]; \
2576+ hash -= key[pos]; \
2577+}
2578+
2579+/*
2580+ * The main random sample hash function.
2581+ */
2582+static u32 random_sample_hash(void *addr, u32 hash_strength)
2583+{
2584+ u32 hash = 0xdeadbeef;
2585+ int index, pos, loop = hash_strength;
2586+ u32 *key = (u32 *)addr;
2587+
2588+ if (loop > HASH_STRENGTH_FULL)
2589+ loop = HASH_STRENGTH_FULL;
2590+
2591+ HASH_FROM_TO(0, loop);
2592+
2593+ if (hash_strength > HASH_STRENGTH_FULL) {
2594+ loop = hash_strength - HASH_STRENGTH_FULL;
2595+ HASH_FROM_TO(0, loop);
2596+ }
2597+
2598+ return hash;
2599+}
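
To see what random_sample_hash() and the HASH_FROM_TO step do, here is a toy userspace re-implementation over a 16-word buffer (not the kernel code). It also shows the property that delta_hash() below relies on when the strength grows: continuing the mix from an existing weak hash gives the same result as hashing at the higher strength from scratch. The offsets are left in order here; the kernel uses a shuffled random_nums[] array.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define WORDS 16 /* a toy "page" of 16 u32 words instead of PAGE_SIZE / 4 */
    #define SHIFTL 8
    #define SHIFTR 12

    /* same mixing step as HASH_FROM_TO above, over a toy-sized buffer */
    static uint32_t sample_hash(uint32_t hash, const uint32_t *key,
                                const int *offsets, int from, int to)
    {
        for (int i = from; i < to; i++) {
            int pos = offsets[i];

            hash += key[pos];
            hash += hash << SHIFTL;
            hash ^= hash >> SHIFTR;
        }
        return hash;
    }

    int main(void)
    {
        uint32_t page[WORDS];
        int offsets[WORDS];

        for (int i = 0; i < WORDS; i++) {
            page[i] = (uint32_t)rand();
            offsets[i] = i; /* the kernel shuffles these offsets randomly */
        }

        uint32_t weak = sample_hash(0xdeadbeef, page, offsets, 0, 4);
        uint32_t full = sample_hash(0xdeadbeef, page, offsets, 0, WORDS);

        /* extending the weak hash from strength 4 to 16 ("delta" hashing up)
         * gives the same result as hashing at strength 16 directly */
        uint32_t delta = sample_hash(weak, page, offsets, 4, WORDS);

        printf("weak=%08x full=%08x delta=%08x\n",
               (unsigned)weak, (unsigned)full, (unsigned)delta);
        return 0;
    }
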
2600+
2601+
2602+/**
2603+ * Used when the hash strength is adjusted.
2604+ *
2605+ * @addr The page's virtual address
2606+ * @from The original hash strength
2607+ * @to The hash strength changed to
2608+ * @hash The hash value generated at the "from" hash strength
2609+ *
2610+ * return the new hash value
2611+ */
2612+static u32 delta_hash(void *addr, int from, int to, u32 hash)
2613+{
2614+ u32 *key = (u32 *)addr;
2615+ int index, pos; /* make sure they are int type */
2616+
2617+ if (to > from) {
2618+ if (from >= HASH_STRENGTH_FULL) {
2619+ from -= HASH_STRENGTH_FULL;
2620+ to -= HASH_STRENGTH_FULL;
2621+ HASH_FROM_TO(from, to);
2622+ } else if (to <= HASH_STRENGTH_FULL) {
2623+ HASH_FROM_TO(from, to);
2624+ } else {
2625+ HASH_FROM_TO(from, HASH_STRENGTH_FULL);
2626+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
2627+ }
2628+ } else {
2629+ if (from <= HASH_STRENGTH_FULL) {
2630+ HASH_FROM_DOWN_TO(from, to);
2631+ } else if (to >= HASH_STRENGTH_FULL) {
2632+ from -= HASH_STRENGTH_FULL;
2633+ to -= HASH_STRENGTH_FULL;
2634+ HASH_FROM_DOWN_TO(from, to);
2635+ } else {
2636+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
2637+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
2638+ }
2639+ }
2640+
2641+ return hash;
2642+}
2643+
2644+/**
2645+ *
2646+ * Called when rshash_pos or rshash_neg is about to overflow, or when a scan
2647+ * round has finished.
2648+ *
2649+ * return 0 if no page has been scanned since the last call, 1 otherwise.
2650+ */
2651+static inline int encode_benefit(void)
2652+{
2653+ u64 scanned_delta, pos_delta, neg_delta;
2654+ unsigned long base = benefit.base;
2655+
2656+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
2657+
2658+ if (!scanned_delta)
2659+ return 0;
2660+
2661+ scanned_delta >>= base;
2662+ pos_delta = rshash_pos >> base;
2663+ neg_delta = rshash_neg >> base;
2664+
2665+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
2666+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
2667+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
2668+ benefit.scanned >>= 1;
2669+ benefit.neg >>= 1;
2670+ benefit.pos >>= 1;
2671+ benefit.base++;
2672+ scanned_delta >>= 1;
2673+ pos_delta >>= 1;
2674+ neg_delta >>= 1;
2675+ }
2676+
2677+ benefit.pos += pos_delta;
2678+ benefit.neg += neg_delta;
2679+ benefit.scanned += scanned_delta;
2680+
2681+ BUG_ON(!benefit.scanned);
2682+
2683+ rshash_pos = rshash_neg = 0;
2684+ uksm_pages_scanned_last = uksm_pages_scanned;
2685+
2686+ return 1;
2687+}
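
The base-shift in encode_benefit() keeps only the ratio information: when any accumulator is close to overflow, pos, neg and scanned are all halved and base is incremented, which leaves the ratio pos/(pos+neg) unchanged while keeping the numbers in range. A toy userspace illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t pos = 6000, neg = 2000, scanned = 10000;
        unsigned long base = 0;

        double before = (double)pos / (double)(pos + neg);

        /* pretend an accumulator is about to overflow: halve everything and
         * bump the base, as encode_benefit() does */
        pos >>= 1;
        neg >>= 1;
        scanned >>= 1;
        base++;

        double after = (double)pos / (double)(pos + neg);

        printf("base=%lu ratio before=%.3f after=%.3f scanned=%llu\n",
               base, before, after, (unsigned long long)scanned);
        return 0;
    }
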
2688+
2689+static inline void reset_benefit(void)
2690+{
2691+ benefit.pos = 0;
2692+ benefit.neg = 0;
2693+ benefit.base = 0;
2694+ benefit.scanned = 0;
2695+}
2696+
2697+static inline void inc_rshash_pos(unsigned long delta)
2698+{
2699+ if (CAN_OVERFLOW_U64(rshash_pos, delta))
2700+ encode_benefit();
2701+
2702+ rshash_pos += delta;
2703+}
2704+
2705+static inline void inc_rshash_neg(unsigned long delta)
2706+{
2707+ if (CAN_OVERFLOW_U64(rshash_neg, delta))
2708+ encode_benefit();
2709+
2710+ rshash_neg += delta;
2711+}
2712+
2713+
2714+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
2715+ int cost_accounting)
2716+{
2717+ u32 val;
2718+ unsigned long delta;
2719+
2720+ void *addr = kmap_atomic(page);
2721+
2722+ val = random_sample_hash(addr, hash_strength);
2723+ kunmap_atomic(addr);
2724+
2725+ if (cost_accounting) {
2726+ if (hash_strength < HASH_STRENGTH_FULL)
2727+ delta = HASH_STRENGTH_FULL - hash_strength;
2728+ else
2729+ delta = 0;
2730+
2731+ inc_rshash_pos(delta);
2732+ }
2733+
2734+ return val;
2735+}
2736+
2737+static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
2738+ int cost_accounting)
2739+{
2740+ char *addr1, *addr2;
2741+ int ret;
2742+
2743+ addr1 = kmap_atomic(page1);
2744+ addr2 = kmap_atomic(page2);
2745+ ret = memcmp(addr1, addr2, PAGE_SIZE);
2746+ kunmap_atomic(addr2);
2747+ kunmap_atomic(addr1);
2748+
2749+ if (cost_accounting)
2750+ inc_rshash_neg(memcmp_cost);
2751+
2752+ return ret;
2753+}
2754+
2755+static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
2756+{
2757+ return !memcmp_pages_with_cost(page1, page2, 0);
2758+}
2759+
2760+static inline int is_page_full_zero(struct page *page)
2761+{
2762+ char *addr;
2763+ int ret;
2764+
2765+ addr = kmap_atomic(page);
2766+ ret = is_full_zero(addr, PAGE_SIZE);
2767+ kunmap_atomic(addr);
2768+
2769+ return ret;
2770+}
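
is_page_full_zero() relies on a helper that scans the whole page for any nonzero bit. A userspace stand-in for that check (buf_is_full_zero() is a sketch, not the kernel's is_full_zero()):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* word-wise scan for any nonzero bit in a buffer */
    static bool buf_is_full_zero(const void *addr, size_t len)
    {
        const unsigned long *p = addr;
        size_t words = len / sizeof(*p);

        for (size_t i = 0; i < words; i++)
            if (p[i])
                return false;
        return true;
    }

    int main(void)
    {
        static unsigned long page[4096 / sizeof(unsigned long)];

        printf("%d\n", buf_is_full_zero(page, sizeof(page))); /* 1: all zero */
        ((unsigned char *)page)[100] = 7;
        printf("%d\n", buf_is_full_zero(page, sizeof(page))); /* 0 */
        return 0;
    }
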
2771+
2772+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
2773+ pte_t *orig_pte, pte_t *old_pte)
2774+{
2775+ struct mm_struct *mm = vma->vm_mm;
2776+ struct page_vma_mapped_walk pvmw = {
2777+ .page = page,
2778+ .vma = vma,
2779+ };
2780+ struct mmu_notifier_range range;
2781+ int swapped;
2782+ int err = -EFAULT;
2783+
2784+ pvmw.address = page_address_in_vma(page, vma);
2785+ if (pvmw.address == -EFAULT)
2786+ goto out;
2787+
2788+ BUG_ON(PageTransCompound(page));
2789+
2790+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
2791+ pvmw.address + PAGE_SIZE);
2792+ mmu_notifier_invalidate_range_start(&range);
2793+
2794+ if (!page_vma_mapped_walk(&pvmw))
2795+ goto out_mn;
2796+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
2797+ goto out_unlock;
2798+
2799+ if (old_pte)
2800+ *old_pte = *pvmw.pte;
2801+
2802+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
2803+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
2804+ pte_t entry;
2805+
2806+ swapped = PageSwapCache(page);
2807+ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
2808+ /*
2809+ * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
2810+ * take any lock, therefore the check that we are going to make
2811+ * with the page count against the map count is racy and
2812+ * O_DIRECT can happen right after the check.
2813+ * So we clear the pte and flush the tlb before the check;
2814+ * this assures us that no O_DIRECT can happen after the check
2815+ * or in the middle of the check.
2816+ */
2817+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
2818+ /*
2819+ * Check that no O_DIRECT or similar I/O is in progress on the
2820+ * page
2821+ */
2822+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
2823+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
2824+ goto out_unlock;
2825+ }
2826+ if (pte_dirty(entry))
2827+ set_page_dirty(page);
2828+
2829+ if (pte_protnone(entry))
2830+ entry = pte_mkclean(pte_clear_savedwrite(entry));
2831+ else
2832+ entry = pte_mkclean(pte_wrprotect(entry));
2833+
2834+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
2835+ }
2836+ *orig_pte = *pvmw.pte;
2837+ err = 0;
2838+
2839+out_unlock:
2840+ page_vma_mapped_walk_done(&pvmw);
2841+out_mn:
2842+ mmu_notifier_invalidate_range_end(&range);
2843+out:
2844+ return err;
2845+}
2846+
2847+#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
2848+#define MERGE_ERR_COLLI 2 /* there is a collision */
2849+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
2850+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
2851+
2852+
2853+/**
2854+ * replace_page - replace page in vma by new ksm page
2855+ * @vma: vma that holds the pte pointing to page
2856+ * @page: the page we are replacing by kpage
2857+ * @kpage: the ksm page we replace page by
2858+ * @orig_pte: the original value of the pte
2859+ *
2860+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
2861+ */
2862+static int replace_page(struct vm_area_struct *vma, struct page *page,
2863+ struct page *kpage, pte_t orig_pte)
2864+{
2865+ struct mm_struct *mm = vma->vm_mm;
2866+ struct mmu_notifier_range range;
2867+ pgd_t *pgd;
2868+ p4d_t *p4d;
2869+ pud_t *pud;
2870+ pmd_t *pmd;
2871+ pte_t *ptep;
2872+ spinlock_t *ptl;
2873+ pte_t entry;
2874+
2875+ unsigned long addr;
2876+ int err = MERGE_ERR_PGERR;
2877+
2878+ addr = page_address_in_vma(page, vma);
2879+ if (addr == -EFAULT)
2880+ goto out;
2881+
2882+ pgd = pgd_offset(mm, addr);
2883+ if (!pgd_present(*pgd))
2884+ goto out;
2885+
2886+ p4d = p4d_offset(pgd, addr);
2887+ pud = pud_offset(p4d, addr);
2888+ if (!pud_present(*pud))
2889+ goto out;
2890+
2891+ pmd = pmd_offset(pud, addr);
2892+ BUG_ON(pmd_trans_huge(*pmd));
2893+ if (!pmd_present(*pmd))
2894+ goto out;
2895+
2896+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
2897+ addr + PAGE_SIZE);
2898+ mmu_notifier_invalidate_range_start(&range);
2899+
2900+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
2901+ if (!pte_same(*ptep, orig_pte)) {
2902+ pte_unmap_unlock(ptep, ptl);
2903+ goto out_mn;
2904+ }
2905+
2906+ flush_cache_page(vma, addr, pte_pfn(*ptep));
2907+ ptep_clear_flush_notify(vma, addr, ptep);
2908+ entry = mk_pte(kpage, vma->vm_page_prot);
2909+
2910+ /* special treatment is needed for zero_page */
2911+ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
2912+ (page_to_pfn(kpage) == zero_pfn)) {
2913+ entry = pte_mkspecial(entry);
2914+ dec_mm_counter(mm, MM_ANONPAGES);
2915+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
2916+ } else {
2917+ get_page(kpage);
2918+ page_add_anon_rmap(kpage, vma, addr, false);
2919+ }
2920+
2921+ set_pte_at_notify(mm, addr, ptep, entry);
2922+
2923+ page_remove_rmap(page, false);
2924+ if (!page_mapped(page))
2925+ try_to_free_swap(page);
2926+ put_page(page);
2927+
2928+ pte_unmap_unlock(ptep, ptl);
2929+ err = 0;
2930+out_mn:
2931+ mmu_notifier_invalidate_range_end(&range);
2932+out:
2933+ return err;
2934+}
2935+
2936+
2937+/**
2938+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
2939+ * A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
2940+ * hash_max member has not been calculated yet.
2941+ *
2942+ * @page The page to be hashed
2943+ * @hash_old The hash value calculated with current hash strength
2944+ *
2945+ * return the new hash value calculated at HASH_STRENGTH_MAX
2946+ */
2947+static inline u32 page_hash_max(struct page *page, u32 hash_old)
2948+{
2949+ u32 hash_max = 0;
2950+ void *addr;
2951+
2952+ addr = kmap_atomic(page);
2953+ hash_max = delta_hash(addr, hash_strength,
2954+ HASH_STRENGTH_MAX, hash_old);
2955+
2956+ kunmap_atomic(addr);
2957+
2958+ if (!hash_max)
2959+ hash_max = 1;
2960+
2961+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2962+ return hash_max;
2963+}
2964+
2965+/*
2966+ * We compare the hash again, to ensure that it is really a hash collision
2967+ * instead of being caused by page write.
2968+ */
2969+static inline int check_collision(struct rmap_item *rmap_item,
2970+ u32 hash)
2971+{
2972+ int err;
2973+ struct page *page = rmap_item->page;
2974+
2975+ /* if this rmap_item has already been hash_maxed, then the collision
2976+ * must have appeared in the second-level rbtree search. In this case we check
2977+ * if its hash_max value has been changed. Otherwise, the collision
2978+ * happened in the first-level rbtree search, so we check against its
2979+ * current hash value.
2980+ */
2981+ if (rmap_item->hash_max) {
2982+ inc_rshash_neg(memcmp_cost);
2983+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
2984+
2985+ if (rmap_item->hash_max == page_hash_max(page, hash))
2986+ err = MERGE_ERR_COLLI;
2987+ else
2988+ err = MERGE_ERR_CHANGED;
2989+ } else {
2990+ inc_rshash_neg(memcmp_cost + hash_strength);
2991+
2992+ if (page_hash(page, hash_strength, 0) == hash)
2993+ err = MERGE_ERR_COLLI;
2994+ else
2995+ err = MERGE_ERR_CHANGED;
2996+ }
2997+
2998+ return err;
2999+}
3000+
3001+/**
3002+ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
3003+ * already be a ksm page.
3004+ *
3005+ * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
3006+ */
3007+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
3008+ struct page *kpage, u32 hash)
3009+{
3010+ struct vm_area_struct *vma = rmap_item->slot->vma;
3011+ struct mm_struct *mm = vma->vm_mm;
3012+ pte_t orig_pte = __pte(0);
3013+ int err = MERGE_ERR_PGERR;
3014+ struct page *page;
3015+
3016+ if (uksm_test_exit(mm))
3017+ goto out;
3018+
3019+ page = rmap_item->page;
3020+
3021+ if (page == kpage) { /* ksm page forked */
3022+ err = 0;
3023+ goto out;
3024+ }
3025+
3026+ /*
3027+ * We need the page lock to read a stable PageSwapCache in
3028+ * write_protect_page(). We use trylock_page() instead of
3029+ * lock_page() because we don't want to wait here - we
3030+ * prefer to continue scanning and merging different pages,
3031+ * then come back to this page when it is unlocked.
3032+ */
3033+ if (!trylock_page(page))
3034+ goto out;
3035+
3036+ if (!PageAnon(page) || !PageKsm(kpage))
3037+ goto out_unlock;
3038+
3039+ if (PageTransCompound(page)) {
3040+ err = split_huge_page(page);
3041+ if (err)
3042+ goto out_unlock;
3043+ }
3044+
3045+ /*
3046+ * If this anonymous page is mapped only here, its pte may need
3047+ * to be write-protected. If it's mapped elsewhere, all of its
3048+ * ptes are necessarily already write-protected. But in either
3049+ * case, we need to lock and check page_count is not raised.
3050+ */
3051+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
3052+ if (pages_identical_with_cost(page, kpage))
3053+ err = replace_page(vma, page, kpage, orig_pte);
3054+ else
3055+ err = check_collision(rmap_item, hash);
3056+ }
3057+
3058+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
3059+ munlock_vma_page(page);
3060+ if (!PageMlocked(kpage)) {
3061+ unlock_page(page);
3062+ lock_page(kpage);
3063+ mlock_vma_page(kpage);
3064+ page = kpage; /* for final unlock */
3065+ }
3066+ }
3067+
3068+out_unlock:
3069+ unlock_page(page);
3070+out:
3071+ return err;
3072+}
3073+
3074+
3075+
3076+/**
3077+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
3078+ * to restore a page mapping that was changed in that function.
3079+ *
3080+ * @return 0 on success.
3081+ */
3082+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
3083+ pte_t orig_pte, pte_t wprt_pte)
3084+{
3085+ struct mm_struct *mm = vma->vm_mm;
3086+ pgd_t *pgd;
3087+ p4d_t *p4d;
3088+ pud_t *pud;
3089+ pmd_t *pmd;
3090+ pte_t *ptep;
3091+ spinlock_t *ptl;
3092+
3093+ int err = -EFAULT;
3094+
3095+ pgd = pgd_offset(mm, addr);
3096+ if (!pgd_present(*pgd))
3097+ goto out;
3098+
3099+ p4d = p4d_offset(pgd, addr);
3100+ pud = pud_offset(p4d, addr);
3101+ if (!pud_present(*pud))
3102+ goto out;
3103+
3104+ pmd = pmd_offset(pud, addr);
3105+ if (!pmd_present(*pmd))
3106+ goto out;
3107+
3108+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
3109+ if (!pte_same(*ptep, wprt_pte)) {
3110+ /* already copied, let it be */
3111+ pte_unmap_unlock(ptep, ptl);
3112+ goto out;
3113+ }
3114+
3115+ /*
3116+ * Good, still here. Since we can still get the ksm page, it has not
3117+ * returned to the free page pool, so there is no way a pte was changed
3118+ * to another page and then back to this page. Also remember that ksm
3119+ * pages are not reused in do_wp_page(). So it's safe to restore the
3120+ * original pte.
3121+ */
3122+ flush_cache_page(vma, addr, pte_pfn(*ptep));
3123+ ptep_clear_flush_notify(vma, addr, ptep);
3124+ set_pte_at_notify(mm, addr, ptep, orig_pte);
3125+
3126+ pte_unmap_unlock(ptep, ptl);
3127+ err = 0;
3128+out:
3129+ return err;
3130+}
3131+
3132+/**
3133+ * try_to_merge_two_pages() - take two identical pages and prepare
3134+ * them to be merged into one page (rmap_item->page)
3135+ *
3136+ * @return 0 if we successfully merged two identical pages into
3137+ * one ksm page. MERGE_ERR_COLLI if it was only a hash collision
3138+ * found during the rbtree search. MERGE_ERR_CHANGED if rmap_item has
3139+ * been changed since it was hashed. MERGE_ERR_PGERR otherwise.
3140+ *
3141+ */
3142+static int try_to_merge_two_pages(struct rmap_item *rmap_item,
3143+ struct rmap_item *tree_rmap_item,
3144+ u32 hash)
3145+{
3146+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
3147+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
3148+ struct vm_area_struct *vma1 = rmap_item->slot->vma;
3149+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
3150+ struct page *page = rmap_item->page;
3151+ struct page *tree_page = tree_rmap_item->page;
3152+ int err = MERGE_ERR_PGERR;
3153+ struct address_space *saved_mapping;
3154+
3155+
3156+ if (rmap_item->page == tree_rmap_item->page)
3157+ goto out;
3158+
3159+ if (!trylock_page(page))
3160+ goto out;
3161+
3162+ if (!PageAnon(page))
3163+ goto out_unlock;
3164+
3165+ if (PageTransCompound(page)) {
3166+ err = split_huge_page(page);
3167+ if (err)
3168+ goto out_unlock;
3169+ }
3170+
3171+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
3172+ unlock_page(page);
3173+ goto out;
3174+ }
3175+
3176+ /*
3177+ * While we hold page lock, upgrade page from
3178+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
3179+ * stable_tree_insert() will update stable_node.
3180+ */
3181+ saved_mapping = page->mapping;
3182+ set_page_stable_node(page, NULL);
3183+ mark_page_accessed(page);
3184+ if (!PageDirty(page))
3185+ SetPageDirty(page);
3186+
3187+ unlock_page(page);
3188+
3189+ if (!trylock_page(tree_page))
3190+ goto restore_out;
3191+
3192+ if (!PageAnon(tree_page)) {
3193+ unlock_page(tree_page);
3194+ goto restore_out;
3195+ }
3196+
3197+ if (PageTransCompound(tree_page)) {
3198+ err = split_huge_page(tree_page);
3199+ if (err) {
3200+ unlock_page(tree_page);
3201+ goto restore_out;
3202+ }
3203+ }
3204+
3205+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
3206+ unlock_page(tree_page);
3207+ goto restore_out;
3208+ }
3209+
3210+ if (pages_identical_with_cost(page, tree_page)) {
3211+ err = replace_page(vma2, tree_page, page, wprt_pte2);
3212+ if (err) {
3213+ unlock_page(tree_page);
3214+ goto restore_out;
3215+ }
3216+
3217+ if ((vma2->vm_flags & VM_LOCKED)) {
3218+ munlock_vma_page(tree_page);
3219+ if (!PageMlocked(page)) {
3220+ unlock_page(tree_page);
3221+ lock_page(page);
3222+ mlock_vma_page(page);
3223+ tree_page = page; /* for final unlock */
3224+ }
3225+ }
3226+
3227+ unlock_page(tree_page);
3228+
3229+ goto out; /* success */
3230+
3231+ } else {
3232+ if (tree_rmap_item->hash_max &&
3233+ tree_rmap_item->hash_max == rmap_item->hash_max) {
3234+ err = MERGE_ERR_COLLI_MAX;
3235+ } else if (page_hash(page, hash_strength, 0) ==
3236+ page_hash(tree_page, hash_strength, 0)) {
3237+ inc_rshash_neg(memcmp_cost + hash_strength * 2);
3238+ err = MERGE_ERR_COLLI;
3239+ } else {
3240+ err = MERGE_ERR_CHANGED;
3241+ }
3242+
3243+ unlock_page(tree_page);
3244+ }
3245+
3246+restore_out:
3247+ lock_page(page);
3248+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
3249+ orig_pte1, wprt_pte1))
3250+ page->mapping = saved_mapping;
3251+
3252+out_unlock:
3253+ unlock_page(page);
3254+out:
3255+ return err;
3256+}
3257+
3258+static inline int hash_cmp(u32 new_val, u32 node_val)
3259+{
3260+ if (new_val > node_val)
3261+ return 1;
3262+ else if (new_val < node_val)
3263+ return -1;
3264+ else
3265+ return 0;
3266+}
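
hash_cmp() is the comparator for both rbtree levels: tree_nodes are ordered by the sampled hash, and stable_nodes (or rmap_items) inside a tree_node by hash_max. Conceptually this is ordering by a two-part key, as the userspace sketch below shows (struct key2 and key2_cmp() are illustrative only, not kernel structures):

    #include <stdint.h>
    #include <stdio.h>

    struct key2 {
        uint32_t hash;     /* weak, sampled hash: first-level tree_node key */
        uint32_t hash_max; /* full-strength hash: second-level key */
    };

    static int hash_cmp(uint32_t a, uint32_t b)
    {
        return (a > b) - (a < b); /* 1, -1 or 0, like the kernel helper */
    }

    /* order first on the weak hash; fall back to the strong hash only when
     * the weak hashes collide -- mirroring the two rbtree levels */
    static int key2_cmp(const struct key2 *a, const struct key2 *b)
    {
        int c = hash_cmp(a->hash, b->hash);

        return c ? c : hash_cmp(a->hash_max, b->hash_max);
    }

    int main(void)
    {
        struct key2 a = { 10, 111 }, b = { 10, 222 };

        printf("%d\n", key2_cmp(&a, &b)); /* -1: same weak hash, smaller hash_max */
        return 0;
    }
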
3267+
3268+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
3269+{
3270+ u32 hash_max = item->hash_max;
3271+
3272+ if (!hash_max) {
3273+ hash_max = page_hash_max(item->page, hash);
3274+
3275+ item->hash_max = hash_max;
3276+ }
3277+
3278+ return hash_max;
3279+}
3280+
3281+
3282+
3283+/**
3284+ * stable_tree_search() - search the stable tree for a page
3285+ *
3286+ * @item: the rmap_item we are comparing with
3287+ * @hash: the hash value of this item->page already calculated
3288+ *
3289+ * @return the page we have found, NULL otherwise. The page returned has
3290+ * been gotten.
3291+ */
3292+static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
3293+{
3294+ struct rb_node *node = root_stable_treep->rb_node;
3295+ struct tree_node *tree_node;
3296+ unsigned long hash_max;
3297+ struct page *page = item->page;
3298+ struct stable_node *stable_node;
3299+
3300+ stable_node = page_stable_node(page);
3301+ if (stable_node) {
3302+ /* ksm page forked, that is
3303+ * if (PageKsm(page) && !in_stable_tree(rmap_item))
3304+ * it's actually gotten once outside.
3305+ */
3306+ get_page(page);
3307+ return page;
3308+ }
3309+
3310+ while (node) {
3311+ int cmp;
3312+
3313+ tree_node = rb_entry(node, struct tree_node, node);
3314+
3315+ cmp = hash_cmp(hash, tree_node->hash);
3316+
3317+ if (cmp < 0)
3318+ node = node->rb_left;
3319+ else if (cmp > 0)
3320+ node = node->rb_right;
3321+ else
3322+ break;
3323+ }
3324+
3325+ if (!node)
3326+ return NULL;
3327+
3328+ if (tree_node->count == 1) {
3329+ stable_node = rb_entry(tree_node->sub_root.rb_node,
3330+ struct stable_node, node);
3331+ BUG_ON(!stable_node);
3332+
3333+ goto get_page_out;
3334+ }
3335+
3336+ /*
3337+ * ok, we have to search the second
3338+ * level subtree, hash the page at
3339+ * full strength.
3340+ */
3341+ node = tree_node->sub_root.rb_node;
3342+ BUG_ON(!node);
3343+ hash_max = rmap_item_hash_max(item, hash);
3344+
3345+ while (node) {
3346+ int cmp;
3347+
3348+ stable_node = rb_entry(node, struct stable_node, node);
3349+
3350+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3351+
3352+ if (cmp < 0)
3353+ node = node->rb_left;
3354+ else if (cmp > 0)
3355+ node = node->rb_right;
3356+ else
3357+ goto get_page_out;
3358+ }
3359+
3360+ return NULL;
3361+
3362+get_page_out:
3363+ page = get_uksm_page(stable_node, 1, 1);
3364+ return page;
3365+}
3366+
3367+static int try_merge_rmap_item(struct rmap_item *item,
3368+ struct page *kpage,
3369+ struct page *tree_page)
3370+{
3371+ struct vm_area_struct *vma = item->slot->vma;
3372+ struct page_vma_mapped_walk pvmw = {
3373+ .page = kpage,
3374+ .vma = vma,
3375+ };
3376+
3377+ pvmw.address = get_rmap_addr(item);
3378+ if (!page_vma_mapped_walk(&pvmw))
3379+ return 0;
3380+
3381+ if (pte_write(*pvmw.pte)) {
3382+ /* has changed, abort! */
3383+ page_vma_mapped_walk_done(&pvmw);
3384+ return 0;
3385+ }
3386+
3387+ get_page(tree_page);
3388+ page_add_anon_rmap(tree_page, vma, pvmw.address, false);
3389+
3390+ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
3391+ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
3392+ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
3393+ mk_pte(tree_page, vma->vm_page_prot));
3394+
3395+ page_remove_rmap(kpage, false);
3396+ put_page(kpage);
3397+
3398+ page_vma_mapped_walk_done(&pvmw);
3399+
3400+ return 1;
3401+}
3402+
3403+/**
3404+ * try_merge_with_stable() - when two rmap_items need to be inserted
3405+ * into the stable tree and the page was found to be identical to a stable
3406+ * ksm page, this is the last chance to merge them into one.
3407+ *
3408+ * @item1: the rmap_item holding the page which we wanted to insert
3409+ * into the stable tree.
3410+ * @item2: the other rmap_item we found during the unstable tree search
3411+ * @kpage: the page currently mapped by the two rmap_items
3412+ * @tree_page: the page we found identical in the stable tree node
3413+ * @success1: return if item1 is successfully merged
3414+ * @success2: return if item2 is successfully merged
3415+ */
3416+static void try_merge_with_stable(struct rmap_item *item1,
3417+ struct rmap_item *item2,
3418+ struct page **kpage,
3419+ struct page *tree_page,
3420+ int *success1, int *success2)
3421+{
3422+ struct vm_area_struct *vma1 = item1->slot->vma;
3423+ struct vm_area_struct *vma2 = item2->slot->vma;
3424+ *success1 = 0;
3425+ *success2 = 0;
3426+
3427+ if (unlikely(*kpage == tree_page)) {
3428+ /* I don't think this can really happen */
3429+ pr_warn("UKSM: unexpected condition detected in "
3430+ "%s -- *kpage == tree_page !\n", __func__);
3431+ *success1 = 1;
3432+ *success2 = 1;
3433+ return;
3434+ }
3435+
3436+ if (!PageAnon(*kpage) || !PageKsm(*kpage))
3437+ goto failed;
3438+
3439+ if (!trylock_page(tree_page))
3440+ goto failed;
3441+
3442+ /* If the oldpage is still ksm and still pointed
3443+ * to in the right place, and still write protected,
3444+ * we are confident it has not changed; no need to
3445+ * memcmp anymore.
3446+ * Beware: we cannot take nested pte locks,
3447+ * deadlock risk.
3448+ */
3449+ if (!try_merge_rmap_item(item1, *kpage, tree_page))
3450+ goto unlock_failed;
3451+
3452+ /* ok, then vma2, remind that pte1 already set */
3453+ if (!try_merge_rmap_item(item2, *kpage, tree_page))
3454+ goto success_1;
3455+
3456+ *success2 = 1;
3457+success_1:
3458+ *success1 = 1;
3459+
3460+
3461+ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
3462+ (*success2 && vma2->vm_flags & VM_LOCKED)) {
3463+ munlock_vma_page(*kpage);
3464+ if (!PageMlocked(tree_page))
3465+ mlock_vma_page(tree_page);
3466+ }
3467+
3468+ /*
3469+ * We do not need oldpage any more in the caller, so can break the lock
3470+ * now.
3471+ */
3472+ unlock_page(*kpage);
3473+ *kpage = tree_page; /* Get unlocked outside. */
3474+ return;
3475+
3476+unlock_failed:
3477+ unlock_page(tree_page);
3478+failed:
3479+ return;
3480+}
3481+
3482+static inline void stable_node_hash_max(struct stable_node *node,
3483+ struct page *page, u32 hash)
3484+{
3485+ u32 hash_max = node->hash_max;
3486+
3487+ if (!hash_max) {
3488+ hash_max = page_hash_max(page, hash);
3489+ node->hash_max = hash_max;
3490+ }
3491+}
3492+
3493+static inline
3494+struct stable_node *new_stable_node(struct tree_node *tree_node,
3495+ struct page *kpage, u32 hash_max)
3496+{
3497+ struct stable_node *new_stable_node;
3498+
3499+ new_stable_node = alloc_stable_node();
3500+ if (!new_stable_node)
3501+ return NULL;
3502+
3503+ new_stable_node->kpfn = page_to_pfn(kpage);
3504+ new_stable_node->hash_max = hash_max;
3505+ new_stable_node->tree_node = tree_node;
3506+ set_page_stable_node(kpage, new_stable_node);
3507+
3508+ return new_stable_node;
3509+}
3510+
3511+static inline
3512+struct stable_node *first_level_insert(struct tree_node *tree_node,
3513+ struct rmap_item *rmap_item,
3514+ struct rmap_item *tree_rmap_item,
3515+ struct page **kpage, u32 hash,
3516+ int *success1, int *success2)
3517+{
3518+ int cmp;
3519+ struct page *tree_page;
3520+ u32 hash_max = 0;
3521+ struct stable_node *stable_node, *new_snode;
3522+ struct rb_node *parent = NULL, **new;
3523+
3524+ /* this tree node contains no sub-tree yet */
3525+ stable_node = rb_entry(tree_node->sub_root.rb_node,
3526+ struct stable_node, node);
3527+
3528+ tree_page = get_uksm_page(stable_node, 1, 0);
3529+ if (tree_page) {
3530+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3531+ if (!cmp) {
3532+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
3533+ tree_page, success1, success2);
3534+ put_page(tree_page);
3535+ if (!*success1 && !*success2)
3536+ goto failed;
3537+
3538+ return stable_node;
3539+
3540+ } else {
3541+ /*
3542+ * Collision at the first level; try to create a subtree.
3543+ * A new node needs to be created.
3544+ */
3545+ put_page(tree_page);
3546+
3547+ stable_node_hash_max(stable_node, tree_page,
3548+ tree_node->hash);
3549+ hash_max = rmap_item_hash_max(rmap_item, hash);
3550+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3551+
3552+ parent = &stable_node->node;
3553+ if (cmp < 0)
3554+ new = &parent->rb_left;
3555+ else if (cmp > 0)
3556+ new = &parent->rb_right;
3557+ else
3558+ goto failed;
3559+ }
3560+
3561+ } else {
3562+ /* the only stable_node was deleted; we reuse its tree_node.
3563+ */
3564+ parent = NULL;
3565+ new = &tree_node->sub_root.rb_node;
3566+ }
3567+
3568+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
3569+ if (!new_snode)
3570+ goto failed;
3571+
3572+ rb_link_node(&new_snode->node, parent, new);
3573+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
3574+ tree_node->count++;
3575+ *success1 = *success2 = 1;
3576+
3577+ return new_snode;
3578+
3579+failed:
3580+ return NULL;
3581+}
3582+
3583+static inline
3584+struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
3585+ struct rmap_item *rmap_item,
3586+ struct rmap_item *tree_rmap_item,
3587+ struct page **kpage, u32 hash,
3588+ int *success1, int *success2)
3589+{
3590+ struct page *tree_page;
3591+ u32 hash_max;
3592+ struct stable_node *stable_node, *new_snode;
3593+ struct rb_node *parent, **new;
3594+
3595+research:
3596+ parent = NULL;
3597+ new = &tree_node->sub_root.rb_node;
3598+ BUG_ON(!*new);
3599+ hash_max = rmap_item_hash_max(rmap_item, hash);
3600+ while (*new) {
3601+ int cmp;
3602+
3603+ stable_node = rb_entry(*new, struct stable_node, node);
3604+
3605+ cmp = hash_cmp(hash_max, stable_node->hash_max);
3606+
3607+ if (cmp < 0) {
3608+ parent = *new;
3609+ new = &parent->rb_left;
3610+ } else if (cmp > 0) {
3611+ parent = *new;
3612+ new = &parent->rb_right;
3613+ } else {
3614+ tree_page = get_uksm_page(stable_node, 1, 0);
3615+ if (tree_page) {
3616+ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
3617+ if (!cmp) {
3618+ try_merge_with_stable(rmap_item,
3619+ tree_rmap_item, kpage,
3620+ tree_page, success1, success2);
3621+
3622+ put_page(tree_page);
3623+ if (!*success1 && !*success2)
3624+ goto failed;
3625+ /*
3626+ * successfully merged with a stable
3627+ * node
3628+ */
3629+ return stable_node;
3630+ } else {
3631+ put_page(tree_page);
3632+ goto failed;
3633+ }
3634+ } else {
3635+ /*
3636+ * the stable node may have been
3637+ * deleted and the subtree
3638+ * restructured; cannot
3639+ * continue, re-search it.
3640+ */
3641+ if (tree_node->count) {
3642+ goto research;
3643+ } else {
3644+ /* reuse the tree node*/
3645+ parent = NULL;
3646+ new = &tree_node->sub_root.rb_node;
3647+ }
3648+ }
3649+ }
3650+ }
3651+
3652+ new_snode = new_stable_node(tree_node, *kpage, hash_max);
3653+ if (!new_snode)
3654+ goto failed;
3655+
3656+ rb_link_node(&new_snode->node, parent, new);
3657+ rb_insert_color(&new_snode->node, &tree_node->sub_root);
3658+ tree_node->count++;
3659+ *success1 = *success2 = 1;
3660+
3661+ return new_snode;
3662+
3663+failed:
3664+ return NULL;
3665+}
3666+
3667+
3668+/**
3669+ * stable_tree_insert() - try to insert a page merged in the unstable tree
3670+ * into the stable tree
3671+ *
3672+ * @kpage: the page to be inserted
3673+ * @hash: the current hash of this page
3674+ * @rmap_item: the rmap_item being scanned
3675+ * @tree_rmap_item: the rmap_item found on unstable tree
3676+ * @success1: return if rmap_item is merged
3677+ * @success2: return if tree_rmap_item is merged
3678+ *
3679+ * @return the stable_node on stable tree if at least one
3680+ * rmap_item is inserted into stable tree, NULL
3681+ * otherwise.
3682+ */
3683+static struct stable_node *
3684+stable_tree_insert(struct page **kpage, u32 hash,
3685+ struct rmap_item *rmap_item,
3686+ struct rmap_item *tree_rmap_item,
3687+ int *success1, int *success2)
3688+{
3689+ struct rb_node **new = &root_stable_treep->rb_node;
3690+ struct rb_node *parent = NULL;
3691+ struct stable_node *stable_node;
3692+ struct tree_node *tree_node;
3693+ u32 hash_max = 0;
3694+
3695+ *success1 = *success2 = 0;
3696+
3697+ while (*new) {
3698+ int cmp;
3699+
3700+ tree_node = rb_entry(*new, struct tree_node, node);
3701+
3702+ cmp = hash_cmp(hash, tree_node->hash);
3703+
3704+ if (cmp < 0) {
3705+ parent = *new;
3706+ new = &parent->rb_left;
3707+ } else if (cmp > 0) {
3708+ parent = *new;
3709+ new = &parent->rb_right;
3710+ } else
3711+ break;
3712+ }
3713+
3714+ if (*new) {
3715+ if (tree_node->count == 1) {
3716+ stable_node = first_level_insert(tree_node, rmap_item,
3717+ tree_rmap_item, kpage,
3718+ hash, success1, success2);
3719+ } else {
3720+ stable_node = stable_subtree_insert(tree_node,
3721+ rmap_item, tree_rmap_item, kpage,
3722+ hash, success1, success2);
3723+ }
3724+ } else {
3725+
3726+ /* no tree node found */
3727+ tree_node = alloc_tree_node(stable_tree_node_listp);
3728+ if (!tree_node) {
3729+ stable_node = NULL;
3730+ goto out;
3731+ }
3732+
3733+ stable_node = new_stable_node(tree_node, *kpage, hash_max);
3734+ if (!stable_node) {
3735+ free_tree_node(tree_node);
3736+ goto out;
3737+ }
3738+
3739+ tree_node->hash = hash;
3740+ rb_link_node(&tree_node->node, parent, new);
3741+ rb_insert_color(&tree_node->node, root_stable_treep);
3742+ parent = NULL;
3743+ new = &tree_node->sub_root.rb_node;
3744+
3745+ rb_link_node(&stable_node->node, parent, new);
3746+ rb_insert_color(&stable_node->node, &tree_node->sub_root);
3747+ tree_node->count++;
3748+ *success1 = *success2 = 1;
3749+ }
3750+
3751+out:
3752+ return stable_node;
3753+}
3754+
3755+
3756+/**
3757+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_lock
3758+ *
3759+ * @return 0 on success, -EBUSY if unable to lock the mmap_lock,
3760+ * -EINVAL if the page mapping has been changed.
3761+ */
3762+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
3763+{
3764+ int err;
3765+
3766+ err = get_mergeable_page_lock_mmap(tree_rmap_item);
3767+
3768+ if (err == -EINVAL) {
3769+ /* its page map has been changed, remove it */
3770+ remove_rmap_item_from_tree(tree_rmap_item);
3771+ }
3772+
3773+ /* On success, the page has been gotten and the mmap_lock is locked. */
3774+ return err;
3775+}
3776+
3777+
3778+/**
3779+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the
3780+ * same hash value. Get its page and trylock the mmap_lock
3781+ */
3782+static inline
3783+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
3784+ u32 hash)
3785+
3786+{
3787+ struct rb_node **new = &root_unstable_tree.rb_node;
3788+ struct rb_node *parent = NULL;
3789+ struct tree_node *tree_node;
3790+ u32 hash_max;
3791+ struct rmap_item *tree_rmap_item;
3792+
3793+ while (*new) {
3794+ int cmp;
3795+
3796+ tree_node = rb_entry(*new, struct tree_node, node);
3797+
3798+ cmp = hash_cmp(hash, tree_node->hash);
3799+
3800+ if (cmp < 0) {
3801+ parent = *new;
3802+ new = &parent->rb_left;
3803+ } else if (cmp > 0) {
3804+ parent = *new;
3805+ new = &parent->rb_right;
3806+ } else
3807+ break;
3808+ }
3809+
3810+ if (*new) {
3811+ /* got the tree_node */
3812+ if (tree_node->count == 1) {
3813+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
3814+ struct rmap_item, node);
3815+ BUG_ON(!tree_rmap_item);
3816+
3817+ goto get_page_out;
3818+ }
3819+
3820+ /* well, search the collision subtree */
3821+ new = &tree_node->sub_root.rb_node;
3822+ BUG_ON(!*new);
3823+ hash_max = rmap_item_hash_max(rmap_item, hash);
3824+
3825+ while (*new) {
3826+ int cmp;
3827+
3828+ tree_rmap_item = rb_entry(*new, struct rmap_item,
3829+ node);
3830+
3831+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
3832+ parent = *new;
3833+ if (cmp < 0)
3834+ new = &parent->rb_left;
3835+ else if (cmp > 0)
3836+ new = &parent->rb_right;
3837+ else
3838+ goto get_page_out;
3839+ }
3840+ } else {
3841+ /* alloc a new tree_node */
3842+ tree_node = alloc_tree_node(&unstable_tree_node_list);
3843+ if (!tree_node)
3844+ return NULL;
3845+
3846+ tree_node->hash = hash;
3847+ rb_link_node(&tree_node->node, parent, new);
3848+ rb_insert_color(&tree_node->node, &root_unstable_tree);
3849+ parent = NULL;
3850+ new = &tree_node->sub_root.rb_node;
3851+ }
3852+
3853+ /* not found even in the sub-tree */
3854+ rmap_item->tree_node = tree_node;
3855+ rmap_item->address |= UNSTABLE_FLAG;
3856+ rmap_item->hash_round = uksm_hash_round;
3857+ rb_link_node(&rmap_item->node, parent, new);
3858+ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
3859+
3860+ uksm_pages_unshared++;
3861+ return NULL;
3862+
3863+get_page_out:
3864+ if (tree_rmap_item->page == rmap_item->page)
3865+ return NULL;
3866+
3867+ if (get_tree_rmap_item_page(tree_rmap_item))
3868+ return NULL;
3869+
3870+ return tree_rmap_item;
3871+}
3872+
3873+static void hold_anon_vma(struct rmap_item *rmap_item,
3874+ struct anon_vma *anon_vma)
3875+{
3876+ rmap_item->anon_vma = anon_vma;
3877+ get_anon_vma(anon_vma);
3878+}
3879+
3880+
3881+/**
3882+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
3883+ * ratio statistics are collected in this function.
3884+ *
3885+ */
3886+static void stable_tree_append(struct rmap_item *rmap_item,
3887+ struct stable_node *stable_node, int logdedup)
3888+{
3889+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
3890+ unsigned long key = (unsigned long)rmap_item->slot;
3891+ unsigned long factor = rmap_item->slot->rung->step;
3892+
3893+ BUG_ON(!stable_node);
3894+ rmap_item->address |= STABLE_FLAG;
3895+
3896+ if (hlist_empty(&stable_node->hlist)) {
3897+ uksm_pages_shared++;
3898+ goto node_vma_new;
3899+ } else {
3900+ uksm_pages_sharing++;
3901+ }
3902+
3903+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
3904+ if (node_vma->key >= key)
3905+ break;
3906+
3907+ if (logdedup) {
3908+ node_vma->slot->pages_bemerged += factor;
3909+ if (list_empty(&node_vma->slot->dedup_list))
3910+ list_add(&node_vma->slot->dedup_list,
3911+ &vma_slot_dedup);
3912+ }
3913+ }
3914+
3915+ if (node_vma) {
3916+ if (node_vma->key == key) {
3917+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
3918+ goto node_vma_ok;
3919+ } else if (node_vma->key > key) {
3920+ node_vma_cont = node_vma;
3921+ }
3922+ }
3923+
3924+node_vma_new:
3925+	/* no node_vma for this vma in the node yet, alloc a new one */
3926+ new_node_vma = alloc_node_vma();
3927+ BUG_ON(!new_node_vma);
3928+ new_node_vma->head = stable_node;
3929+ new_node_vma->slot = rmap_item->slot;
3930+
3931+ if (!node_vma) {
3932+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
3933+ } else if (node_vma->key != key) {
3934+ if (node_vma->key < key)
3935+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
3936+ else {
3937+ hlist_add_before(&new_node_vma->hlist,
3938+ &node_vma->hlist);
3939+ }
3940+
3941+ }
3942+ node_vma = new_node_vma;
3943+
3944+node_vma_ok: /* ok, ready to add to the list */
3945+ rmap_item->head = node_vma;
3946+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
3947+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
3948+ if (logdedup) {
3949+ rmap_item->slot->pages_merged++;
3950+ if (node_vma_cont) {
3951+ node_vma = node_vma_cont;
3952+ hlist_for_each_entry_continue(node_vma, hlist) {
3953+ node_vma->slot->pages_bemerged += factor;
3954+ if (list_empty(&node_vma->slot->dedup_list))
3955+ list_add(&node_vma->slot->dedup_list,
3956+ &vma_slot_dedup);
3957+ }
3958+ }
3959+ }
3960+}
3961+
3962+/*
3963+ * We use break_ksm to break COW on a ksm page: it's a stripped down
3964+ *
3965+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
3966+ * put_page(page);
3967+ *
3968+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
3969+ * in case the application has unmapped and remapped mm,addr meanwhile.
3970+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
3971+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
3972+ */
3973+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
3974+{
3975+ struct page *page;
3976+ int ret = 0;
3977+
3978+ do {
3979+ cond_resched();
3980+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
3981+ if (IS_ERR_OR_NULL(page))
3982+ break;
3983+ if (PageKsm(page)) {
3984+ ret = handle_mm_fault(vma, addr,
3985+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
3986+ } else
3987+ ret = VM_FAULT_WRITE;
3988+ put_page(page);
3989+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
3990+ /*
3991+ * We must loop because handle_mm_fault() may back out if there's
3992+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
3993+ *
3994+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
3995+ * COW has been broken, even if the vma does not permit VM_WRITE;
3996+ * but note that a concurrent fault might break PageKsm for us.
3997+ *
3998+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
3999+ * backing file, which also invalidates anonymous pages: that's
4000+ * okay, that truncation will have unmapped the PageKsm for us.
4001+ *
4002+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
4003+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
4004+ * current task has TIF_MEMDIE set, and will be OOM killed on return
4005+ * to user; and ksmd, having no mm, would never be chosen for that.
4006+ *
4007+ * But if the mm is in a limited mem_cgroup, then the fault may fail
4008+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
4009+ * even ksmd can fail in this way - though it's usually breaking ksm
4010+ * just to undo a merge it made a moment before, so unlikely to oom.
4011+ *
4012+ * That's a pity: we might therefore have more kernel pages allocated
4013+ * than we're counting as nodes in the stable tree; but uksm_do_scan
4014+ * will retry to break_cow on each pass, so should recover the page
4015+ * in due course. The important thing is to not let VM_MERGEABLE
4016+ * be cleared while any such pages might remain in the area.
4017+ */
4018+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
4019+}
4020+
4021+static void break_cow(struct rmap_item *rmap_item)
4022+{
4023+ struct vm_area_struct *vma = rmap_item->slot->vma;
4024+ struct mm_struct *mm = vma->vm_mm;
4025+ unsigned long addr = get_rmap_addr(rmap_item);
4026+
4027+ if (uksm_test_exit(mm))
4028+ goto out;
4029+
4030+ break_ksm(vma, addr);
4031+out:
4032+ return;
4033+}
4034+
4035+/*
4036+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
4037+ * than check every pte of a given vma, the locking doesn't quite work for
4038+ * that - an rmap_item is assigned to the stable tree after inserting ksm
4039+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
4040+ * rmap_items from parent to child at fork time (so as not to waste time
4041+ * if exit comes before the next scan reaches it).
4042+ *
4043+ * Similarly, although we'd like to remove rmap_items (so updating counts
4044+ * and freeing memory) when unmerging an area, it's easier to leave that
4045+ * to the next pass of ksmd - consider, for example, how ksmd might be
4046+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
4047+ */
4048+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
4049+ unsigned long start, unsigned long end)
4050+{
4051+ unsigned long addr;
4052+ int err = 0;
4053+
4054+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
4055+ if (uksm_test_exit(vma->vm_mm))
4056+ break;
4057+ if (signal_pending(current))
4058+ err = -ERESTARTSYS;
4059+ else
4060+ err = break_ksm(vma, addr);
4061+ }
4062+ return err;
4063+}
4064+
4065+static inline void inc_uksm_pages_scanned(void)
4066+{
4067+ u64 delta;
4068+
4069+
4070+ if (uksm_pages_scanned == U64_MAX) {
4071+ encode_benefit();
4072+
4073+ delta = uksm_pages_scanned >> pages_scanned_base;
4074+
4075+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
4076+ pages_scanned_stored >>= 1;
4077+ delta >>= 1;
4078+ pages_scanned_base++;
4079+ }
4080+
4081+ pages_scanned_stored += delta;
4082+
4083+ uksm_pages_scanned = uksm_pages_scanned_last = 0;
4084+ }
4085+
4086+ uksm_pages_scanned++;
4087+}
4088+
4089+static inline int find_zero_page_hash(int strength, u32 hash)
4090+{
4091+ return (zero_hash_table[strength] == hash);
4092+}
4093+
4094+static
4095+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
4096+{
4097+ struct page *zero_page = empty_uksm_zero_page;
4098+ struct mm_struct *mm = vma->vm_mm;
4099+ pte_t orig_pte = __pte(0);
4100+ int err = -EFAULT;
4101+
4102+ if (uksm_test_exit(mm))
4103+ goto out;
4104+
4105+ if (!trylock_page(page))
4106+ goto out;
4107+
4108+ if (!PageAnon(page))
4109+ goto out_unlock;
4110+
4111+ if (PageTransCompound(page)) {
4112+ err = split_huge_page(page);
4113+ if (err)
4114+ goto out_unlock;
4115+ }
4116+
4117+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
4118+ if (is_page_full_zero(page))
4119+ err = replace_page(vma, page, zero_page, orig_pte);
4120+ }
4121+
4122+out_unlock:
4123+ unlock_page(page);
4124+out:
4125+ return err;
4126+}
4127+
4128+/*
4129+ * cmp_and_merge_page() - first see if page can be merged into the stable
4130+ * tree; if not, compare hash to previous and if it's the same, see if page
4131+ * tree; if not, see if it can be inserted into the unstable tree, or merged
4132+ * with a page already there and both then transferred to the stable tree.
4133+ *
4134+ * @rmap_item: reverse mapping item of the page we are searching an identical page for
4135+ * @hash: the hash value of this page
4136+ */
4137+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
4138+{
4139+ struct rmap_item *tree_rmap_item;
4140+ struct page *page;
4141+ struct page *kpage = NULL;
4142+ u32 hash_max;
4143+ int err;
4144+ unsigned int success1, success2;
4145+ struct stable_node *snode;
4146+ int cmp;
4147+ struct rb_node *parent = NULL, **new;
4148+
4149+ remove_rmap_item_from_tree(rmap_item);
4150+ page = rmap_item->page;
4151+
4152+ /* We first start with searching the page inside the stable tree */
4153+ kpage = stable_tree_search(rmap_item, hash);
4154+ if (kpage) {
4155+ err = try_to_merge_with_uksm_page(rmap_item, kpage,
4156+ hash);
4157+ if (!err) {
4158+ /*
4159+ * The page was successfully merged, add
4160+ * its rmap_item to the stable tree.
4161+ * page lock is needed because it's
4162+ * racing with try_to_unmap_ksm(), etc.
4163+ */
4164+ lock_page(kpage);
4165+ snode = page_stable_node(kpage);
4166+ stable_tree_append(rmap_item, snode, 1);
4167+ unlock_page(kpage);
4168+ put_page(kpage);
4169+ return; /* success */
4170+ }
4171+ put_page(kpage);
4172+
4173+ /*
4174+	 * If it's a collision and it has already been searched in the
4175+	 * sub-rbtree (hash_max != 0), we want to abort, because if it
4176+	 * is successfully merged into the unstable tree, the collision
4177+	 * tends to happen again.
4178+ */
4179+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
4180+ return;
4181+ }
4182+
4183+ tree_rmap_item =
4184+ unstable_tree_search_insert(rmap_item, hash);
4185+ if (tree_rmap_item) {
4186+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
4187+ /*
4188+ * As soon as we merge this page, we want to remove the
4189+ * rmap_item of the page we have merged with from the unstable
4190+ * tree, and insert it instead as new node in the stable tree.
4191+ */
4192+ if (!err) {
4193+ kpage = page;
4194+ remove_rmap_item_from_tree(tree_rmap_item);
4195+ lock_page(kpage);
4196+ snode = stable_tree_insert(&kpage, hash,
4197+ rmap_item, tree_rmap_item,
4198+ &success1, &success2);
4199+
4200+ /*
4201+ * Do not log dedup for tree item, it's not counted as
4202+ * scanned in this round.
4203+ */
4204+ if (success2)
4205+ stable_tree_append(tree_rmap_item, snode, 0);
4206+
4207+ /*
4208+			 * The order of these two stable_tree_append() calls is
4209+			 * important: rmap_item is the one being scanned here.
4210+ */
4211+ if (success1)
4212+ stable_tree_append(rmap_item, snode, 1);
4213+
4214+ /*
4215+ * The original kpage may be unlocked inside
4216+ * stable_tree_insert() already. This page
4217+ * should be unlocked before doing
4218+ * break_cow().
4219+ */
4220+ unlock_page(kpage);
4221+
4222+ if (!success1)
4223+ break_cow(rmap_item);
4224+
4225+ if (!success2)
4226+ break_cow(tree_rmap_item);
4227+
4228+ } else if (err == MERGE_ERR_COLLI) {
4229+ BUG_ON(tree_rmap_item->tree_node->count > 1);
4230+
4231+ rmap_item_hash_max(tree_rmap_item,
4232+ tree_rmap_item->tree_node->hash);
4233+
4234+ hash_max = rmap_item_hash_max(rmap_item, hash);
4235+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
4236+ parent = &tree_rmap_item->node;
4237+ if (cmp < 0)
4238+ new = &parent->rb_left;
4239+ else if (cmp > 0)
4240+ new = &parent->rb_right;
4241+ else
4242+ goto put_up_out;
4243+
4244+ rmap_item->tree_node = tree_rmap_item->tree_node;
4245+ rmap_item->address |= UNSTABLE_FLAG;
4246+ rmap_item->hash_round = uksm_hash_round;
4247+ rb_link_node(&rmap_item->node, parent, new);
4248+ rb_insert_color(&rmap_item->node,
4249+ &tree_rmap_item->tree_node->sub_root);
4250+ rmap_item->tree_node->count++;
4251+ } else {
4252+ /*
4253+			 * Either one of the pages has changed or they collide
4254+			 * at the max hash; consider them ill items.
4255+ */
4256+ remove_rmap_item_from_tree(tree_rmap_item);
4257+ }
4258+put_up_out:
4259+ put_page(tree_rmap_item->page);
4260+ mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
4261+ }
4262+}
4263+
4264+
4265+
4266+
4267+static inline unsigned long get_pool_index(struct vma_slot *slot,
4268+ unsigned long index)
4269+{
4270+ unsigned long pool_index;
4271+
4272+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
4273+ if (pool_index >= slot->pool_size)
4274+ BUG();
4275+ return pool_index;
4276+}
4277+
4278+static inline unsigned long index_page_offset(unsigned long index)
4279+{
4280+ return offset_in_page(sizeof(struct rmap_list_entry *) * index);
4281+}
4282+
4283+static inline
4284+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
4285+ unsigned long index, int need_alloc)
4286+{
4287+ unsigned long pool_index;
4288+ struct page *page;
4289+ void *addr;
4290+
4291+
4292+ pool_index = get_pool_index(slot, index);
4293+ if (!slot->rmap_list_pool[pool_index]) {
4294+ if (!need_alloc)
4295+ return NULL;
4296+
4297+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
4298+ if (!page)
4299+ return NULL;
4300+
4301+ slot->rmap_list_pool[pool_index] = page;
4302+ }
4303+
4304+ addr = kmap(slot->rmap_list_pool[pool_index]);
4305+ addr += index_page_offset(index);
4306+
4307+ return addr;
4308+}
4309+
4310+static inline void put_rmap_list_entry(struct vma_slot *slot,
4311+ unsigned long index)
4312+{
4313+ unsigned long pool_index;
4314+
4315+ pool_index = get_pool_index(slot, index);
4316+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4317+ kunmap(slot->rmap_list_pool[pool_index]);
4318+}
4319+
4320+static inline int entry_is_new(struct rmap_list_entry *entry)
4321+{
4322+ return !entry->item;
4323+}
4324+
4325+static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
4326+ unsigned long index)
4327+{
4328+ return slot->vma->vm_start + (index << PAGE_SHIFT);
4329+}
4330+
4331+static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
4332+{
4333+ unsigned long addr;
4334+
4335+ if (is_addr(entry->addr))
4336+ addr = get_clean_addr(entry->addr);
4337+ else if (entry->item)
4338+ addr = get_rmap_addr(entry->item);
4339+ else
4340+ BUG();
4341+
4342+ return addr;
4343+}
4344+
4345+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
4346+{
4347+ if (is_addr(entry->addr))
4348+ return NULL;
4349+
4350+ return entry->item;
4351+}
4352+
4353+static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
4354+ unsigned long index)
4355+{
4356+ unsigned long pool_index;
4357+
4358+ pool_index = get_pool_index(slot, index);
4359+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4360+ slot->pool_counts[pool_index]++;
4361+}
4362+
4363+static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
4364+ unsigned long index)
4365+{
4366+ unsigned long pool_index;
4367+
4368+ pool_index = get_pool_index(slot, index);
4369+ BUG_ON(!slot->rmap_list_pool[pool_index]);
4370+ BUG_ON(!slot->pool_counts[pool_index]);
4371+ slot->pool_counts[pool_index]--;
4372+}
4373+
4374+static inline int entry_has_rmap(struct rmap_list_entry *entry)
4375+{
4376+ return !is_addr(entry->addr) && entry->item;
4377+}
4378+
4379+static inline void swap_entries(struct rmap_list_entry *entry1,
4380+ unsigned long index1,
4381+ struct rmap_list_entry *entry2,
4382+ unsigned long index2)
4383+{
4384+ struct rmap_list_entry tmp;
4385+
4386+ /* swapping two new entries is meaningless */
4387+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
4388+
4389+ tmp = *entry1;
4390+ *entry1 = *entry2;
4391+ *entry2 = tmp;
4392+
4393+ if (entry_has_rmap(entry1))
4394+ entry1->item->entry_index = index1;
4395+
4396+ if (entry_has_rmap(entry2))
4397+ entry2->item->entry_index = index2;
4398+
4399+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
4400+ inc_rmap_list_pool_count(entry1->item->slot, index1);
4401+ dec_rmap_list_pool_count(entry1->item->slot, index2);
4402+ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
4403+ inc_rmap_list_pool_count(entry2->item->slot, index2);
4404+ dec_rmap_list_pool_count(entry2->item->slot, index1);
4405+ }
4406+}
4407+
4408+static inline void free_entry_item(struct rmap_list_entry *entry)
4409+{
4410+ unsigned long index;
4411+ struct rmap_item *item;
4412+
4413+ if (!is_addr(entry->addr)) {
4414+ BUG_ON(!entry->item);
4415+ item = entry->item;
4416+ entry->addr = get_rmap_addr(item);
4417+ set_is_addr(entry->addr);
4418+ index = item->entry_index;
4419+ remove_rmap_item_from_tree(item);
4420+ dec_rmap_list_pool_count(item->slot, index);
4421+ free_rmap_item(item);
4422+ }
4423+}
4424+
4425+static inline int pool_entry_boundary(unsigned long index)
4426+{
4427+ unsigned long linear_addr;
4428+
4429+ linear_addr = sizeof(struct rmap_list_entry *) * index;
4430+ return index && !offset_in_page(linear_addr);
4431+}
4432+
4433+static inline void try_free_last_pool(struct vma_slot *slot,
4434+ unsigned long index)
4435+{
4436+ unsigned long pool_index;
4437+
4438+ pool_index = get_pool_index(slot, index);
4439+ if (slot->rmap_list_pool[pool_index] &&
4440+ !slot->pool_counts[pool_index]) {
4441+ __free_page(slot->rmap_list_pool[pool_index]);
4442+ slot->rmap_list_pool[pool_index] = NULL;
4443+ slot->flags |= UKSM_SLOT_NEED_SORT;
4444+ }
4445+
4446+}
4447+
4448+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
4449+ struct rmap_item *item)
4450+{
4451+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
4452+}
4453+
4454+static int within_same_pool(struct vma_slot *slot,
4455+ unsigned long i, unsigned long j)
4456+{
4457+ unsigned long pool_i, pool_j;
4458+
4459+ pool_i = get_pool_index(slot, i);
4460+ pool_j = get_pool_index(slot, j);
4461+
4462+ return (pool_i == pool_j);
4463+}
4464+
4465+static void sort_rmap_entry_list(struct vma_slot *slot)
4466+{
4467+ unsigned long i, j;
4468+ struct rmap_list_entry *entry, *swap_entry;
4469+
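+	/*
+	 * Walk the entries in place and swap each rmap_item back to the pool
+	 * position matching its virtual address, clearing bare addresses and
+	 * skipping pools whose page was never allocated; afterwards, pool
+	 * pages left without any rmap_item are freed.
+	 */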
4470+ entry = get_rmap_list_entry(slot, 0, 0);
4471+ for (i = 0; i < slot->pages; ) {
4472+
4473+ if (!entry)
4474+ goto skip_whole_pool;
4475+
4476+ if (entry_is_new(entry))
4477+ goto next_entry;
4478+
4479+ if (is_addr(entry->addr)) {
4480+ entry->addr = 0;
4481+ goto next_entry;
4482+ }
4483+
4484+ j = vma_item_index(slot->vma, entry->item);
4485+ if (j == i)
4486+ goto next_entry;
4487+
4488+ if (within_same_pool(slot, i, j))
4489+ swap_entry = entry + j - i;
4490+ else
4491+ swap_entry = get_rmap_list_entry(slot, j, 1);
4492+
4493+ swap_entries(entry, i, swap_entry, j);
4494+ if (!within_same_pool(slot, i, j))
4495+ put_rmap_list_entry(slot, j);
4496+ continue;
4497+
4498+skip_whole_pool:
4499+ i += PAGE_SIZE / sizeof(*entry);
4500+ if (i < slot->pages)
4501+ entry = get_rmap_list_entry(slot, i, 0);
4502+ continue;
4503+
4504+next_entry:
4505+ if (i >= slot->pages - 1 ||
4506+ !within_same_pool(slot, i, i + 1)) {
4507+ put_rmap_list_entry(slot, i);
4508+ if (i + 1 < slot->pages)
4509+ entry = get_rmap_list_entry(slot, i + 1, 0);
4510+ } else
4511+ entry++;
4512+ i++;
4513+ continue;
4514+ }
4515+
4516+ /* free empty pool entries which contain no rmap_item */
4517+	/* Free pool pages which contain no rmap_item. */
4518+	/* This can be simplified to rely only on pool_counts once proven bug-free. */
4519+ unsigned char has_rmap;
4520+ void *addr;
4521+
4522+ if (!slot->rmap_list_pool[i])
4523+ continue;
4524+
4525+ has_rmap = 0;
4526+ addr = kmap(slot->rmap_list_pool[i]);
4527+ BUG_ON(!addr);
4528+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
4529+ entry = (struct rmap_list_entry *)addr + j;
4530+ if (is_addr(entry->addr))
4531+ continue;
4532+ if (!entry->item)
4533+ continue;
4534+ has_rmap = 1;
4535+ }
4536+ kunmap(slot->rmap_list_pool[i]);
4537+ if (!has_rmap) {
4538+ BUG_ON(slot->pool_counts[i]);
4539+ __free_page(slot->rmap_list_pool[i]);
4540+ slot->rmap_list_pool[i] = NULL;
4541+ }
4542+ }
4543+
4544+ slot->flags &= ~UKSM_SLOT_NEED_SORT;
4545+}
4546+
4547+/*
4548+ * vma_fully_scanned() - if all the pages in this slot have been scanned.
4549+ */
4550+static inline int vma_fully_scanned(struct vma_slot *slot)
4551+{
4552+ return slot->pages_scanned == slot->pages;
4553+}
4554+
4555+/**
4556+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
4557+ * its random permutation. The random permutation index management code is
4558+ * embedded in this function.
4559+ */
4560+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
4561+{
4562+ unsigned long rand_range, addr, swap_index, scan_index;
4563+ struct rmap_item *item = NULL;
4564+ struct rmap_list_entry *scan_entry, *swap_entry = NULL;
4565+ struct page *page;
4566+
4567+ scan_index = swap_index = slot->pages_scanned % slot->pages;
4568+
4569+ if (pool_entry_boundary(scan_index))
4570+ try_free_last_pool(slot, scan_index - 1);
4571+
4572+ if (vma_fully_scanned(slot)) {
4573+ if (slot->flags & UKSM_SLOT_NEED_SORT)
4574+ slot->flags |= UKSM_SLOT_NEED_RERAND;
4575+ else
4576+ slot->flags &= ~UKSM_SLOT_NEED_RERAND;
4577+ if (slot->flags & UKSM_SLOT_NEED_SORT)
4578+ sort_rmap_entry_list(slot);
4579+ }
4580+
4581+ scan_entry = get_rmap_list_entry(slot, scan_index, 1);
4582+ if (!scan_entry)
4583+ return NULL;
4584+
4585+ if (entry_is_new(scan_entry)) {
4586+ scan_entry->addr = get_index_orig_addr(slot, scan_index);
4587+ set_is_addr(scan_entry->addr);
4588+ }
4589+
4590+ if (slot->flags & UKSM_SLOT_NEED_RERAND) {
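+	/*
+	 * When re-randomization is needed, swap the entry at scan_index with
+	 * a uniformly chosen entry in [scan_index, slot->pages) -- in effect
+	 * an incremental Fisher-Yates shuffle, so each full pass visits the
+	 * slot's pages in a fresh random permutation.
+	 */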
4591+ rand_range = slot->pages - scan_index;
4592+ BUG_ON(!rand_range);
4593+ swap_index = scan_index + (prandom_u32() % rand_range);
4594+ }
4595+
4596+ if (swap_index != scan_index) {
4597+ swap_entry = get_rmap_list_entry(slot, swap_index, 1);
4598+
4599+ if (!swap_entry)
4600+ return NULL;
4601+
4602+ if (entry_is_new(swap_entry)) {
4603+ swap_entry->addr = get_index_orig_addr(slot,
4604+ swap_index);
4605+ set_is_addr(swap_entry->addr);
4606+ }
4607+ swap_entries(scan_entry, scan_index, swap_entry, swap_index);
4608+ }
4609+
4610+ addr = get_entry_address(scan_entry);
4611+ item = get_entry_item(scan_entry);
4612+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
4613+
4614+ page = follow_page(slot->vma, addr, FOLL_GET);
4615+ if (IS_ERR_OR_NULL(page))
4616+ goto nopage;
4617+
4618+ if (!PageAnon(page))
4619+ goto putpage;
4620+
4621+	/* check whether it is the zero_page pfn or the uksm_zero_page */
4622+ if ((page_to_pfn(page) == zero_pfn)
4623+ || (page_to_pfn(page) == uksm_zero_pfn))
4624+ goto putpage;
4625+
4626+ flush_anon_page(slot->vma, page, addr);
4627+ flush_dcache_page(page);
4628+
4629+
4630+ *hash = page_hash(page, hash_strength, 1);
4631+ inc_uksm_pages_scanned();
4632+	/* if the page content is all zero, re-map it to the zero page */
4633+ if (find_zero_page_hash(hash_strength, *hash)) {
4634+ if (!cmp_and_merge_zero_page(slot->vma, page)) {
4635+ slot->pages_merged++;
4636+
4637+ /* For full-zero pages, no need to create rmap item */
4638+ goto putpage;
4639+ } else {
4640+ inc_rshash_neg(memcmp_cost / 2);
4641+ }
4642+ }
4643+
4644+ if (!item) {
4645+ item = alloc_rmap_item();
4646+ if (item) {
4647+ /* It has already been zeroed */
4648+ item->slot = slot;
4649+ item->address = addr;
4650+ item->entry_index = scan_index;
4651+ scan_entry->item = item;
4652+ inc_rmap_list_pool_count(slot, scan_index);
4653+ } else
4654+ goto putpage;
4655+ }
4656+
4657+ BUG_ON(item->slot != slot);
4658+ /* the page may have changed */
4659+ item->page = page;
4660+ put_rmap_list_entry(slot, scan_index);
4661+ if (swap_entry)
4662+ put_rmap_list_entry(slot, swap_index);
4663+ return item;
4664+
4665+putpage:
4666+ put_page(page);
4667+ page = NULL;
4668+nopage:
4669+ /* no page, store addr back and free rmap_item if possible */
4670+ free_entry_item(scan_entry);
4671+ put_rmap_list_entry(slot, scan_index);
4672+ if (swap_entry)
4673+ put_rmap_list_entry(slot, swap_index);
4674+ return NULL;
4675+}
4676+
4677+static inline int in_stable_tree(struct rmap_item *rmap_item)
4678+{
4679+ return rmap_item->address & STABLE_FLAG;
4680+}
4681+
4682+/**
4683+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
4684+ * mmap_lock locked.
4685+ */
4686+static noinline void scan_vma_one_page(struct vma_slot *slot)
4687+{
4688+ u32 hash;
4689+ struct mm_struct *mm;
4690+ struct rmap_item *rmap_item = NULL;
4691+ struct vm_area_struct *vma = slot->vma;
4692+
4693+ mm = vma->vm_mm;
4694+ BUG_ON(!mm);
4695+ BUG_ON(!slot);
4696+
4697+ rmap_item = get_next_rmap_item(slot, &hash);
4698+ if (!rmap_item)
4699+ goto out1;
4700+
4701+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
4702+ goto out2;
4703+
4704+ cmp_and_merge_page(rmap_item, hash);
4705+out2:
4706+ put_page(rmap_item->page);
4707+out1:
4708+ slot->pages_scanned++;
4709+ slot->this_sampled++;
4710+ if (slot->fully_scanned_round != fully_scanned_round)
4711+ scanned_virtual_pages++;
4712+
4713+ if (vma_fully_scanned(slot))
4714+ slot->fully_scanned_round = fully_scanned_round;
4715+}
4716+
4717+static inline unsigned long rung_get_pages(struct scan_rung *rung)
4718+{
4719+ struct slot_tree_node *node;
4720+
4721+ if (!rung->vma_root.rnode)
4722+ return 0;
4723+
4724+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
4725+
4726+ return node->size;
4727+}
4728+
4729+#define RUNG_SAMPLED_MIN 3
4730+
4731+static inline
4732+void uksm_calc_rung_step(struct scan_rung *rung,
4733+ unsigned long page_time, unsigned long ratio)
4734+{
4735+ unsigned long sampled, pages;
4736+
4737+ /* will be fully scanned ? */
4738+ if (!rung->cover_msecs) {
4739+ rung->step = 1;
4740+ return;
4741+ }
4742+
4743+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
4744+ * ratio / page_time;
4745+
4746+ /*
4747+	 * Before we finish a scan round and its expensive per-round jobs,
4748+	 * we need a chance to estimate the per-page time. So the sampled
4749+	 * number cannot be too small.
4750+ */
4751+ if (sampled < RUNG_SAMPLED_MIN)
4752+ sampled = RUNG_SAMPLED_MIN;
4753+
4754+ pages = rung_get_pages(rung);
4755+ if (likely(pages > sampled))
4756+ rung->step = pages / sampled;
4757+ else
4758+ rung->step = 1;
4759+}
4760+
4761+static inline int step_need_recalc(struct scan_rung *rung)
4762+{
4763+ unsigned long pages, stepmax;
4764+
4765+ pages = rung_get_pages(rung);
4766+ stepmax = pages / RUNG_SAMPLED_MIN;
4767+
4768+ return pages && (rung->step > pages ||
4769+ (stepmax && rung->step > stepmax));
4770+}
4771+
4772+static inline
4773+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
4774+{
4775+ struct vma_slot *slot;
4776+
4777+ if (finished)
4778+ rung->flags |= UKSM_RUNG_ROUND_FINISHED;
4779+
4780+ if (step_recalc || step_need_recalc(rung)) {
4781+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4782+ BUG_ON(step_need_recalc(rung));
4783+ }
4784+
4785+ slot_iter_index = prandom_u32() % rung->step;
4786+ BUG_ON(!rung->vma_root.rnode);
4787+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
4788+ BUG_ON(!slot);
4789+
4790+ rung->current_scan = slot;
4791+ rung->current_offset = slot_iter_index;
4792+}
4793+
4794+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
4795+{
4796+ return &slot->rung->vma_root;
4797+}
4798+
4799+/*
4800+ * Return 1 if the scan position was reset, 0 otherwise.
4801+ */
4802+static int advance_current_scan(struct scan_rung *rung)
4803+{
4804+ unsigned short n;
4805+ struct vma_slot *slot, *next = NULL;
4806+
4807+ BUG_ON(!rung->vma_root.num);
4808+
4809+ slot = rung->current_scan;
4810+ n = (slot->pages - rung->current_offset) % rung->step;
4811+ slot_iter_index = rung->step - n;
4812+ next = sradix_tree_next(&rung->vma_root, slot->snode,
4813+ slot->sindex, slot_iter);
4814+
4815+ if (next) {
4816+ rung->current_offset = slot_iter_index;
4817+ rung->current_scan = next;
4818+ return 0;
4819+ } else {
4820+ reset_current_scan(rung, 1, 0);
4821+ return 1;
4822+ }
4823+}
4824+
4825+static inline void rung_rm_slot(struct vma_slot *slot)
4826+{
4827+ struct scan_rung *rung = slot->rung;
4828+ struct sradix_tree_root *root;
4829+
4830+ if (rung->current_scan == slot)
4831+ advance_current_scan(rung);
4832+
4833+ root = slot_get_root(slot);
4834+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
4835+ slot->snode = NULL;
4836+ if (step_need_recalc(rung)) {
4837+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
4838+ BUG_ON(step_need_recalc(rung));
4839+ }
4840+
4841+	/* In case advance_current_scan() looped back to this slot again */
4842+ if (rung->vma_root.num && rung->current_scan == slot)
4843+ reset_current_scan(slot->rung, 1, 0);
4844+}
4845+
4846+static inline void rung_add_new_slots(struct scan_rung *rung,
4847+ struct vma_slot **slots, unsigned long num)
4848+{
4849+ int err;
4850+ struct vma_slot *slot;
4851+ unsigned long i;
4852+ struct sradix_tree_root *root = &rung->vma_root;
4853+
4854+ err = sradix_tree_enter(root, (void **)slots, num);
4855+ BUG_ON(err);
4856+
4857+ for (i = 0; i < num; i++) {
4858+ slot = slots[i];
4859+ slot->rung = rung;
4860+ BUG_ON(vma_fully_scanned(slot));
4861+ }
4862+
4863+ if (rung->vma_root.num == num)
4864+ reset_current_scan(rung, 0, 1);
4865+}
4866+
4867+static inline int rung_add_one_slot(struct scan_rung *rung,
4868+ struct vma_slot *slot)
4869+{
4870+ int err;
4871+
4872+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
4873+ if (err)
4874+ return err;
4875+
4876+ slot->rung = rung;
4877+ if (rung->vma_root.num == 1)
4878+ reset_current_scan(rung, 0, 1);
4879+
4880+ return 0;
4881+}
4882+
4883+/*
4884+ * Return true if the slot is deleted from its rung.
4885+ */
4886+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
4887+{
4888+ struct scan_rung *old_rung = slot->rung;
4889+ int err;
4890+
4891+ if (old_rung == rung)
4892+ return 0;
4893+
4894+ rung_rm_slot(slot);
4895+ err = rung_add_one_slot(rung, slot);
4896+ if (err) {
4897+ err = rung_add_one_slot(old_rung, slot);
4898+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
4899+ }
4900+
4901+ return 1;
4902+}
4903+
4904+static inline int vma_rung_up(struct vma_slot *slot)
4905+{
4906+ struct scan_rung *rung;
4907+
4908+ rung = slot->rung;
4909+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
4910+ rung++;
4911+
4912+ return vma_rung_enter(slot, rung);
4913+}
4914+
4915+static inline int vma_rung_down(struct vma_slot *slot)
4916+{
4917+ struct scan_rung *rung;
4918+
4919+ rung = slot->rung;
4920+ if (slot->rung != &uksm_scan_ladder[0])
4921+ rung--;
4922+
4923+ return vma_rung_enter(slot, rung);
4924+}
4925+
4926+/**
4927+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
4928+ */
4929+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
4930+{
4931+ unsigned long ret;
4932+ unsigned long pages;
4933+
4934+ pages = slot->this_sampled;
4935+ if (!pages)
4936+ return 0;
4937+
4938+ BUG_ON(slot->pages_scanned == slot->last_scanned);
4939+
4940+ ret = slot->pages_merged;
4941+
4942+ /* Thrashing area filtering */
4943+ if (ret && uksm_thrash_threshold) {
4944+ if (slot->pages_cowed * 100 / slot->pages_merged
4945+ > uksm_thrash_threshold) {
4946+ ret = 0;
4947+ } else {
4948+ ret = slot->pages_merged - slot->pages_cowed;
4949+ }
4950+ }
4951+
4952+ return ret * 100 / pages;
4953+}
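+/*
+ * Illustrative example of the thrashing filter above (numbers are made up):
+ * with uksm_thrash_threshold == 50, a slot that merged 100 pages of which
+ * 60 were COW-broken again (60% > 50%) reports a ratio of 0, while one with
+ * only 20 COW-broken pages reports (100 - 20) * 100 / this_sampled.
+ */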
4954+
4955+/**
4956+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
4957+ */
4958+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
4959+{
4960+ unsigned long ret;
4961+ unsigned long pages;
4962+
4963+ pages = slot->pages;
4964+ if (!pages)
4965+ return 0;
4966+
4967+ ret = slot->pages_bemerged;
4968+
4969+ /* Thrashing area filtering */
4970+ if (ret && uksm_thrash_threshold) {
4971+ if (slot->pages_cowed * 100 / slot->pages_bemerged
4972+ > uksm_thrash_threshold) {
4973+ ret = 0;
4974+ } else {
4975+ ret = slot->pages_bemerged - slot->pages_cowed;
4976+ }
4977+ }
4978+
4979+ return ret * 100 / pages;
4980+}
4981+
4982+/**
4983+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
4984+ * stable tree need to be restructured, this is the function re-inserting the
4985+ * stable tree needs to be restructured; this is the function that re-inserts
4986+ * a stable node.
4987+static inline void stable_node_reinsert(struct stable_node *new_node,
4988+ struct page *page,
4989+ struct rb_root *root_treep,
4990+ struct list_head *tree_node_listp,
4991+ u32 hash)
4992+{
4993+ struct rb_node **new = &root_treep->rb_node;
4994+ struct rb_node *parent = NULL;
4995+ struct stable_node *stable_node;
4996+ struct tree_node *tree_node;
4997+ struct page *tree_page;
4998+ int cmp;
4999+
5000+ while (*new) {
5001+ int cmp;
5002+
5003+ tree_node = rb_entry(*new, struct tree_node, node);
5004+
5005+ cmp = hash_cmp(hash, tree_node->hash);
5006+
5007+ if (cmp < 0) {
5008+ parent = *new;
5009+ new = &parent->rb_left;
5010+ } else if (cmp > 0) {
5011+ parent = *new;
5012+ new = &parent->rb_right;
5013+ } else
5014+ break;
5015+ }
5016+
5017+ if (*new) {
5018+		/* found a stable tree node with the same first-level hash value */
5019+ stable_node_hash_max(new_node, page, hash);
5020+ if (tree_node->count == 1) {
5021+ stable_node = rb_entry(tree_node->sub_root.rb_node,
5022+ struct stable_node, node);
5023+ tree_page = get_uksm_page(stable_node, 1, 0);
5024+ if (tree_page) {
5025+ stable_node_hash_max(stable_node,
5026+ tree_page, hash);
5027+ put_page(tree_page);
5028+
5029+ /* prepare for stable node insertion */
5030+
5031+ cmp = hash_cmp(new_node->hash_max,
5032+ stable_node->hash_max);
5033+ parent = &stable_node->node;
5034+ if (cmp < 0)
5035+ new = &parent->rb_left;
5036+ else if (cmp > 0)
5037+ new = &parent->rb_right;
5038+ else
5039+ goto failed;
5040+
5041+ goto add_node;
5042+ } else {
5043+				/* the only stable_node was deleted, but the
5044+				 * tree_node was not deleted.
5045+ */
5046+ goto tree_node_reuse;
5047+ }
5048+ }
5049+
5050+ /* well, search the collision subtree */
5051+ new = &tree_node->sub_root.rb_node;
5052+ parent = NULL;
5053+ BUG_ON(!*new);
5054+ while (*new) {
5055+ int cmp;
5056+
5057+ stable_node = rb_entry(*new, struct stable_node, node);
5058+
5059+ cmp = hash_cmp(new_node->hash_max,
5060+ stable_node->hash_max);
5061+
5062+ if (cmp < 0) {
5063+ parent = *new;
5064+ new = &parent->rb_left;
5065+ } else if (cmp > 0) {
5066+ parent = *new;
5067+ new = &parent->rb_right;
5068+ } else {
5069+ /* oh, no, still a collision */
5070+ goto failed;
5071+ }
5072+ }
5073+
5074+ goto add_node;
5075+ }
5076+
5077+ /* no tree node found */
5078+ tree_node = alloc_tree_node(tree_node_listp);
5079+ if (!tree_node) {
5080+ pr_err("UKSM: memory allocation error!\n");
5081+ goto failed;
5082+ } else {
5083+ tree_node->hash = hash;
5084+ rb_link_node(&tree_node->node, parent, new);
5085+ rb_insert_color(&tree_node->node, root_treep);
5086+
5087+tree_node_reuse:
5088+ /* prepare for stable node insertion */
5089+ parent = NULL;
5090+ new = &tree_node->sub_root.rb_node;
5091+ }
5092+
5093+add_node:
5094+ rb_link_node(&new_node->node, parent, new);
5095+ rb_insert_color(&new_node->node, &tree_node->sub_root);
5096+ new_node->tree_node = tree_node;
5097+ tree_node->count++;
5098+ return;
5099+
5100+failed:
5101+ /* This can only happen when two nodes have collided
5102+	/* This can only happen when two nodes have collided
5103+	 * at both hash levels.
5104+ new_node->tree_node = NULL;
5105+ return;
5106+}
5107+
5108+static inline void free_all_tree_nodes(struct list_head *list)
5109+{
5110+ struct tree_node *node, *tmp;
5111+
5112+ list_for_each_entry_safe(node, tmp, list, all_list) {
5113+ free_tree_node(node);
5114+ }
5115+}
5116+
5117+/**
5118+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
5119+ * strength to the current hash_strength. It re-structures the hole tree.
5120+ * strength to the current hash_strength. It re-structures the whole tree.
5121+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
5122+{
5123+ struct stable_node *node, *tmp;
5124+ struct rb_root *root_new_treep;
5125+ struct list_head *new_tree_node_listp;
5126+
5127+ stable_tree_index = (stable_tree_index + 1) % 2;
5128+ root_new_treep = &root_stable_tree[stable_tree_index];
5129+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
5130+ *root_new_treep = RB_ROOT;
5131+ BUG_ON(!list_empty(new_tree_node_listp));
5132+
5133+ /*
5134+ * we need to be safe, the node could be removed by get_uksm_page()
5135+ */
5136+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
5137+ void *addr;
5138+ struct page *node_page;
5139+ u32 hash;
5140+
5141+ /*
5142+ * We are completely re-structuring the stable nodes to a new
5143+		 * stable tree, so we don't bother unlinking from the old tree
5144+		 * or touching old tree_nodes; those will be freed all at once.
5145+ */
5146+ node_page = get_uksm_page(node, 0, 0);
5147+ if (!node_page)
5148+ continue;
5149+
5150+ if (node->tree_node) {
5151+ hash = node->tree_node->hash;
5152+
5153+ addr = kmap_atomic(node_page);
5154+
5155+ hash = delta_hash(addr, prev_hash_strength,
5156+ hash_strength, hash);
5157+ kunmap_atomic(addr);
5158+ } else {
5159+ /*
5160+			 * It was not inserted into the rbtree due to a collision
5161+			 * in the last scan round.
5162+ */
5163+ hash = page_hash(node_page, hash_strength, 0);
5164+ }
5165+
5166+ stable_node_reinsert(node, node_page, root_new_treep,
5167+ new_tree_node_listp, hash);
5168+ put_page(node_page);
5169+ }
5170+
5171+ root_stable_treep = root_new_treep;
5172+ free_all_tree_nodes(stable_tree_node_listp);
5173+ BUG_ON(!list_empty(stable_tree_node_listp));
5174+ stable_tree_node_listp = new_tree_node_listp;
5175+}
5176+
5177+static inline void inc_hash_strength(unsigned long delta)
5178+{
5179+ hash_strength += 1 << delta;
5180+ if (hash_strength > HASH_STRENGTH_MAX)
5181+ hash_strength = HASH_STRENGTH_MAX;
5182+}
5183+
5184+static inline void dec_hash_strength(unsigned long delta)
5185+{
5186+ unsigned long change = 1 << delta;
5187+
5188+ if (hash_strength <= change + 1)
5189+ hash_strength = 1;
5190+ else
5191+ hash_strength -= change;
5192+}
5193+
5194+static inline void inc_hash_strength_delta(void)
5195+{
5196+ hash_strength_delta++;
5197+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
5198+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
5199+}
5200+
5201+static inline unsigned long get_current_neg_ratio(void)
5202+{
5203+ u64 pos = benefit.pos;
5204+ u64 neg = benefit.neg;
5205+
5206+ if (!neg)
5207+ return 0;
5208+
5209+ if (!pos || neg > pos)
5210+ return 100;
5211+
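+	/* compute neg * 100 / pos without overflowing the multiplication */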
5212+ if (neg > div64_u64(U64_MAX, 100))
5213+ pos = div64_u64(pos, 100);
5214+ else
5215+ neg *= 100;
5216+
5217+ return div64_u64(neg, pos);
5218+}
5219+
5220+static inline unsigned long get_current_benefit(void)
5221+{
5222+ u64 pos = benefit.pos;
5223+ u64 neg = benefit.neg;
5224+ u64 scanned = benefit.scanned;
5225+
5226+ if (neg > pos)
5227+ return 0;
5228+
5229+ return div64_u64((pos - neg), scanned);
5230+}
5231+
5232+static inline int judge_rshash_direction(void)
5233+{
5234+ u64 current_neg_ratio, stable_benefit;
5235+ u64 current_benefit, delta = 0;
5236+ int ret = STILL;
5237+
5238+ /*
5239+	 * Try to probe a value right after boot, and again in case the
5240+	 * system stays still for a long time.
5241+ */
5242+ if ((fully_scanned_round & 0xFFULL) == 10) {
5243+ ret = OBSCURE;
5244+ goto out;
5245+ }
5246+
5247+ current_neg_ratio = get_current_neg_ratio();
5248+
5249+ if (current_neg_ratio == 0) {
5250+ rshash_neg_cont_zero++;
5251+ if (rshash_neg_cont_zero > 2)
5252+ return GO_DOWN;
5253+ else
5254+ return STILL;
5255+ }
5256+ rshash_neg_cont_zero = 0;
5257+
5258+ if (current_neg_ratio > 90) {
5259+ ret = GO_UP;
5260+ goto out;
5261+ }
5262+
5263+ current_benefit = get_current_benefit();
5264+ stable_benefit = rshash_state.stable_benefit;
5265+
5266+ if (!stable_benefit) {
5267+ ret = OBSCURE;
5268+ goto out;
5269+ }
5270+
5271+ if (current_benefit > stable_benefit)
5272+ delta = current_benefit - stable_benefit;
5273+ else if (current_benefit < stable_benefit)
5274+ delta = stable_benefit - current_benefit;
5275+
5276+ delta = div64_u64(100 * delta, stable_benefit);
5277+
5278+ if (delta > 50) {
5279+ rshash_cont_obscure++;
5280+ if (rshash_cont_obscure > 2)
5281+ return OBSCURE;
5282+ else
5283+ return STILL;
5284+ }
5285+
5286+out:
5287+ rshash_cont_obscure = 0;
5288+ return ret;
5289+}
5290+
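+/*
+ * Rough sketch of the state machine implemented in rshash_adjust() below
+ * (derived from the code): RSHASH_STILL probes the benefit and nudges
+ * hash_strength up or down; when the benefit becomes OBSCURE it records the
+ * current point and enters RSHASH_TRYDOWN, which keeps lowering the strength
+ * while that pays off, then returns to the recorded point and enters
+ * RSHASH_TRYUP; RSHASH_TRYUP raises the strength likewise, then settles on
+ * whichever turning point had the better benefit and goes back to
+ * RSHASH_STILL via RSHASH_PRE_STILL.
+ */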
5291+/**
5292+ * rshash_adjust() - The main function to control the random sampling state
5293+ * machine for hash strength adaptation.
5294+ *
5295+ * return true if hash_strength has changed.
5296+ */
5297+static inline int rshash_adjust(void)
5298+{
5299+ unsigned long prev_hash_strength = hash_strength;
5300+
5301+ if (!encode_benefit())
5302+ return 0;
5303+
5304+ switch (rshash_state.state) {
5305+ case RSHASH_STILL:
5306+ switch (judge_rshash_direction()) {
5307+ case GO_UP:
5308+ if (rshash_state.pre_direct == GO_DOWN)
5309+ hash_strength_delta = 0;
5310+
5311+ inc_hash_strength(hash_strength_delta);
5312+ inc_hash_strength_delta();
5313+ rshash_state.stable_benefit = get_current_benefit();
5314+ rshash_state.pre_direct = GO_UP;
5315+ break;
5316+
5317+ case GO_DOWN:
5318+ if (rshash_state.pre_direct == GO_UP)
5319+ hash_strength_delta = 0;
5320+
5321+ dec_hash_strength(hash_strength_delta);
5322+ inc_hash_strength_delta();
5323+ rshash_state.stable_benefit = get_current_benefit();
5324+ rshash_state.pre_direct = GO_DOWN;
5325+ break;
5326+
5327+ case OBSCURE:
5328+ rshash_state.stable_point = hash_strength;
5329+ rshash_state.turn_point_down = hash_strength;
5330+ rshash_state.turn_point_up = hash_strength;
5331+ rshash_state.turn_benefit_down = get_current_benefit();
5332+ rshash_state.turn_benefit_up = get_current_benefit();
5333+ rshash_state.lookup_window_index = 0;
5334+ rshash_state.state = RSHASH_TRYDOWN;
5335+ dec_hash_strength(hash_strength_delta);
5336+ inc_hash_strength_delta();
5337+ break;
5338+
5339+ case STILL:
5340+ break;
5341+ default:
5342+ BUG();
5343+ }
5344+ break;
5345+
5346+ case RSHASH_TRYDOWN:
5347+ if (rshash_state.lookup_window_index++ % 5 == 0)
5348+ rshash_state.below_count = 0;
5349+
5350+ if (get_current_benefit() < rshash_state.stable_benefit)
5351+ rshash_state.below_count++;
5352+ else if (get_current_benefit() >
5353+ rshash_state.turn_benefit_down) {
5354+ rshash_state.turn_point_down = hash_strength;
5355+ rshash_state.turn_benefit_down = get_current_benefit();
5356+ }
5357+
5358+ if (rshash_state.below_count >= 3 ||
5359+ judge_rshash_direction() == GO_UP ||
5360+ hash_strength == 1) {
5361+ hash_strength = rshash_state.stable_point;
5362+ hash_strength_delta = 0;
5363+ inc_hash_strength(hash_strength_delta);
5364+ inc_hash_strength_delta();
5365+ rshash_state.lookup_window_index = 0;
5366+ rshash_state.state = RSHASH_TRYUP;
5367+ hash_strength_delta = 0;
5368+ } else {
5369+ dec_hash_strength(hash_strength_delta);
5370+ inc_hash_strength_delta();
5371+ }
5372+ break;
5373+
5374+ case RSHASH_TRYUP:
5375+ if (rshash_state.lookup_window_index++ % 5 == 0)
5376+ rshash_state.below_count = 0;
5377+
5378+ if (get_current_benefit() < rshash_state.turn_benefit_down)
5379+ rshash_state.below_count++;
5380+ else if (get_current_benefit() > rshash_state.turn_benefit_up) {
5381+ rshash_state.turn_point_up = hash_strength;
5382+ rshash_state.turn_benefit_up = get_current_benefit();
5383+ }
5384+
5385+ if (rshash_state.below_count >= 3 ||
5386+ judge_rshash_direction() == GO_DOWN ||
5387+ hash_strength == HASH_STRENGTH_MAX) {
5388+ hash_strength = rshash_state.turn_benefit_up >
5389+ rshash_state.turn_benefit_down ?
5390+ rshash_state.turn_point_up :
5391+ rshash_state.turn_point_down;
5392+
5393+ rshash_state.state = RSHASH_PRE_STILL;
5394+ } else {
5395+ inc_hash_strength(hash_strength_delta);
5396+ inc_hash_strength_delta();
5397+ }
5398+
5399+ break;
5400+
5401+ case RSHASH_NEW:
5402+ case RSHASH_PRE_STILL:
5403+ rshash_state.stable_benefit = get_current_benefit();
5404+ rshash_state.state = RSHASH_STILL;
5405+ hash_strength_delta = 0;
5406+ break;
5407+ default:
5408+ BUG();
5409+ }
5410+
5411+ /* rshash_neg = rshash_pos = 0; */
5412+ reset_benefit();
5413+
5414+ if (prev_hash_strength != hash_strength)
5415+ stable_tree_delta_hash(prev_hash_strength);
5416+
5417+ return prev_hash_strength != hash_strength;
5418+}
5419+
5420+/**
5421+ * round_update_ladder() - The main function to do update of all the
5422+ * adjustments whenever a scan round is finished.
5423+ */
5424+static noinline void round_update_ladder(void)
5425+{
5426+ int i;
5427+ unsigned long dedup;
5428+ struct vma_slot *slot, *tmp_slot;
5429+
5430+ for (i = 0; i < SCAN_LADDER_SIZE; i++)
5431+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
5432+
5433+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
5434+
5435+		/* the slot may have been removed by rung_rm_slot() when its mm exited */
5436+ if (slot->snode) {
5437+ dedup = cal_dedup_ratio_old(slot);
5438+ if (dedup && dedup >= uksm_abundant_threshold)
5439+ vma_rung_up(slot);
5440+ }
5441+
5442+ slot->pages_bemerged = 0;
5443+ slot->pages_cowed = 0;
5444+
5445+ list_del_init(&slot->dedup_list);
5446+ }
5447+}
5448+
5449+static void uksm_del_vma_slot(struct vma_slot *slot)
5450+{
5451+ int i, j;
5452+ struct rmap_list_entry *entry;
5453+
5454+ if (slot->snode) {
5455+ /*
5456+		 * rung_rm_slot() is not needed if the slot failed when entering
5457+		 * the rung; slot->snode is NULL in that case.
5458+ */
5459+ rung_rm_slot(slot);
5460+ }
5461+
5462+ if (!list_empty(&slot->dedup_list))
5463+ list_del(&slot->dedup_list);
5464+
5465+ if (!slot->rmap_list_pool || !slot->pool_counts) {
5466+ /* In case it OOMed in uksm_vma_enter() */
5467+ goto out;
5468+ }
5469+
5470+ for (i = 0; i < slot->pool_size; i++) {
5471+ void *addr;
5472+
5473+ if (!slot->rmap_list_pool[i])
5474+ continue;
5475+
5476+ addr = kmap(slot->rmap_list_pool[i]);
5477+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
5478+ entry = (struct rmap_list_entry *)addr + j;
5479+ if (is_addr(entry->addr))
5480+ continue;
5481+ if (!entry->item)
5482+ continue;
5483+
5484+ remove_rmap_item_from_tree(entry->item);
5485+ free_rmap_item(entry->item);
5486+ slot->pool_counts[i]--;
5487+ }
5488+ BUG_ON(slot->pool_counts[i]);
5489+ kunmap(slot->rmap_list_pool[i]);
5490+ __free_page(slot->rmap_list_pool[i]);
5491+ }
5492+ kfree(slot->rmap_list_pool);
5493+ kfree(slot->pool_counts);
5494+
5495+out:
5496+ slot->rung = NULL;
5497+ if (slot->flags & UKSM_SLOT_IN_UKSM) {
5498+ BUG_ON(uksm_pages_total < slot->pages);
5499+ uksm_pages_total -= slot->pages;
5500+ }
5501+
5502+ if (slot->fully_scanned_round == fully_scanned_round)
5503+ scanned_virtual_pages -= slot->pages;
5504+ else
5505+ scanned_virtual_pages -= slot->pages_scanned;
5506+ free_vma_slot(slot);
5507+}
5508+
5509+
5510+#define SPIN_LOCK_PERIOD 32
5511+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
5512+static inline void cleanup_vma_slots(void)
5513+{
5514+ struct vma_slot *slot;
5515+ int i;
5516+
5517+ i = 0;
5518+ spin_lock(&vma_slot_list_lock);
5519+ while (!list_empty(&vma_slot_del)) {
5520+ slot = list_entry(vma_slot_del.next,
5521+ struct vma_slot, slot_list);
5522+ list_del(&slot->slot_list);
5523+ cleanup_slots[i++] = slot;
5524+ if (i == SPIN_LOCK_PERIOD) {
5525+ spin_unlock(&vma_slot_list_lock);
5526+ while (--i >= 0)
5527+ uksm_del_vma_slot(cleanup_slots[i]);
5528+ i = 0;
5529+ spin_lock(&vma_slot_list_lock);
5530+ }
5531+ }
5532+ spin_unlock(&vma_slot_list_lock);
5533+
5534+ while (--i >= 0)
5535+ uksm_del_vma_slot(cleanup_slots[i]);
5536+}
5537+
5538+/*
5539+ * Exponential moving average formula
5540+ */
5541+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
5542+{
5543+ /*
5544+	 * For a very high burst, even the EMA cannot work well: a falsely
5545+	 * very high per-page time estimate feeds back into very high
5546+	 * context-switch and rung-update overhead, which then leads to an
5547+	 * even higher per-page time, so it may not converge.
5548+ *
5549+ * Instead, we try to approach this value in a binary manner.
5550+ */
5551+ if (curr > last_ema * 10)
5552+ return last_ema * 2;
5553+
5554+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
5555+}
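+/*
+ * Worked example for ema() above (EMA_ALPHA == 20 is assumed here purely
+ * for illustration): curr = 900 and last_ema = 1000 give
+ * (20 * 900 + 80 * 1000) / 100 = 980; a burst like curr = 50000 with
+ * last_ema = 1000 exceeds 10 * last_ema and is clamped to 2000 instead.
+ */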
5556+
5557+/*
5558+ * Convert a cpu ratio, in units of 1/TIME_RATIO_SCALE configured by the user,
5559+ * to a scan-time budget in nanoseconds based on current uksm_sleep_jiffies.
5560+ */
5561+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
5562+{
5563+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
5564+ (TIME_RATIO_SCALE - ratio) * ratio;
5565+}
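+/*
+ * Example (assuming, for illustration only, TIME_RATIO_SCALE == 100 and a
+ * 10 ms sleep period): ratio == 20 gives 10,000,000 ns / 80 * 20 =
+ * 2,500,000 ns of scan time per period, i.e. 2.5 / (2.5 + 10) = 20% cpu.
+ */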
5566+
5567+
5568+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
5569+{
5570+ unsigned long ret;
5571+
5572+ BUG_ON(!cpu_time_ratio);
5573+
5574+ if (cpu_time_ratio > 0)
5575+ ret = cpu_time_ratio;
5576+ else
5577+ ret = (unsigned long)(-cpu_time_ratio) *
5578+ uksm_max_cpu_percentage / 100UL;
5579+
5580+ return ret ? ret : 1;
5581+}
5582+
5583+static noinline void uksm_calc_scan_pages(void)
5584+{
5585+ struct scan_rung *ladder = uksm_scan_ladder;
5586+ unsigned long sleep_usecs, nsecs;
5587+ unsigned long ratio;
5588+ int i;
5589+ unsigned long per_page;
5590+
5591+ if (uksm_ema_page_time > 100000 ||
5592+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
5593+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
5594+
5595+ per_page = uksm_ema_page_time;
5596+ BUG_ON(!per_page);
5597+
5598+ /*
5599+	 * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
5600+	 * based on the saved user input.
5601+ */
5602+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
5603+ uksm_sleep_jiffies = uksm_sleep_saved;
5604+
5605+	/* We require a rung to scan at least 1 page in a period. */
5606+ nsecs = per_page;
5607+ ratio = rung_real_ratio(ladder[0].cpu_ratio);
5608+ if (cpu_ratio_to_nsec(ratio) < nsecs) {
5609+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
5610+ / NSEC_PER_USEC;
5611+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
5612+ }
5613+
5614+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5615+ ratio = rung_real_ratio(ladder[i].cpu_ratio);
5616+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
5617+ per_page;
5618+ BUG_ON(!ladder[i].pages_to_scan);
5619+ uksm_calc_rung_step(&ladder[i], per_page, ratio);
5620+ }
5621+}
5622+
5623+/*
5624+ * Convert the scan time of this round (ns) to the next expected minimum
5625+ * sleep time (ms); be careful of possible overflows. ratio is taken from
5626+ * rung_real_ratio().
5627+ */
5628+static inline
5629+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
5630+{
5631+ scan_time >>= 20; /* to msec level now */
5632+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
5633+
5634+ return (unsigned int) ((unsigned long) scan_time *
5635+ (TIME_RATIO_SCALE - ratio) / ratio);
5636+}
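+/*
+ * Example (again assuming TIME_RATIO_SCALE == 100 for illustration): a scan
+ * round of ~4 ms (scan_time >> 20) with ratio == 20 yields a minimum sleep
+ * of 4 * 80 / 20 = 16 ms, keeping the cpu share near 20%.
+ */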
5637+
5638+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
5639+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
5640+
5641+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
5642+{
5643+ struct scan_rung *rung;
5644+
5645+ rung = &uksm_scan_ladder[0];
5646+ rung_add_new_slots(rung, slots, num);
5647+}
5648+
5649+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
5650+
5651+static void uksm_enter_all_slots(void)
5652+{
5653+ struct vma_slot *slot;
5654+ unsigned long index;
5655+ struct list_head empty_vma_list;
5656+ int i;
5657+
5658+ i = 0;
5659+ index = 0;
5660+ INIT_LIST_HEAD(&empty_vma_list);
5661+
5662+ spin_lock(&vma_slot_list_lock);
5663+ while (!list_empty(&vma_slot_new)) {
5664+ slot = list_entry(vma_slot_new.next,
5665+ struct vma_slot, slot_list);
5666+
5667+ if (!slot->vma->anon_vma) {
5668+ list_move(&slot->slot_list, &empty_vma_list);
5669+ } else if (vma_can_enter(slot->vma)) {
5670+ batch_slots[index++] = slot;
5671+ list_del_init(&slot->slot_list);
5672+ } else {
5673+ list_move(&slot->slot_list, &vma_slot_noadd);
5674+ }
5675+
5676+ if (++i == SPIN_LOCK_PERIOD ||
5677+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
5678+ spin_unlock(&vma_slot_list_lock);
5679+
5680+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
5681+ uksm_vma_enter(batch_slots, index);
5682+ index = 0;
5683+ }
5684+ i = 0;
5685+ cond_resched();
5686+ spin_lock(&vma_slot_list_lock);
5687+ }
5688+ }
5689+
5690+ list_splice(&empty_vma_list, &vma_slot_new);
5691+
5692+ spin_unlock(&vma_slot_list_lock);
5693+
5694+ if (index)
5695+ uksm_vma_enter(batch_slots, index);
5696+
5697+}
5698+
5699+static inline int rung_round_finished(struct scan_rung *rung)
5700+{
5701+ return rung->flags & UKSM_RUNG_ROUND_FINISHED;
5702+}
5703+
5704+static inline void judge_slot(struct vma_slot *slot)
5705+{
5706+ struct scan_rung *rung = slot->rung;
5707+ unsigned long dedup;
5708+ int deleted;
5709+
5710+ dedup = cal_dedup_ratio(slot);
5711+ if (vma_fully_scanned(slot) && uksm_thrash_threshold)
5712+ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
5713+ else if (dedup && dedup >= uksm_abundant_threshold)
5714+ deleted = vma_rung_up(slot);
5715+ else
5716+ deleted = vma_rung_down(slot);
5717+
5718+ slot->pages_merged = 0;
5719+ slot->pages_cowed = 0;
5720+ slot->this_sampled = 0;
5721+
5722+ if (vma_fully_scanned(slot))
5723+ slot->pages_scanned = 0;
5724+
5725+ slot->last_scanned = slot->pages_scanned;
5726+
5727+	/* If it was deleted above, the rung was already advanced. */
5728+ if (!deleted)
5729+ advance_current_scan(rung);
5730+}
5731+
5732+
5733+static inline int hash_round_finished(void)
5734+{
5735+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
5736+ scanned_virtual_pages = 0;
5737+ if (uksm_pages_scanned)
5738+ fully_scanned_round++;
5739+
5740+ return 1;
5741+ } else {
5742+ return 0;
5743+ }
5744+}
5745+
5746+#define UKSM_MMSEM_BATCH 5
5747+#define BUSY_RETRY 100
5748+
5749+/**
5750+ * uksm_do_scan() - the main worker function.
5751+ */
5752+static noinline void uksm_do_scan(void)
5753+{
5754+ struct vma_slot *slot, *iter;
5755+ struct mm_struct *busy_mm;
5756+	unsigned char round_finished, all_rungs_empty;
5757+ int i, err, mmsem_batch;
5758+ unsigned long pcost;
5759+ long long delta_exec;
5760+ unsigned long vpages, max_cpu_ratio;
5761+ unsigned long long start_time, end_time, scan_time;
5762+ unsigned int expected_jiffies;
5763+
5764+ might_sleep();
5765+
5766+ vpages = 0;
5767+
5768+ start_time = task_sched_runtime(current);
5769+ max_cpu_ratio = 0;
5770+ mmsem_batch = 0;
5771+
5772+ for (i = 0; i < SCAN_LADDER_SIZE;) {
5773+ struct scan_rung *rung = &uksm_scan_ladder[i];
5774+ unsigned long ratio;
5775+ int busy_retry;
5776+
5777+ if (!rung->pages_to_scan) {
5778+ i++;
5779+ continue;
5780+ }
5781+
5782+ if (!rung->vma_root.num) {
5783+ rung->pages_to_scan = 0;
5784+ i++;
5785+ continue;
5786+ }
5787+
5788+ ratio = rung_real_ratio(rung->cpu_ratio);
5789+ if (ratio > max_cpu_ratio)
5790+ max_cpu_ratio = ratio;
5791+
5792+ busy_retry = BUSY_RETRY;
5793+ /*
5794+		 * Do not consider rung_round_finished() here, just use up the
5795+ * rung->pages_to_scan quota.
5796+ */
5797+ while (rung->pages_to_scan && rung->vma_root.num &&
5798+ likely(!freezing(current))) {
5799+ int reset = 0;
5800+
5801+ slot = rung->current_scan;
5802+
5803+ BUG_ON(vma_fully_scanned(slot));
5804+
5805+ if (mmsem_batch)
5806+ err = 0;
5807+ else
5808+ err = try_down_read_slot_mmap_lock(slot);
5809+
5810+ if (err == -ENOENT) {
5811+rm_slot:
5812+ rung_rm_slot(slot);
5813+ continue;
5814+ }
5815+
5816+ busy_mm = slot->mm;
5817+
5818+ if (err == -EBUSY) {
5819+ /* skip other vmas on the same mm */
5820+ do {
5821+ reset = advance_current_scan(rung);
5822+ iter = rung->current_scan;
5823+ busy_retry--;
5824+ if (iter->vma->vm_mm != busy_mm ||
5825+ !busy_retry || reset)
5826+ break;
5827+ } while (1);
5828+
5829+ if (iter->vma->vm_mm != busy_mm) {
5830+ continue;
5831+ } else {
5832+					/* scan round finished */
5833+ break;
5834+ }
5835+ }
5836+
5837+ BUG_ON(!vma_can_enter(slot->vma));
5838+ if (uksm_test_exit(slot->vma->vm_mm)) {
5839+ mmsem_batch = 0;
5840+ mmap_read_unlock(slot->vma->vm_mm);
5841+ goto rm_slot;
5842+ }
5843+
5844+ if (mmsem_batch)
5845+ mmsem_batch--;
5846+ else
5847+ mmsem_batch = UKSM_MMSEM_BATCH;
5848+
5849+			/* OK, we have taken the mmap_lock, ready to scan */
5850+ scan_vma_one_page(slot);
5851+ rung->pages_to_scan--;
5852+ vpages++;
5853+
5854+ if (rung->current_offset + rung->step > slot->pages - 1
5855+ || vma_fully_scanned(slot)) {
5856+ mmap_read_unlock(slot->vma->vm_mm);
5857+ judge_slot(slot);
5858+ mmsem_batch = 0;
5859+ } else {
5860+ rung->current_offset += rung->step;
5861+ if (!mmsem_batch)
5862+ mmap_read_unlock(slot->vma->vm_mm);
5863+ }
5864+
5865+ busy_retry = BUSY_RETRY;
5866+ cond_resched();
5867+ }
5868+
5869+ if (mmsem_batch) {
5870+ mmap_read_unlock(slot->vma->vm_mm);
5871+ mmsem_batch = 0;
5872+ }
5873+
5874+ if (freezing(current))
5875+ break;
5876+
5877+ cond_resched();
5878+ }
5879+ end_time = task_sched_runtime(current);
5880+ delta_exec = end_time - start_time;
5881+
5882+ if (freezing(current))
5883+ return;
5884+
5885+ cleanup_vma_slots();
5886+ uksm_enter_all_slots();
5887+
5888+ round_finished = 1;
5889+	all_rungs_empty = 1;
5890+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
5891+ struct scan_rung *rung = &uksm_scan_ladder[i];
5892+
5893+ if (rung->vma_root.num) {
5894+			all_rungs_empty = 0;
5895+ if (!rung_round_finished(rung))
5896+ round_finished = 0;
5897+ }
5898+ }
5899+
5900+	if (all_rungs_empty)
5901+ round_finished = 0;
5902+
5903+ if (round_finished) {
5904+ round_update_ladder();
5905+ uksm_eval_round++;
5906+
5907+ if (hash_round_finished() && rshash_adjust()) {
5908+ /* Reset the unstable root iff hash strength changed */
5909+ uksm_hash_round++;
5910+ root_unstable_tree = RB_ROOT;
5911+ free_all_tree_nodes(&unstable_tree_node_list);
5912+ }
5913+
5914+ /*
5915+ * A number of pages can hang around indefinitely on per-cpu
5916+ * pagevecs, raised page count preventing write_protect_page
5917+ * from merging them. Though it doesn't really matter much,
5918+ * it is puzzling to see some stuck in pages_volatile until
5919+ * other activity jostles them out, and they also prevented
5920+ * LTP's KSM test from succeeding deterministically; so drain
5921+ * them here (here rather than on entry to uksm_do_scan(),
5922+ * so we don't IPI too often when pages_to_scan is set low).
5923+ */
5924+ lru_add_drain_all();
5925+ }
5926+
5927+
5928+ if (vpages && delta_exec > 0) {
5929+ pcost = (unsigned long) delta_exec / vpages;
5930+ if (likely(uksm_ema_page_time))
5931+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
5932+ else
5933+ uksm_ema_page_time = pcost;
5934+ }
5935+
5936+ uksm_calc_scan_pages();
5937+ uksm_sleep_real = uksm_sleep_jiffies;
5938+	/* in case of sudden CPU bursts, apply the upper bound */
5939+ end_time = task_sched_runtime(current);
5940+ if (max_cpu_ratio && end_time > start_time) {
5941+ scan_time = end_time - start_time;
5942+ expected_jiffies = msecs_to_jiffies(
5943+ scan_time_to_sleep(scan_time, max_cpu_ratio));
5944+
5945+ if (expected_jiffies > uksm_sleep_real)
5946+ uksm_sleep_real = expected_jiffies;
5947+
5948+		/* We have a 1 second upper bound for responsiveness. */
5949+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
5950+ uksm_sleep_real = msecs_to_jiffies(1000);
5951+ }
5952+
5953+ return;
5954+}
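The bookkeeping above divides the scheduler time spent in this pass (delta_exec) by the pages visited (vpages) and folds the resulting per-page cost into uksm_ema_page_time via ema(). The sketch below shows the kind of exponential moving average that implies; ema() itself is defined elsewhere in this patch, so EMA_ALPHA and the 100-point scale here are illustrative assumptions, not the patch's actual constants.

/* Illustrative only -- not part of the patch. */
#include <stdio.h>

#define EMA_ALPHA 20	/* assumed weight (percent) given to the newest sample */

static unsigned long ema_sketch(unsigned long sample, unsigned long old)
{
	/* blend the newest per-page cost into the running average */
	return (EMA_ALPHA * sample + (100 - EMA_ALPHA) * old) / 100;
}

int main(void)
{
	unsigned long ema = 0;
	unsigned long delta_exec = 5000000;	/* example: ns of CPU time spent in one pass */
	unsigned long vpages = 2500;		/* example: pages visited in that pass */
	unsigned long pcost = delta_exec / vpages;

	/* the first sample seeds the average, as in uksm_do_scan() */
	ema = ema ? ema_sketch(pcost, ema) : pcost;
	printf("per-page cost: %lu ns, ema: %lu ns\n", pcost, ema);
	return 0;
}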
5955+
5956+static int ksmd_should_run(void)
5957+{
5958+ return uksm_run & UKSM_RUN_MERGE;
5959+}
5960+
5961+static int uksm_scan_thread(void *nothing)
5962+{
5963+ set_freezable();
5964+ set_user_nice(current, 5);
5965+
5966+ while (!kthread_should_stop()) {
5967+ mutex_lock(&uksm_thread_mutex);
5968+ if (ksmd_should_run())
5969+ uksm_do_scan();
5970+ mutex_unlock(&uksm_thread_mutex);
5971+
5972+ try_to_freeze();
5973+
5974+ if (ksmd_should_run()) {
5975+ schedule_timeout_interruptible(uksm_sleep_real);
5976+ uksm_sleep_times++;
5977+ } else {
5978+ wait_event_freezable(uksm_thread_wait,
5979+ ksmd_should_run() || kthread_should_stop());
5980+ }
5981+ }
5982+ return 0;
5983+}
5984+
5985+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
5986+{
5987+ struct stable_node *stable_node;
5988+ struct node_vma *node_vma;
5989+ struct rmap_item *rmap_item;
5990+ int search_new_forks = 0;
5991+ unsigned long address;
5992+
5993+ VM_BUG_ON_PAGE(!PageKsm(page), page);
5994+ VM_BUG_ON_PAGE(!PageLocked(page), page);
5995+
5996+ stable_node = page_stable_node(page);
5997+ if (!stable_node)
5998+ return;
5999+again:
6000+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
6001+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
6002+ struct anon_vma *anon_vma = rmap_item->anon_vma;
6003+ struct anon_vma_chain *vmac;
6004+ struct vm_area_struct *vma;
6005+
6006+ cond_resched();
6007+ anon_vma_lock_read(anon_vma);
6008+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
6009+ 0, ULONG_MAX) {
6010+ cond_resched();
6011+ vma = vmac->vma;
6012+ address = get_rmap_addr(rmap_item);
6013+
6014+ if (address < vma->vm_start ||
6015+ address >= vma->vm_end)
6016+ continue;
6017+
6018+ if ((rmap_item->slot->vma == vma) ==
6019+ search_new_forks)
6020+ continue;
6021+
6022+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
6023+ continue;
6024+
6025+ if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
6026+ anon_vma_unlock_read(anon_vma);
6027+ return;
6028+ }
6029+
6030+ if (rwc->done && rwc->done(page)) {
6031+ anon_vma_unlock_read(anon_vma);
6032+ return;
6033+ }
6034+ }
6035+ anon_vma_unlock_read(anon_vma);
6036+ }
6037+ }
6038+ if (!search_new_forks++)
6039+ goto again;
6040+}
6041+
6042+#ifdef CONFIG_MIGRATION
6043+/* Common ksm interface, but the implementation may be specific to uksm */
6044+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
6045+{
6046+ struct stable_node *stable_node;
6047+
6048+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6049+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6050+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
6051+
6052+ stable_node = page_stable_node(newpage);
6053+ if (stable_node) {
6054+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
6055+ stable_node->kpfn = page_to_pfn(newpage);
6056+ /*
6057+ * newpage->mapping was set in advance; now we need smp_wmb()
6058+ * to make sure that the new stable_node->kpfn is visible
6059+ * to get_ksm_page() before it can see that oldpage->mapping
6060+ * has gone stale (or that PageSwapCache has been cleared).
6061+ */
6062+ smp_wmb();
6063+ set_page_stable_node(oldpage, NULL);
6064+ }
6065+}
6066+#endif /* CONFIG_MIGRATION */
6067+
6068+#ifdef CONFIG_MEMORY_HOTREMOVE
6069+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
6070+ unsigned long end_pfn)
6071+{
6072+ struct rb_node *node;
6073+
6074+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
6075+ struct stable_node *stable_node;
6076+
6077+ stable_node = rb_entry(node, struct stable_node, node);
6078+ if (stable_node->kpfn >= start_pfn &&
6079+ stable_node->kpfn < end_pfn)
6080+ return stable_node;
6081+ }
6082+ return NULL;
6083+}
6084+
6085+static int uksm_memory_callback(struct notifier_block *self,
6086+ unsigned long action, void *arg)
6087+{
6088+ struct memory_notify *mn = arg;
6089+ struct stable_node *stable_node;
6090+
6091+ switch (action) {
6092+ case MEM_GOING_OFFLINE:
6093+ /*
6094+ * Keep it very simple for now: just lock out ksmd and
6095+ * MADV_UNMERGEABLE while any memory is going offline.
6096+ * mutex_lock_nested() is necessary because lockdep was alarmed
6097+ * that here we take uksm_thread_mutex inside notifier chain
6098+ * mutex, and later take notifier chain mutex inside
6099+ * uksm_thread_mutex to unlock it. But that's safe because both
6100+ * are inside mem_hotplug_mutex.
6101+ */
6102+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
6103+ break;
6104+
6105+ case MEM_OFFLINE:
6106+ /*
6107+ * Most of the work is done by page migration; but there might
6108+ * be a few stable_nodes left over, still pointing to struct
6109+ * pages which have been offlined: prune those from the tree.
6110+ */
6111+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
6112+ mn->start_pfn + mn->nr_pages)) != NULL)
6113+ remove_node_from_stable_tree(stable_node, 1, 1);
6114+ /* fallthrough */
6115+
6116+ case MEM_CANCEL_OFFLINE:
6117+ mutex_unlock(&uksm_thread_mutex);
6118+ break;
6119+ }
6120+ return NOTIFY_OK;
6121+}
6122+#endif /* CONFIG_MEMORY_HOTREMOVE */
6123+
6124+#ifdef CONFIG_SYSFS
6125+/*
6126+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
6127+ */
6128+
6129+#define UKSM_ATTR_RO(_name) \
6130+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
6131+#define UKSM_ATTR(_name) \
6132+ static struct kobj_attribute _name##_attr = \
6133+ __ATTR(_name, 0644, _name##_show, _name##_store)
6134+
6135+static ssize_t max_cpu_percentage_show(struct kobject *kobj,
6136+ struct kobj_attribute *attr, char *buf)
6137+{
6138+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
6139+}
6140+
6141+static ssize_t max_cpu_percentage_store(struct kobject *kobj,
6142+ struct kobj_attribute *attr,
6143+ const char *buf, size_t count)
6144+{
6145+ unsigned long max_cpu_percentage;
6146+ int err;
6147+
6148+ err = kstrtoul(buf, 10, &max_cpu_percentage);
6149+ if (err || max_cpu_percentage > 100)
6150+ return -EINVAL;
6151+
6152+ if (max_cpu_percentage == 100)
6153+ max_cpu_percentage = 99;
6154+ else if (max_cpu_percentage < 10)
6155+ max_cpu_percentage = 10;
6156+
6157+ uksm_max_cpu_percentage = max_cpu_percentage;
6158+
6159+ return count;
6160+}
6161+UKSM_ATTR(max_cpu_percentage);
6162+
6163+static ssize_t sleep_millisecs_show(struct kobject *kobj,
6164+ struct kobj_attribute *attr, char *buf)
6165+{
6166+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
6167+}
6168+
6169+static ssize_t sleep_millisecs_store(struct kobject *kobj,
6170+ struct kobj_attribute *attr,
6171+ const char *buf, size_t count)
6172+{
6173+ unsigned long msecs;
6174+ int err;
6175+
6176+ err = kstrtoul(buf, 10, &msecs);
6177+ if (err || msecs > MSEC_PER_SEC)
6178+ return -EINVAL;
6179+
6180+ uksm_sleep_jiffies = msecs_to_jiffies(msecs);
6181+ uksm_sleep_saved = uksm_sleep_jiffies;
6182+
6183+ return count;
6184+}
6185+UKSM_ATTR(sleep_millisecs);
6186+
6187+
6188+static ssize_t cpu_governor_show(struct kobject *kobj,
6189+ struct kobj_attribute *attr, char *buf)
6190+{
6191+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6192+ int i;
6193+
6194+ buf[0] = '\0';
6195+ for (i = 0; i < n ; i++) {
6196+ if (uksm_cpu_governor == i)
6197+ strcat(buf, "[");
6198+
6199+ strcat(buf, uksm_cpu_governor_str[i]);
6200+
6201+ if (uksm_cpu_governor == i)
6202+ strcat(buf, "]");
6203+
6204+ strcat(buf, " ");
6205+ }
6206+ strcat(buf, "\n");
6207+
6208+ return strlen(buf);
6209+}
6210+
6211+static inline void init_performance_values(void)
6212+{
6213+ int i;
6214+ struct scan_rung *rung;
6215+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
6216+
6217+
6218+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6219+ rung = uksm_scan_ladder + i;
6220+ rung->cpu_ratio = preset->cpu_ratio[i];
6221+ rung->cover_msecs = preset->cover_msecs[i];
6222+ }
6223+
6224+ uksm_max_cpu_percentage = preset->max_cpu;
6225+}
6226+
6227+static ssize_t cpu_governor_store(struct kobject *kobj,
6228+ struct kobj_attribute *attr,
6229+ const char *buf, size_t count)
6230+{
6231+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
6232+
6233+ for (n--; n >= 0 ; n--) {
6234+ if (!strncmp(buf, uksm_cpu_governor_str[n],
6235+ strlen(uksm_cpu_governor_str[n])))
6236+ break;
6237+ }
6238+
6239+ if (n < 0)
6240+ return -EINVAL;
6241+ else
6242+ uksm_cpu_governor = n;
6243+
6244+ init_performance_values();
6245+
6246+ return count;
6247+}
6248+UKSM_ATTR(cpu_governor);
6249+
6250+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
6251+ char *buf)
6252+{
6253+ return sprintf(buf, "%u\n", uksm_run);
6254+}
6255+
6256+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
6257+ const char *buf, size_t count)
6258+{
6259+ int err;
6260+ unsigned long flags;
6261+
6262+ err = kstrtoul(buf, 10, &flags);
6263+ if (err || flags > UINT_MAX)
6264+ return -EINVAL;
6265+ if (flags > UKSM_RUN_MERGE)
6266+ return -EINVAL;
6267+
6268+ mutex_lock(&uksm_thread_mutex);
6269+ if (uksm_run != flags)
6270+ uksm_run = flags;
6271+ mutex_unlock(&uksm_thread_mutex);
6272+
6273+ if (flags & UKSM_RUN_MERGE)
6274+ wake_up_interruptible(&uksm_thread_wait);
6275+
6276+ return count;
6277+}
6278+UKSM_ATTR(run);
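The attribute group these knobs belong to is registered under mm_kobj with the name "uksm" (see uksm_init() below), so they are expected to appear as /sys/kernel/mm/uksm/*. A minimal userspace sketch of starting the scanner through the run attribute, assuming that path and assuming the value 1 corresponds to UKSM_RUN_MERGE:

/* Illustrative only -- toggle uksmd scanning via the run knob. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* path assumed from the "uksm" attribute group registered on mm_kobj */
	FILE *f = fopen("/sys/kernel/mm/uksm/run", "w");

	if (!f) {
		perror("open /sys/kernel/mm/uksm/run");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);	/* assumed: 1 == UKSM_RUN_MERGE; writing 0 stops scanning */
	fclose(f);
	return 0;
}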
6279+
6280+static ssize_t abundant_threshold_show(struct kobject *kobj,
6281+ struct kobj_attribute *attr, char *buf)
6282+{
6283+ return sprintf(buf, "%u\n", uksm_abundant_threshold);
6284+}
6285+
6286+static ssize_t abundant_threshold_store(struct kobject *kobj,
6287+ struct kobj_attribute *attr,
6288+ const char *buf, size_t count)
6289+{
6290+ int err;
6291+ unsigned long flags;
6292+
6293+ err = kstrtoul(buf, 10, &flags);
6294+ if (err || flags > 99)
6295+ return -EINVAL;
6296+
6297+ uksm_abundant_threshold = flags;
6298+
6299+ return count;
6300+}
6301+UKSM_ATTR(abundant_threshold);
6302+
6303+static ssize_t thrash_threshold_show(struct kobject *kobj,
6304+ struct kobj_attribute *attr, char *buf)
6305+{
6306+ return sprintf(buf, "%u\n", uksm_thrash_threshold);
6307+}
6308+
6309+static ssize_t thrash_threshold_store(struct kobject *kobj,
6310+ struct kobj_attribute *attr,
6311+ const char *buf, size_t count)
6312+{
6313+ int err;
6314+ unsigned long flags;
6315+
6316+ err = kstrtoul(buf, 10, &flags);
6317+ if (err || flags > 99)
6318+ return -EINVAL;
6319+
6320+ uksm_thrash_threshold = flags;
6321+
6322+ return count;
6323+}
6324+UKSM_ATTR(thrash_threshold);
6325+
6326+static ssize_t cpu_ratios_show(struct kobject *kobj,
6327+ struct kobj_attribute *attr, char *buf)
6328+{
6329+ int i, size;
6330+ struct scan_rung *rung;
6331+ char *p = buf;
6332+
6333+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6334+ rung = &uksm_scan_ladder[i];
6335+
6336+ if (rung->cpu_ratio > 0)
6337+ size = sprintf(p, "%d ", rung->cpu_ratio);
6338+ else
6339+ size = sprintf(p, "MAX/%d ",
6340+ TIME_RATIO_SCALE / -rung->cpu_ratio);
6341+
6342+ p += size;
6343+ }
6344+
6345+ *p++ = '\n';
6346+ *p = '\0';
6347+
6348+ return p - buf;
6349+}
6350+
6351+static ssize_t cpu_ratios_store(struct kobject *kobj,
6352+ struct kobj_attribute *attr,
6353+ const char *buf, size_t count)
6354+{
6355+ int i, cpuratios[SCAN_LADDER_SIZE], err;
6356+ unsigned long value;
6357+ struct scan_rung *rung;
6358+ char *p, *end = NULL;
6359+
6360+ p = kzalloc(count, GFP_KERNEL);
6361+ if (!p)
6362+ return -ENOMEM;
6363+
6364+ memcpy(p, buf, count);
6365+
6366+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6367+ if (i != SCAN_LADDER_SIZE - 1) {
6368+ end = strchr(p, ' ');
6369+ if (!end)
6370+ return -EINVAL;
6371+
6372+ *end = '\0';
6373+ }
6374+
6375+ if (strstr(p, "MAX/")) {
6376+ p = strchr(p, '/') + 1;
6377+ err = kstrtoul(p, 10, &value);
6378+ if (err || value > TIME_RATIO_SCALE || !value)
6379+ return -EINVAL;
6380+
6381+ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
6382+ } else {
6383+ err = kstrtoul(p, 10, &value);
6384+ if (err || value > TIME_RATIO_SCALE || !value)
6385+ return -EINVAL;
6386+
6387+ cpuratios[i] = value;
6388+ }
6389+
6390+ p = end + 1;
6391+ }
6392+
6393+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6394+ rung = &uksm_scan_ladder[i];
6395+
6396+ rung->cpu_ratio = cpuratios[i];
6397+ }
6398+
6399+ return count;
6400+}
6401+UKSM_ATTR(cpu_ratios);
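cpu_ratios_store() accepts one space-separated token per scan rung: either a plain ratio or "MAX/<n>", which is stored as -(TIME_RATIO_SCALE / n). A standalone sketch of that token conversion follows; TIME_RATIO_SCALE is defined elsewhere in this patch, so the value used here is only a placeholder.

/* Illustrative only -- mirrors the "MAX/<n>" vs plain-ratio conversion above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TIME_RATIO_SCALE 1000	/* placeholder; the real constant is defined in uksm.c */

static int parse_ratio_token(const char *tok)
{
	if (!strncmp(tok, "MAX/", 4)) {
		unsigned long div = strtoul(tok + 4, NULL, 10);

		if (!div)
			div = 1;	/* the kernel handler rejects 0 outright */
		/* a negative value encodes "a fraction of the maximum scan speed" */
		return -(int)(TIME_RATIO_SCALE / div);
	}
	return (int)strtoul(tok, NULL, 10);
}

int main(void)
{
	printf("%d %d\n", parse_ratio_token("20"), parse_ratio_token("MAX/4"));
	return 0;
}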
6402+
6403+static ssize_t eval_intervals_show(struct kobject *kobj,
6404+ struct kobj_attribute *attr, char *buf)
6405+{
6406+ int i, size;
6407+ struct scan_rung *rung;
6408+ char *p = buf;
6409+
6410+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6411+ rung = &uksm_scan_ladder[i];
6412+ size = sprintf(p, "%u ", rung->cover_msecs);
6413+ p += size;
6414+ }
6415+
6416+ *p++ = '\n';
6417+ *p = '\0';
6418+
6419+ return p - buf;
6420+}
6421+
6422+static ssize_t eval_intervals_store(struct kobject *kobj,
6423+ struct kobj_attribute *attr,
6424+ const char *buf, size_t count)
6425+{
6426+ int i, err;
6427+ unsigned long values[SCAN_LADDER_SIZE];
6428+ struct scan_rung *rung;
6429+ char *p, *end = NULL;
6430+ ssize_t ret = count;
6431+
6432+ p = kzalloc(count + 2, GFP_KERNEL);
6433+ if (!p)
6434+ return -ENOMEM;
6435+
6436+ memcpy(p, buf, count);
6437+
6438+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6439+ if (i != SCAN_LADDER_SIZE - 1) {
6440+ end = strchr(p, ' ');
6441+ if (!end) {
6442+ ret = -EINVAL;
6443+ goto out;
6444+ }
6445+
6446+ *end = '\0';
6447+ }
6448+
6449+ err = kstrtoul(p, 10, &values[i]);
6450+ if (err) {
6451+ ret = -EINVAL;
6452+ goto out;
6453+ }
6454+
6455+ p = end + 1;
6456+ }
6457+
6458+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6459+ rung = &uksm_scan_ladder[i];
6460+
6461+ rung->cover_msecs = values[i];
6462+ }
6463+
6464+out:
6465+ kfree(p);
6466+ return ret;
6467+}
6468+UKSM_ATTR(eval_intervals);
6469+
6470+static ssize_t ema_per_page_time_show(struct kobject *kobj,
6471+ struct kobj_attribute *attr, char *buf)
6472+{
6473+ return sprintf(buf, "%lu\n", uksm_ema_page_time);
6474+}
6475+UKSM_ATTR_RO(ema_per_page_time);
6476+
6477+static ssize_t pages_shared_show(struct kobject *kobj,
6478+ struct kobj_attribute *attr, char *buf)
6479+{
6480+ return sprintf(buf, "%lu\n", uksm_pages_shared);
6481+}
6482+UKSM_ATTR_RO(pages_shared);
6483+
6484+static ssize_t pages_sharing_show(struct kobject *kobj,
6485+ struct kobj_attribute *attr, char *buf)
6486+{
6487+ return sprintf(buf, "%lu\n", uksm_pages_sharing);
6488+}
6489+UKSM_ATTR_RO(pages_sharing);
6490+
6491+static ssize_t pages_unshared_show(struct kobject *kobj,
6492+ struct kobj_attribute *attr, char *buf)
6493+{
6494+ return sprintf(buf, "%lu\n", uksm_pages_unshared);
6495+}
6496+UKSM_ATTR_RO(pages_unshared);
6497+
6498+static ssize_t full_scans_show(struct kobject *kobj,
6499+ struct kobj_attribute *attr, char *buf)
6500+{
6501+ return sprintf(buf, "%llu\n", fully_scanned_round);
6502+}
6503+UKSM_ATTR_RO(full_scans);
6504+
6505+static ssize_t pages_scanned_show(struct kobject *kobj,
6506+ struct kobj_attribute *attr, char *buf)
6507+{
6508+ unsigned long base = 0;
6509+ u64 delta, ret;
6510+
6511+ if (pages_scanned_stored) {
6512+ base = pages_scanned_base;
6513+ ret = pages_scanned_stored;
6514+ delta = uksm_pages_scanned >> base;
6515+ if (CAN_OVERFLOW_U64(ret, delta)) {
6516+ ret >>= 1;
6517+ delta >>= 1;
6518+ base++;
6519+ ret += delta;
6520+ }
6521+ } else {
6522+ ret = uksm_pages_scanned;
6523+ }
6524+
6525+ while (ret > ULONG_MAX) {
6526+ ret >>= 1;
6527+ base++;
6528+ }
6529+
6530+ if (base)
6531+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
6532+ else
6533+ return sprintf(buf, "%lu\n", (unsigned long)ret);
6534+}
6535+UKSM_ATTR_RO(pages_scanned);
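Once the 64-bit total no longer fits in an unsigned long, pages_scanned is reported as "<value> * 2^<base>" rather than a plain count. A short sketch of recovering the approximate total from such a reading (the numbers below are examples, not real sysfs output):

/* Illustrative only -- reconstruct the approximate total from "N * 2^B". */
#include <stdio.h>

int main(void)
{
	unsigned long long value = 3000000000ULL;	/* example "N" read from sysfs */
	unsigned long base = 3;				/* example "B" read from sysfs */
	unsigned long long approx_total = value << base;

	printf("approximately %llu pages scanned\n", approx_total);
	return 0;
}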
6536+
6537+static ssize_t hash_strength_show(struct kobject *kobj,
6538+ struct kobj_attribute *attr, char *buf)
6539+{
6540+ return sprintf(buf, "%lu\n", hash_strength);
6541+}
6542+UKSM_ATTR_RO(hash_strength);
6543+
6544+static ssize_t sleep_times_show(struct kobject *kobj,
6545+ struct kobj_attribute *attr, char *buf)
6546+{
6547+ return sprintf(buf, "%llu\n", uksm_sleep_times);
6548+}
6549+UKSM_ATTR_RO(sleep_times);
6550+
6551+
6552+static struct attribute *uksm_attrs[] = {
6553+ &max_cpu_percentage_attr.attr,
6554+ &sleep_millisecs_attr.attr,
6555+ &cpu_governor_attr.attr,
6556+ &run_attr.attr,
6557+ &ema_per_page_time_attr.attr,
6558+ &pages_shared_attr.attr,
6559+ &pages_sharing_attr.attr,
6560+ &pages_unshared_attr.attr,
6561+ &full_scans_attr.attr,
6562+ &pages_scanned_attr.attr,
6563+ &hash_strength_attr.attr,
6564+ &sleep_times_attr.attr,
6565+ &thrash_threshold_attr.attr,
6566+ &abundant_threshold_attr.attr,
6567+ &cpu_ratios_attr.attr,
6568+ &eval_intervals_attr.attr,
6569+ NULL,
6570+};
6571+
6572+static struct attribute_group uksm_attr_group = {
6573+ .attrs = uksm_attrs,
6574+ .name = "uksm",
6575+};
6576+#endif /* CONFIG_SYSFS */
6577+
6578+static inline void init_scan_ladder(void)
6579+{
6580+ int i;
6581+ struct scan_rung *rung;
6582+
6583+ for (i = 0; i < SCAN_LADDER_SIZE; i++) {
6584+ rung = uksm_scan_ladder + i;
6585+ slot_tree_init_root(&rung->vma_root);
6586+ }
6587+
6588+ init_performance_values();
6589+ uksm_calc_scan_pages();
6590+}
6591+
6592+static inline int cal_positive_negative_costs(void)
6593+{
6594+ struct page *p1, *p2;
6595+ unsigned char *addr1, *addr2;
6596+ unsigned long i, time_start, hash_cost;
6597+ unsigned long loopnum = 0;
6598+
6599+	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
6600+ volatile u32 hash;
6601+ volatile int ret;
6602+
6603+ p1 = alloc_page(GFP_KERNEL);
6604+ if (!p1)
6605+ return -ENOMEM;
6606+
6607+ p2 = alloc_page(GFP_KERNEL);
6608+ if (!p2)
6609+ return -ENOMEM;
6610+
6611+ addr1 = kmap_atomic(p1);
6612+ addr2 = kmap_atomic(p2);
6613+ memset(addr1, prandom_u32(), PAGE_SIZE);
6614+ memcpy(addr2, addr1, PAGE_SIZE);
6615+
6616+	/* make sure that the two pages differ in the last byte */
6617+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
6618+ kunmap_atomic(addr2);
6619+ kunmap_atomic(addr1);
6620+
6621+ time_start = jiffies;
6622+ while (jiffies - time_start < 100) {
6623+ for (i = 0; i < 100; i++)
6624+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
6625+ loopnum += 100;
6626+ }
6627+ hash_cost = (jiffies - time_start);
6628+
6629+ time_start = jiffies;
6630+ for (i = 0; i < loopnum; i++)
6631+ ret = pages_identical_with_cost(p1, p2);
6632+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
6633+ memcmp_cost /= hash_cost;
6634+ pr_info("UKSM: relative memcmp_cost = %lu "
6635+ "hash=%u cmp_ret=%d.\n",
6636+ memcmp_cost, hash, ret);
6637+
6638+ __free_page(p1);
6639+ __free_page(p2);
6640+ return 0;
6641+}
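The calibration above times page_hash() at full strength against a byte-wise comparison of two nearly identical pages, and expresses the comparison cost in hash-strength units: memcmp_cost = HASH_STRENGTH_FULL * memcmp_time / hash_time. A worked example with made-up timings and a placeholder constant (the real HASH_STRENGTH_FULL is defined elsewhere in this patch):

/* Illustrative only -- the relative-cost arithmetic with invented timings. */
#include <stdio.h>

#define HASH_STRENGTH_FULL_EXAMPLE 1024	/* placeholder for the real constant in uksm.c */

int main(void)
{
	unsigned long hash_jiffies = 100;	/* time spent hashing the calibration loops */
	unsigned long memcmp_jiffies = 25;	/* time spent comparing the same loops */
	unsigned long memcmp_cost;

	memcmp_cost = HASH_STRENGTH_FULL_EXAMPLE * memcmp_jiffies / hash_jiffies;
	printf("relative memcmp_cost = %lu\n", memcmp_cost);
	return 0;
}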
6642+
6643+static int init_zeropage_hash_table(void)
6644+{
6645+ struct page *page;
6646+ char *addr;
6647+ int i;
6648+
6649+ page = alloc_page(GFP_KERNEL);
6650+ if (!page)
6651+ return -ENOMEM;
6652+
6653+ addr = kmap_atomic(page);
6654+ memset(addr, 0, PAGE_SIZE);
6655+ kunmap_atomic(addr);
6656+
6657+ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
6658+ GFP_KERNEL);
6659+ if (!zero_hash_table)
6660+ return -ENOMEM;
6661+
6662+ for (i = 0; i < HASH_STRENGTH_MAX; i++)
6663+ zero_hash_table[i] = page_hash(page, i, 0);
6664+
6665+ __free_page(page);
6666+
6667+ return 0;
6668+}
6669+
6670+static inline int init_random_sampling(void)
6671+{
6672+ unsigned long i;
6673+
6674+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
6675+ if (!random_nums)
6676+ return -ENOMEM;
6677+
6678+ for (i = 0; i < HASH_STRENGTH_FULL; i++)
6679+ random_nums[i] = i;
6680+
6681+ for (i = 0; i < HASH_STRENGTH_FULL; i++) {
6682+ unsigned long rand_range, swap_index, tmp;
6683+
6684+ rand_range = HASH_STRENGTH_FULL - i;
6685+ swap_index = i + prandom_u32() % rand_range;
6686+ tmp = random_nums[i];
6687+ random_nums[i] = random_nums[swap_index];
6688+ random_nums[swap_index] = tmp;
6689+ }
6690+
6691+ rshash_state.state = RSHASH_NEW;
6692+ rshash_state.below_count = 0;
6693+ rshash_state.lookup_window_index = 0;
6694+
6695+ return cal_positive_negative_costs();
6696+}
6697+
6698+static int __init uksm_slab_init(void)
6699+{
6700+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
6701+ if (!rmap_item_cache)
6702+ goto out;
6703+
6704+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
6705+ if (!stable_node_cache)
6706+ goto out_free1;
6707+
6708+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
6709+ if (!node_vma_cache)
6710+ goto out_free2;
6711+
6712+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
6713+ if (!vma_slot_cache)
6714+ goto out_free3;
6715+
6716+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
6717+ if (!tree_node_cache)
6718+ goto out_free4;
6719+
6720+ return 0;
6721+
6722+out_free4:
6723+ kmem_cache_destroy(vma_slot_cache);
6724+out_free3:
6725+ kmem_cache_destroy(node_vma_cache);
6726+out_free2:
6727+ kmem_cache_destroy(stable_node_cache);
6728+out_free1:
6729+ kmem_cache_destroy(rmap_item_cache);
6730+out:
6731+ return -ENOMEM;
6732+}
6733+
6734+static void __init uksm_slab_free(void)
6735+{
6736+ kmem_cache_destroy(stable_node_cache);
6737+ kmem_cache_destroy(rmap_item_cache);
6738+ kmem_cache_destroy(node_vma_cache);
6739+ kmem_cache_destroy(vma_slot_cache);
6740+ kmem_cache_destroy(tree_node_cache);
6741+}
6742+
6743+/* Common interface to ksm, but the behavior differs. */
6744+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
6745+ unsigned long end, int advice, unsigned long *vm_flags)
6746+{
6747+ int err;
6748+
6749+ switch (advice) {
6750+ case MADV_MERGEABLE:
6751+ return 0; /* just ignore the advice */
6752+
6753+ case MADV_UNMERGEABLE:
6754+ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
6755+ return 0; /* just ignore the advice */
6756+
6757+ if (vma->anon_vma) {
6758+ err = unmerge_uksm_pages(vma, start, end);
6759+ if (err)
6760+ return err;
6761+ }
6762+
6763+ uksm_remove_vma(vma);
6764+ *vm_flags &= ~VM_MERGEABLE;
6765+ break;
6766+ }
6767+
6768+ return 0;
6769+}
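ksm_madvise() above accepts MADV_MERGEABLE but ignores it, so the only advice that changes behavior here is MADV_UNMERGEABLE, which unmerges the range and drops the VMA from scanning. A minimal userspace sketch of opting a mapping out:

/* Illustrative only -- opt an anonymous mapping out of merging. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	memset(p, 0x5a, len);	/* duplicated content that could otherwise be merged */

	if (madvise(p, len, MADV_UNMERGEABLE))
		perror("madvise(MADV_UNMERGEABLE)");

	munmap(p, len);
	return 0;
}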
6770+
6771+/* Common interface to ksm; the implementation is the same. */
6772+struct page *ksm_might_need_to_copy(struct page *page,
6773+ struct vm_area_struct *vma, unsigned long address)
6774+{
6775+ struct anon_vma *anon_vma = page_anon_vma(page);
6776+ struct page *new_page;
6777+
6778+ if (PageKsm(page)) {
6779+ if (page_stable_node(page))
6780+ return page; /* no need to copy it */
6781+ } else if (!anon_vma) {
6782+ return page; /* no need to copy it */
6783+ } else if (anon_vma->root == vma->anon_vma->root &&
6784+ page->index == linear_page_index(vma, address)) {
6785+ return page; /* still no need to copy it */
6786+ }
6787+ if (!PageUptodate(page))
6788+ return page; /* let do_swap_page report the error */
6789+
6790+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
6791+ if (new_page) {
6792+ copy_user_highpage(new_page, page, address, vma);
6793+
6794+ SetPageDirty(new_page);
6795+ __SetPageUptodate(new_page);
6796+ __SetPageLocked(new_page);
6797+ }
6798+
6799+ return new_page;
6800+}
6801+
6802+/* Copied from mm/ksm.c; required since kernel 5.1 */
6803+bool reuse_ksm_page(struct page *page,
6804+ struct vm_area_struct *vma,
6805+ unsigned long address)
6806+{
6807+#ifdef CONFIG_DEBUG_VM
6808+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
6809+ WARN_ON(!page_mapped(page)) ||
6810+ WARN_ON(!PageLocked(page))) {
6811+ dump_page(page, "reuse_ksm_page");
6812+ return false;
6813+ }
6814+#endif
6815+
6816+ if (PageSwapCache(page) || !page_stable_node(page))
6817+ return false;
6818+ /* Prohibit parallel get_ksm_page() */
6819+ if (!page_ref_freeze(page, 1))
6820+ return false;
6821+
6822+ page_move_anon_rmap(page, vma);
6823+ page->index = linear_page_index(vma, address);
6824+ page_ref_unfreeze(page, 1);
6825+
6826+ return true;
6827+}
6828+
6829+static int __init uksm_init(void)
6830+{
6831+ struct task_struct *uksm_thread;
6832+ int err;
6833+
6834+ uksm_sleep_jiffies = msecs_to_jiffies(100);
6835+ uksm_sleep_saved = uksm_sleep_jiffies;
6836+
6837+ slot_tree_init();
6838+ init_scan_ladder();
6839+
6840+
6841+ err = init_random_sampling();
6842+ if (err)
6843+ goto out_free2;
6844+
6845+ err = uksm_slab_init();
6846+ if (err)
6847+ goto out_free1;
6848+
6849+ err = init_zeropage_hash_table();
6850+ if (err)
6851+ goto out_free0;
6852+
6853+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
6854+ if (IS_ERR(uksm_thread)) {
6855+ pr_err("uksm: creating kthread failed\n");
6856+ err = PTR_ERR(uksm_thread);
6857+ goto out_free;
6858+ }
6859+
6860+#ifdef CONFIG_SYSFS
6861+ err = sysfs_create_group(mm_kobj, &uksm_attr_group);
6862+ if (err) {
6863+ pr_err("uksm: register sysfs failed\n");
6864+ kthread_stop(uksm_thread);
6865+ goto out_free;
6866+ }
6867+#else
6868+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */
6869+
6870+#endif /* CONFIG_SYSFS */
6871+
6872+#ifdef CONFIG_MEMORY_HOTREMOVE
6873+ /*
6874+ * Choose a high priority since the callback takes uksm_thread_mutex:
6875+ * later callbacks could only be taking locks which nest within that.
6876+ */
6877+ hotplug_memory_notifier(uksm_memory_callback, 100);
6878+#endif
6879+ return 0;
6880+
6881+out_free:
6882+ kfree(zero_hash_table);
6883+out_free0:
6884+ uksm_slab_free();
6885+out_free1:
6886+ kfree(random_nums);
6887+out_free2:
6888+ kfree(uksm_scan_ladder);
6889+ return err;
6890+}
6891+
6892+#ifdef MODULE
6893+subsys_initcall(uksm_init);
6894+#else
6895+late_initcall(uksm_init);
6896+#endif
6897+
6898diff -Naur a/mm/vmstat.c b/mm/vmstat.c
6899--- a/mm/vmstat.c 2020-06-14 21:45:04.000000000 +0200
6900+++ b/mm/vmstat.c 2020-06-15 10:34:30.892359059 +0200
6901@@ -1173,6 +1173,9 @@
6902 "nr_foll_pin_acquired",
6903 "nr_foll_pin_released",
6904
6905+#ifdef CONFIG_UKSM
6906+ "nr_uksm_zero_pages",
6907+#endif
6908 /* enum writeback_stat_item counters */
6909 "nr_dirty_threshold",
6910 "nr_dirty_background_threshold",