9ZuuzWcX

· 6 years ago · Dec 04, 2019, 04:50 PM
1// SPDX-License-Identifier: GPL-2.0
2/*
3 *  linux/mm/vmscan.c
4 *
5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6 *
7 *  Swap reorganised 29.12.95, Stephen Tweedie.
8 *  kswapd added: 7.1.96  sct
9 *  Removed kswapd_ctl limits, and swap out as many pages as needed
10 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
11 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
12 *  Multiqueue VM started 5.8.00, Rik van Riel.
13 */
14
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17#include <linux/mm.h>
18#include <linux/sched/mm.h>
19#include <linux/module.h>
20#include <linux/gfp.h>
21#include <linux/kernel_stat.h>
22#include <linux/swap.h>
23#include <linux/pagemap.h>
24#include <linux/init.h>
25#include <linux/highmem.h>
26#include <linux/vmpressure.h>
27#include <linux/vmstat.h>
28#include <linux/file.h>
29#include <linux/writeback.h>
30#include <linux/blkdev.h>
31#include <linux/buffer_head.h>	/* for try_to_release_page(),
32				   buffer_heads_over_limit */
33#include <linux/mm_inline.h>
34#include <linux/backing-dev.h>
35#include <linux/rmap.h>
36#include <linux/topology.h>
37#include <linux/cpu.h>
38#include <linux/cpuset.h>
39#include <linux/compaction.h>
40#include <linux/notifier.h>
41#include <linux/rwsem.h>
42#include <linux/delay.h>
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45#include <linux/memcontrol.h>
46#include <linux/delayacct.h>
47#include <linux/sysctl.h>
48#include <linux/oom.h>
49#include <linux/prefetch.h>
50#include <linux/printk.h>
51#include <linux/dax.h>
52
53#include <asm/tlbflush.h>
54#include <asm/div64.h>
55
56#include <linux/swapops.h>
57#include <linux/balloon_compaction.h>
58
59#include "internal.h"
60
61#define CREATE_TRACE_POINTS
62#include <trace/events/vmscan.h>
/*
 * State defined in another translation unit.
 * NOTE(review): the names suggest per-window lifetime counters protected by
 * lifeTime_mutex, but nothing in this part of the file uses them -- confirm
 * against the defining file.
 */
extern unsigned long lifeTime_cur_window;
extern unsigned long lifeTime_prev_window;
extern struct mutex lifeTime_mutex;
66
67struct scan_control {
68	/* How many pages shrink_list() should reclaim */
69	unsigned long nr_to_reclaim;
70
71	/* This context's GFP mask */
72	gfp_t gfp_mask;
73
74	/* Allocation order */
75	int order;
76
77	/*
78	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
79	 * are scanned.
80	 */
81	nodemask_t	*nodemask;
82
83	/*
84	 * The memory cgroup that hit its limit and as a result is the
85	 * primary target of this reclaim invocation.
86	 */
87	struct mem_cgroup *target_mem_cgroup;
88
89	/* Scan (total_size >> priority) pages at once */
90	int priority;
91
92	/* The highest zone to isolate pages for reclaim from */
93	enum zone_type reclaim_idx;
94
95	/* Writepage batching in laptop mode; RECLAIM_WRITE */
96	unsigned int may_writepage:1;
97
98	/* Can mapped pages be reclaimed? */
99	unsigned int may_unmap:1;
100
101	/* Can pages be swapped as part of reclaim? */
102	unsigned int may_swap:1;
103
104	/*
105	 * Cgroups are not reclaimed below their configured memory.low,
106	 * unless we threaten to OOM. If any cgroups are skipped due to
107	 * memory.low and nothing was reclaimed, go back for memory.low.
108	 */
109	unsigned int memcg_low_reclaim:1;
110	unsigned int memcg_low_skipped:1;
111
112	unsigned int hibernation_mode:1;
113
114	/* One of the zones is ready for compaction */
115	unsigned int compaction_ready:1;
116
117	/* Searching for pages to promote */
118	unsigned int only_promote:1;
119
120	/* Incremented by the number of inactive pages that were scanned */
121	unsigned long nr_scanned;
122
123	/* Number of pages freed so far during a call to shrink_zones() */
124	unsigned long nr_reclaimed;
125};
126
/*
 * prefetch_prev_lru_page - prefetch a field of the neighbouring LRU page.
 * @_page:  page whose ->lru list node is inspected
 * @_base:  list head; nothing is prefetched when @_page's ->lru.prev
 *          equals it
 * @_field: member of struct page whose address is passed to prefetch()
 *
 * The neighbour is whatever lru_to_page() yields for @_page's list node.
 * Every macro argument is parenthesised and no local variable is declared,
 * so expression arguments and an argument that happens to be called "prev"
 * expand correctly.  When ARCH_HAS_PREFETCH is not defined the macro
 * expands to an empty statement and its arguments are not evaluated.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base))			\
			prefetch(&lru_to_page(&((_page)->lru))->_field);	\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
140
/*
 * prefetchw_prev_lru_page - prefetchw() variant of the neighbour prefetch.
 * @_page:  page whose ->lru list node is inspected
 * @_base:  list head; nothing is prefetched when @_page's ->lru.prev
 *          equals it
 * @_field: member of struct page whose address is passed to prefetchw()
 *
 * The neighbour is whatever lru_to_page() yields for @_page's list node.
 * Every macro argument is parenthesised and no local variable is declared,
 * so expression arguments and an argument that happens to be called "prev"
 * expand correctly.  When ARCH_HAS_PREFETCHW is not defined the macro
 * expands to an empty statement and its arguments are not evaluated.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base))			\
			prefetchw(&lru_to_page(&((_page)->lru))->_field);	\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
154
/* Swappiness knob in the range 0 .. 100; a larger value means more swappy. */
int vm_swappiness = 60;

/*
 * Sum, over all zones, of the pages that lie beyond the high watermark.
 */
unsigned long vm_total_pages;
164
/* Registered shrinkers; additions and removals take shrinker_rwsem for write. */
static LIST_HEAD(shrinker_list);
/* Held for read while shrink_slab() walks shrinker_list. */
static DECLARE_RWSEM(shrinker_rwsem);
167
#ifdef CONFIG_MEMCG
/* Reclaim is global when no memory cgroup is the target of this scan. */
static bool global_reclaim(struct scan_control *sc)
{
	return sc->target_mem_cgroup == NULL;
}

/**
 * sane_reclaim - can this scan rely on the regular dirty throttling?
 * @sc: scan_control in question
 *
 * With the legacy memcg the page dirty throttling done by
 * balance_dirty_pages() is completely broken, and direct stalling in
 * shrink_page_list() takes its place.  That substitute has none of the
 * fairness, adaptive pausing, bandwidth proportional allocation or
 * configurability of the real mechanism.
 *
 * Returns true when the vmscan in progress may assume that the normal
 * dirty throttling mechanism is operational: always for a scan without
 * a target cgroup, and otherwise only when CONFIG_CGROUP_WRITEBACK is
 * set and the memory controller is on the default hierarchy.
 */
static bool sane_reclaim(struct scan_control *sc)
{
	if (sc->target_mem_cgroup == NULL)
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	return cgroup_subsys_on_dfl(memory_cgrp_subsys);
#else
	return false;
#endif
}
#else
/* Without CONFIG_MEMCG every reclaim is global. */
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

/* Without CONFIG_MEMCG the normal dirty throttling is always assumed. */
static bool sane_reclaim(struct scan_control *sc)
{
	return true;
}
#endif
210
211/*
212 * This misses isolated pages which are not accounted for to save counters.
213 * As the data only determines if reclaim or compaction continues, it is
214 * not expected that isolated pages will be a dominating factor.
215 */
216unsigned long zone_reclaimable_pages(struct zone *zone)
217{
218	unsigned long nr;
219
220	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
221		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
222	if (get_nr_swap_pages() > 0)
223		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
224			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
225
226	return nr;
227}
228
229unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
230{
231	unsigned long nr;
232
233	nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
234		node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
235		node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
236
237	if (get_nr_swap_pages() > 0)
238		nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
239			node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
240			node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
241
242	return nr;
243}
244
245/**
246 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
247 * @lruvec: lru vector
248 * @lru: lru to use
249 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
250 */
251unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
252{
253	unsigned long lru_size;
254	int zid;
255
256	if (!mem_cgroup_disabled())
257		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
258	else
259		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
260
261	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
262		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
263		unsigned long size;
264
265		if (!managed_zone(zone))
266			continue;
267
268		if (!mem_cgroup_disabled())
269			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
270		else
271			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
272					NR_ZONE_LRU_BASE + lru);
273		lru_size -= min(size, lru_size);
274	}
275
276	return lru_size;
277
278}
279
280/*
281 * Add a shrinker callback to be called from the vm.
282 */
283int register_shrinker(struct shrinker *shrinker)
284{
285	size_t size = sizeof(*shrinker->nr_deferred);
286
287	if (shrinker->flags & SHRINKER_NUMA_AWARE)
288		size *= nr_node_ids;
289
290	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
291	if (!shrinker->nr_deferred)
292		return -ENOMEM;
293
294	down_write(&shrinker_rwsem);
295	list_add_tail(&shrinker->list, &shrinker_list);
296	up_write(&shrinker_rwsem);
297	return 0;
298}
299EXPORT_SYMBOL(register_shrinker);
300
301/*
302 * Remove one
303 */
304void unregister_shrinker(struct shrinker *shrinker)
305{
306	if (!shrinker->nr_deferred)
307		return;
308	down_write(&shrinker_rwsem);
309	list_del(&shrinker->list);
310	up_write(&shrinker_rwsem);
311	kfree(shrinker->nr_deferred);
312	shrinker->nr_deferred = NULL;
313}
314EXPORT_SYMBOL(unregister_shrinker);
315
316#define SHRINK_BATCH 128
317
318static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
319		struct shrinker *shrinker,
320		unsigned long nr_scanned,
321		unsigned long nr_eligible)
322{
323	unsigned long freed = 0;
324	unsigned long long delta;
325	long total_scan;
326	long freeable;
327	long nr;
328	long new_nr;
329	int nid = shrinkctl->nid;
330	long batch_size = shrinker->batch ? shrinker->batch
331		: SHRINK_BATCH;
332	long scanned = 0, next_deferred;
333
334	freeable = shrinker->count_objects(shrinker, shrinkctl);
335	if (freeable == 0)
336		return 0;
337
338	/*
339	 * copy the current shrinker scan count into a local variable
340	 * and zero it so that other concurrent shrinker invocations
341	 * don't also do this scanning work.
342	 */
343	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
344
345	total_scan = nr;
346	delta = (4 * nr_scanned) / shrinker->seeks;
347	delta *= freeable;
348	do_div(delta, nr_eligible + 1);
349	total_scan += delta;
350	if (total_scan < 0) {
351		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
352				shrinker->scan_objects, total_scan);
353		total_scan = freeable;
354		next_deferred = nr;
355	} else
356		next_deferred = total_scan;
357
358	/*
359	 * We need to avoid excessive windup on filesystem shrinkers
360	 * due to large numbers of GFP_NOFS allocations causing the
361	 * shrinkers to return -1 all the time. This results in a large
362	 * nr being built up so when a shrink that can do some work
363	 * comes along it empties the entire cache due to nr >>>
364	 * freeable. This is bad for sustaining a working set in
365	 * memory.
366	 *
367	 * Hence only allow the shrinker to scan the entire cache when
368	 * a large delta change is calculated directly.
369	 */
370	if (delta < freeable / 4)
371		total_scan = min(total_scan, freeable / 2);
372
373	/*
374	 * Avoid risking looping forever due to too large nr value:
375	 * never try to free more than twice the estimate number of
376	 * freeable entries.
377	 */
378	if (total_scan > freeable * 2)
379		total_scan = freeable * 2;
380
381	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
382			nr_scanned, nr_eligible,
383			freeable, delta, total_scan);
384
385	/*
386	 * Normally, we should not scan less than batch_size objects in one
387	 * pass to avoid too frequent shrinker calls, but if the slab has less
388	 * than batch_size objects in total and we are really tight on memory,
389	 * we will try to reclaim all available objects, otherwise we can end
390	 * up failing allocations although there are plenty of reclaimable
391	 * objects spread over several slabs with usage less than the
392	 * batch_size.
393	 *
394	 * We detect the "tight on memory" situations by looking at the total
395	 * number of objects we want to scan (total_scan). If it is greater
396	 * than the total number of objects on slab (freeable), we must be
397	 * scanning at high prio and therefore should try to reclaim as much as
398	 * possible.
399	 */
400	while (total_scan >= batch_size ||
401			total_scan >= freeable) {
402		unsigned long ret;
403		unsigned long nr_to_scan = min(batch_size, total_scan);
404
405		shrinkctl->nr_to_scan = nr_to_scan;
406		shrinkctl->nr_scanned = nr_to_scan;
407		ret = shrinker->scan_objects(shrinker, shrinkctl);
408		if (ret == SHRINK_STOP)
409			break;
410		freed += ret;
411
412		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
413		total_scan -= shrinkctl->nr_scanned;
414		scanned += shrinkctl->nr_scanned;
415
416		cond_resched();
417	}
418
419	if (next_deferred >= scanned)
420		next_deferred -= scanned;
421	else
422		next_deferred = 0;
423	/*
424	 * move the unused scan count back into the shrinker in a
425	 * manner that handles concurrent updates. If we exhausted the
426	 * scan, there is no need to do an update.
427	 */
428	if (next_deferred > 0)
429		new_nr = atomic_long_add_return(next_deferred,
430				&shrinker->nr_deferred[nid]);
431	else
432		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
433
434	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
435	return freed;
436}
437
438/**
439 * shrink_slab - shrink slab caches
440 * @gfp_mask: allocation context
441 * @nid: node whose slab caches to target
442 * @memcg: memory cgroup whose slab caches to target
443 * @nr_scanned: pressure numerator
444 * @nr_eligible: pressure denominator
445 *
446 * Call the shrink functions to age shrinkable caches.
447 *
448 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
449 * unaware shrinkers will receive a node id of 0 instead.
450 *
451 * @memcg specifies the memory cgroup to target. If it is not NULL,
452 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
453 * objects from the memory cgroup specified. Otherwise, only unaware
454 * shrinkers are called.
455 *
456 * @nr_scanned and @nr_eligible form a ratio that indicate how much of
457 * the available objects should be scanned.  Page reclaim for example
458 * passes the number of pages scanned and the number of pages on the
459 * LRU lists that it considered on @nid, plus a bias in @nr_scanned
460 * when it encountered mapped pages.  The ratio is further biased by
461 * the ->seeks setting of the shrink function, which indicates the
462 * cost to recreate an object relative to that of an LRU page.
463 *
464 * Returns the number of reclaimed slab objects.
465 */
466static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
467		struct mem_cgroup *memcg,
468		unsigned long nr_scanned,
469		unsigned long nr_eligible)
470{
471	struct shrinker *shrinker;
472	unsigned long freed = 0;
473
474	if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
475		return 0;
476
477	if (nr_scanned == 0)
478		nr_scanned = SWAP_CLUSTER_MAX;
479
480	if (!down_read_trylock(&shrinker_rwsem)) {
481		/*
482		 * If we would return 0, our callers would understand that we
483		 * have nothing else to shrink and give up trying. By returning
484		 * 1 we keep it going and assume we'll be able to shrink next
485		 * time.
486		 */
487		freed = 1;
488		goto out;
489	}
490
491	list_for_each_entry(shrinker, &shrinker_list, list) {
492		struct shrink_control sc = {
493			.gfp_mask = gfp_mask,
494			.nid = nid,
495			.memcg = memcg,
496		};
497
498		/*
499		 * If kernel memory accounting is disabled, we ignore
500		 * SHRINKER_MEMCG_AWARE flag and call all shrinkers
501		 * passing NULL for memcg.
502		 */
503		if (memcg_kmem_enabled() &&
504				!!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
505			continue;
506
507		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
508			sc.nid = 0;
509
510		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
511	}
512
513	up_read(&shrinker_rwsem);
514out:
515	cond_resched();
516	return freed;
517}
518
519void drop_slab_node(int nid)
520{
521	unsigned long freed;
522
523	do {
524		struct mem_cgroup *memcg = NULL;
525
526		freed = 0;
527		do {
528			freed += shrink_slab(GFP_KERNEL, nid, memcg,
529					1000, 1000);
530		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
531	} while (freed > 10);
532}
533
534void drop_slab(void)
535{
536	int nid;
537
538	for_each_online_node(nid)
539		drop_slab_node(nid);
540}
541
542static inline int is_page_cache_freeable(struct page *page)
543{
544	/*
545	 * A freeable page cache page is referenced only by the caller
546	 * that isolated the page, the page cache radix tree and
547	 * optional buffer heads at page->private.
548	 */
549	int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
550		HPAGE_PMD_NR : 1;
551	return page_count(page) - page_has_private(page) == 1 + radix_pins;
552}
553
554static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
555{
556	if (current->flags & PF_SWAPWRITE)
557		return 1;
558	if (!inode_write_congested(inode))
559		return 1;
560	if (inode_to_bdi(inode) == current->backing_dev_info)
561		return 1;
562	return 0;
563}
564
/*
 * Writing a page out failed synchronously, most likely with -ENOSPC.  The
 * error has to reach the address_space so that a later fsync(), msync() or
 * close() can report it.
 *
 * After writepage the mapping must not be touched directly, since nothing
 * keeps it from being freed.  The caller does hold a reference on the
 * page, though, and a locked page pins its mapping.  The error is therefore
 * recorded only if the locked page still belongs to @mapping.
 *
 * Sleeping in lock_page() is fine here because the caller is known to
 * have __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
		struct page *page, int error)
{
	struct address_space *current_mapping;

	lock_page(page);
	current_mapping = page_mapping(page);
	if (current_mapping == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}
585
/* Results that pageout() can report */
typedef enum {
	PAGE_KEEP = 0,	/* page could not be written out; it is locked */
	PAGE_ACTIVATE,	/* page belongs on the active list; it is locked */
	PAGE_SUCCESS,	/* page went to the disk successfully; it is unlocked */
	PAGE_CLEAN,	/* page is clean; it is locked */
} pageout_t;
597
598/*
599 * pageout is called by shrink_page_list() for each dirty page.
600 * Calls ->writepage().
601 */
602static pageout_t pageout(struct page *page, struct address_space *mapping,
603		struct scan_control *sc)
604{
605	/*
606	 * If the page is dirty, only perform writeback if that write
607	 * will be non-blocking.  To prevent this allocation from being
608	 * stalled by pagecache activity.  But note that there may be
609	 * stalls if we need to run get_block().  We could test
610	 * PagePrivate for that.
611	 *
612	 * If this process is currently in __generic_file_write_iter() against
613	 * this page's queue, we can perform writeback even if that
614	 * will block.
615	 *
616	 * If the page is swapcache, write it back even if that would
617	 * block, for some throttling. This happens by accident, because
618	 * swap_backing_dev_info is bust: it doesn't reflect the
619	 * congestion state of the swapdevs.  Easy to fix, if needed.
620	 */
621	if (!is_page_cache_freeable(page))
622		return PAGE_KEEP;
623	if (!mapping) {
624		/*
625		 * Some data journaling orphaned pages can have
626		 * page->mapping == NULL while being dirty with clean buffers.
627		 */
628		if (page_has_private(page)) {
629			if (try_to_free_buffers(page)) {
630				ClearPageDirty(page);
631				pr_info("%s: orphaned page\n", __func__);
632				return PAGE_CLEAN;
633			}
634		}
635		return PAGE_KEEP;
636	}
637	if (mapping->a_ops->writepage == NULL)
638		return PAGE_ACTIVATE;
639	if (!may_write_to_inode(mapping->host, sc))
640		return PAGE_KEEP;
641
642	if (clear_page_dirty_for_io(page)) {
643		int res;
644		struct writeback_control wbc = {
645			.sync_mode = WB_SYNC_NONE,
646			.nr_to_write = SWAP_CLUSTER_MAX,
647			.range_start = 0,
648			.range_end = LLONG_MAX,
649			.for_reclaim = 1,
650		};
651
652		SetPageReclaim(page);
653		res = mapping->a_ops->writepage(page, &wbc);
654		if (res < 0)
655			handle_write_error(mapping, page, res);
656		if (res == AOP_WRITEPAGE_ACTIVATE) {
657			ClearPageReclaim(page);
658			return PAGE_ACTIVATE;
659		}
660
661		if (!PageWriteback(page)) {
662			/* synchronous write or broken a_ops? */
663			ClearPageReclaim(page);
664		}
665		trace_mm_vmscan_writepage(page);
666		inc_node_page_state(page, NR_VMSCAN_WRITE);
667		return PAGE_SUCCESS;
668	}
669
670	return PAGE_CLEAN;
671}
672
673/*
674 * Same as remove_mapping, but if the page is removed from the mapping, it
675 * gets returned with a refcount of 0.
676 */
677static int __remove_mapping(struct address_space *mapping, struct page *page,
678		bool reclaimed)
679{
680	unsigned long flags;
681	int refcount;
682
683	BUG_ON(!PageLocked(page));
684	BUG_ON(mapping != page_mapping(page));
685
686	spin_lock_irqsave(&mapping->tree_lock, flags);
687	/*
688	 * The non racy check for a busy page.
689	 *
690	 * Must be careful with the order of the tests. When someone has
691	 * a ref to the page, it may be possible that they dirty it then
692	 * drop the reference. So if PageDirty is tested before page_count
693	 * here, then the following race may occur:
694	 *
695	 * get_user_pages(&page);
696	 * [user mapping goes away]
697	 * write_to(page);
698	 *				!PageDirty(page)    [good]
699	 * SetPageDirty(page);
700	 * put_page(page);
701	 *				!page_count(page)   [good, discard it]
702	 *
703	 * [oops, our write_to data is lost]
704	 *
705	 * Reversing the order of the tests ensures such a situation cannot
706	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
707	 * load is not satisfied before that of page->_refcount.
708	 *
709	 * Note that if SetPageDirty is always performed via set_page_dirty,
710	 * and thus under tree_lock, then this ordering is not required.
711	 */
712	if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
713		refcount = 1 + HPAGE_PMD_NR;
714	else
715		refcount = 2;
716	if (!page_ref_freeze(page, refcount))
717		goto cannot_free;
718	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
719	if (unlikely(PageDirty(page))) {
720		page_ref_unfreeze(page, refcount);
721		goto cannot_free;
722	}
723
724	if (PageSwapCache(page)) {
725		swp_entry_t swap = { .val = page_private(page) };
726		mem_cgroup_swapout(page, swap);
727		__delete_from_swap_cache(page);
728		spin_unlock_irqrestore(&mapping->tree_lock, flags);
729		put_swap_page(page, swap);
730	} else {
731		void (*freepage)(struct page *);
732		void *shadow = NULL;
733
734		freepage = mapping->a_ops->freepage;
735		/*
736		 * Remember a shadow entry for reclaimed file cache in
737		 * order to detect refaults, thus thrashing, later on.
738		 *
739		 * But don't store shadows in an address space that is
740		 * already exiting.  This is not just an optizimation,
741		 * inode reclaim needs to empty out the radix tree or
742		 * the nodes are lost.  Don't plant shadows behind its
743		 * back.
744		 *
745		 * We also don't store shadows for DAX mappings because the
746		 * only page cache pages found in these are zero pages
747		 * covering holes, and because we don't want to mix DAX
748		 * exceptional entries and shadow exceptional entries in the
749		 * same page_tree.
750		 */
751		if (reclaimed && page_is_file_cache(page) &&
752				!mapping_exiting(mapping) && !dax_mapping(mapping))
753			shadow = workingset_eviction(mapping, page);
754		__delete_from_page_cache(page, shadow);
755		spin_unlock_irqrestore(&mapping->tree_lock, flags);
756
757		if (freepage != NULL)
758			freepage(page);
759	}
760
761	return 1;
762
763cannot_free:
764	spin_unlock_irqrestore(&mapping->tree_lock, flags);
765	return 0;
766}
767
/*
 * Try to detach a locked page from its ->mapping.
 *
 * Returns 0, leaving the page attached, when the page is dirty or
 * somebody else holds a reference on it, and 1 when the page was detached.
 * The caller is assumed to hold a single reference on the page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (!__remove_mapping(mapping, page, false))
		return 0;

	/*
	 * The refcount is unfrozen to 1 instead of 2: this drops the
	 * pagecache reference without needing another atomic operation.
	 */
	page_ref_unfreeze(page, 1);
	return 1;
}
787
788/**
789 * putback_lru_page - put previously isolated page onto appropriate LRU list
790 * @page: page to be put back to appropriate lru list
791 *
792 * Add previously isolated @page to appropriate LRU list.
793 * Page may still be unevictable for other reasons.
794 *
795 * lru_lock must not be held, interrupts must be enabled.
796 */
797void putback_lru_page(struct page *page)
798{
799	bool is_unevictable;
800	int was_unevictable = PageUnevictable(page);
801
802	VM_BUG_ON_PAGE(PageLRU(page), page);
803
804redo:
805	ClearPageUnevictable(page);
806
807	if (page_evictable(page)) {
808		/*
809		 * For evictable pages, we can use the cache.
810		 * In event of a race, worst case is we end up with an
811		 * unevictable page on [in]active list.
812		 * We know how to handle that.
813		 */
814		is_unevictable = false;
815		lru_cache_add(page);
816	} else {
817		/*
818		 * Put unevictable pages directly on zone's unevictable
819		 * list.
820		 */
821		is_unevictable = true;
822		add_page_to_unevictable_list(page);
823		/*
824		 * When racing with an mlock or AS_UNEVICTABLE clearing
825		 * (page is unlocked) make sure that if the other thread
826		 * does not observe our setting of PG_lru and fails
827		 * isolation/check_move_unevictable_pages,
828		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
829		 * the page back to the evictable list.
830		 *
831		 * The other side is TestClearPageMlocked() or shmem_lock().
832		 */
833		smp_mb();
834	}
835
836	/*
837	 * page's status can change while we move it among lru. If an evictable
838	 * page is on unevictable list, it never be freed. To avoid that,
839	 * check after we added it to the list, again.
840	 */
841	if (is_unevictable && page_evictable(page)) {
842		if (!isolate_lru_page(page)) {
843			put_page(page);
844			goto redo;
845		}
846		/* This means someone else dropped this page from LRU
847		 * So, it will be freed or putback to LRU again. There is
848		 * nothing to do here.
849		 */
850	}
851
852	if (was_unevictable && !is_unevictable)
853		count_vm_event(UNEVICTABLE_PGRESCUED);
854	else if (!was_unevictable && is_unevictable)
855		count_vm_event(UNEVICTABLE_PGCULLED);
856
857	put_page(page);		/* drop ref from isolate */
858}
859
/* Verdicts returned by page_check_references() */
enum page_references {
	PAGEREF_RECLAIM = 0,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};
866
867static enum page_references page_check_references(struct page *page,
868		struct scan_control *sc)
869{
870	int referenced_ptes, referenced_page;
871	unsigned long vm_flags;
872
873	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
874			&vm_flags);
875	referenced_page = TestClearPageReferenced(page);
876
877	/*
878	 * Mlock lost the isolation race with us.  Let try_to_unmap()
879	 * move the page to the unevictable list.
880	 */
881	if (vm_flags & VM_LOCKED)
882		return PAGEREF_RECLAIM;
883
884	if (referenced_ptes) {
885		if (PageSwapBacked(page))
886			return PAGEREF_ACTIVATE;
887		/*
888		 * All mapped pages start out with page table
889		 * references from the instantiating fault, so we need
890		 * to look twice if a mapped file page is used more
891		 * than once.
892		 *
893		 * Mark it and spare it for another trip around the
894		 * inactive list.  Another page table reference will
895		 * lead to its activation.
896		 *
897		 * Note: the mark is set for activated pages as well
898		 * so that recently deactivated but used pages are
899		 * quickly recovered.
900		 */
901		SetPageReferenced(page);
902
903		if (referenced_page || referenced_ptes > 1)
904			return PAGEREF_ACTIVATE;
905
906		/*
907		 * Activate file-backed executable pages after first usage.
908		 */
909		if (vm_flags & VM_EXEC)
910			return PAGEREF_ACTIVATE;
911
912		return PAGEREF_KEEP;
913	}
914
915	/* Reclaim if clean, defer dirty pages to writeback */
916	if (referenced_page && !PageSwapBacked(page))
917		return PAGEREF_RECLAIM_CLEAN;
918
919	return PAGEREF_RECLAIM;
920}
921
922/* Check if a page is dirty or under writeback */
923static void page_check_dirty_writeback(struct page *page,
924		bool *dirty, bool *writeback)
925{
926	struct address_space *mapping;
927
928	/*
929	 * Anonymous pages are not handled by flushers and must be written
930	 * from reclaim context. Do not stall reclaim based on them
931	 */
932	if (!page_is_file_cache(page) ||
933			(PageAnon(page) && !PageSwapBacked(page))) {
934		*dirty = false;
935		*writeback = false;
936		return;
937	}
938
939	/* By default assume that the page flags are accurate */
940	*dirty = PageDirty(page);
941	*writeback = PageWriteback(page);
942
943	/* Verify dirty/writeback state if the filesystem supports it */
944	if (!page_has_private(page))
945		return;
946
947	mapping = page_mapping(page);
948	if (mapping && mapping->a_ops->is_dirty_writeback)
949		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
950}
951
/* Counters that shrink_page_list() can report through its @stat argument */
struct reclaim_stat {
	unsigned int nr_dirty;
	unsigned int nr_unqueued_dirty;
	unsigned int nr_congested;
	unsigned int nr_writeback;
	unsigned int nr_immediate;
	unsigned int nr_activate;
	unsigned int nr_ref_keep;
	unsigned int nr_unmap_fail;
};
962
963/*
964 * shrink_page_list() returns the number of reclaimed pages
965 */
966static unsigned long shrink_page_list(struct list_head *page_list,
967		struct pglist_data *pgdat,
968		struct scan_control *sc,
969		enum ttu_flags ttu_flags,
970		struct reclaim_stat *stat,
971		bool force_reclaim)
972{
973	LIST_HEAD(ret_pages);
974	LIST_HEAD(free_pages);
975	int pgactivate = 0;
976	unsigned nr_unqueued_dirty = 0;
977	unsigned nr_dirty = 0;
978	unsigned nr_congested = 0;
979	unsigned nr_reclaimed = 0;
980	unsigned nr_writeback = 0;
981	unsigned nr_immediate = 0;
982	unsigned nr_ref_keep = 0;
983	unsigned nr_unmap_fail = 0;
984
985	cond_resched();
986
987	while (!list_empty(page_list)) {
988		struct address_space *mapping;
989		struct page *page;
990		int may_enter_fs;
991		enum page_references references = PAGEREF_RECLAIM_CLEAN;
992		bool dirty, writeback;
993
994		cond_resched();
995
996		page = lru_to_page(page_list);
997		list_del(&page->lru);
998
999		if (!trylock_page(page))
1000			goto keep;
1001
1002		VM_BUG_ON_PAGE(PageActive(page), page);
1003
1004		sc->nr_scanned++;
1005
1006		if (unlikely(!page_evictable(page)))
1007			goto activate_locked;
1008
1009		if (!sc->may_unmap && page_mapped(page))
1010			goto keep_locked;
1011
1012		/* Double the slab pressure for mapped and swapcache pages */
1013		if ((page_mapped(page) || PageSwapCache(page)) &&
1014				!(PageAnon(page) && !PageSwapBacked(page)))
1015			sc->nr_scanned++;
1016
1017		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1018			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1019
1020		/*
1021		 * The number of dirty pages determines if a zone is marked
1022		 * reclaim_congested which affects wait_iff_congested. kswapd
1023		 * will stall and start writing pages if the tail of the LRU
1024		 * is all dirty unqueued pages.
1025		 */
1026		page_check_dirty_writeback(page, &dirty, &writeback);
1027		if (dirty || writeback)
1028			nr_dirty++;
1029
1030		if (dirty && !writeback)
1031			nr_unqueued_dirty++;
1032
1033		/*
1034		 * Treat this page as congested if the underlying BDI is or if
1035		 * pages are cycling through the LRU so quickly that the
1036		 * pages marked for immediate reclaim are making it to the
1037		 * end of the LRU a second time.
1038		 */
1039		mapping = page_mapping(page);
1040		if (((dirty || writeback) && mapping &&
1041					inode_write_congested(mapping->host)) ||
1042				(writeback && PageReclaim(page)))
1043			nr_congested++;
1044
1045		/*
1046		 * If a page at the tail of the LRU is under writeback, there
1047		 * are three cases to consider.
1048		 *
1049		 * 1) If reclaim is encountering an excessive number of pages
1050		 *    under writeback and this page is both under writeback and
1051		 *    PageReclaim then it indicates that pages are being queued
1052		 *    for IO but are being recycled through the LRU before the
1053		 *    IO can complete. Waiting on the page itself risks an
1054		 *    indefinite stall if it is impossible to writeback the
1055		 *    page due to IO error or disconnected storage so instead
1056		 *    note that the LRU is being scanned too quickly and the
1057		 *    caller can stall after page list has been processed.
1058		 *
1059		 * 2) Global or new memcg reclaim encounters a page that is
1060		 *    not marked for immediate reclaim, or the caller does not
1061		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
1062		 *    not to fs). In this case mark the page for immediate
1063		 *    reclaim and continue scanning.
1064		 *
1065		 *    Require may_enter_fs because we would wait on fs, which
1066		 *    may not have submitted IO yet. And the loop driver might
1067		 *    enter reclaim, and deadlock if it waits on a page for
1068		 *    which it is needed to do the write (loop masks off
1069		 *    __GFP_IO|__GFP_FS for this reason); but more thought
1070		 *    would probably show more reasons.
1071		 *
1072		 * 3) Legacy memcg encounters a page that is already marked
1073		 *    PageReclaim. memcg does not have any dirty pages
1074		 *    throttling so we could easily OOM just because too many
1075		 *    pages are in writeback and there is nothing else to
1076		 *    reclaim. Wait for the writeback to complete.
1077		 *
1078		 * In cases 1) and 2) we activate the pages to get them out of
1079		 * the way while we continue scanning for clean pages on the
1080		 * inactive list and refilling from the active list. The
1081		 * observation here is that waiting for disk writes is more
1082		 * expensive than potentially causing reloads down the line.
1083		 * Since they're marked for immediate reclaim, they won't put
1084		 * memory pressure on the cache working set any longer than it
1085		 * takes to write them to disk.
1086		 */
1087		if (PageWriteback(page)) {
1088			/* Case 1 above */
1089			if (current_is_kswapd() &&
1090					PageReclaim(page) &&
1091					test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1092				nr_immediate++;
1093				goto activate_locked;
1094
1095				/* Case 2 above */
1096			} else if (sane_reclaim(sc) ||
1097					!PageReclaim(page) || !may_enter_fs) {
1098				/*
1099				 * This is slightly racy - end_page_writeback()
1100				 * might have just cleared PageReclaim, then
1101				 * setting PageReclaim here end up interpreted
1102				 * as PageReadahead - but that does not matter
1103				 * enough to care.  What we do want is for this
1104				 * page to have PageReclaim set next time memcg
1105				 * reclaim reaches the tests above, so it will
1106				 * then wait_on_page_writeback() to avoid OOM;
1107				 * and it's also appropriate in global reclaim.
1108				 */
1109				SetPageReclaim(page);
1110				nr_writeback++;
1111				goto activate_locked;
1112
1113				/* Case 3 above */
1114			} else {
1115				unlock_page(page);
1116				wait_on_page_writeback(page);
1117				/* then go back and try same page again */
1118				list_add_tail(&page->lru, page_list);
1119				continue;
1120			}
1121		}
1122
1123		if (!force_reclaim)
1124			references = page_check_references(page, sc);
1125
1126		switch (references) {
1127			case PAGEREF_ACTIVATE:
1128				goto activate_locked;
1129			case PAGEREF_KEEP:
1130				nr_ref_keep++;
1131				goto keep_locked;
1132			case PAGEREF_RECLAIM:
1133			case PAGEREF_RECLAIM_CLEAN:
1134				; /* try to reclaim the page below */
1135		}
1136
1137		if (sc->only_promote)
1138			goto keep_locked;
1139
1140		/*
1141		 * Anonymous process memory has backing store?
1142		 * Try to allocate it some swap space here.
1143		 * Lazyfree page could be freed directly
1144		 */
1145		if (PageAnon(page) && PageSwapBacked(page)) {
1146			if (!PageSwapCache(page)) {
1147				if (!(sc->gfp_mask & __GFP_IO))
1148					goto keep_locked;
1149				if (PageTransHuge(page)) {
1150					/* cannot split THP, skip it */
1151					if (!can_split_huge_page(page, NULL))
1152						goto activate_locked;
1153					/*
1154					 * Split pages without a PMD map right
1155					 * away. Chances are some or all of the
1156					 * tail pages can be freed without IO.
1157					 */
1158					if (!compound_mapcount(page) &&
1159							split_huge_page_to_list(page,
1160								page_list))
1161						goto activate_locked;
1162				}
1163				if (!add_to_swap(page)) {
1164					if (!PageTransHuge(page))
1165						goto activate_locked;
1166					/* Fallback to swap normal pages */
1167					if (split_huge_page_to_list(page,
1168								page_list))
1169						goto activate_locked;
1170#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1171					count_vm_event(THP_SWPOUT_FALLBACK);
1172#endif
1173					if (!add_to_swap(page))
1174						goto activate_locked;
1175				}
1176
1177				may_enter_fs = 1;
1178
1179				/* Adding to swap updated mapping */
1180				mapping = page_mapping(page);
1181			}
1182		} else if (unlikely(PageTransHuge(page))) {
1183			/* Split file THP */
1184			if (split_huge_page_to_list(page, page_list))
1185				goto keep_locked;
1186		}
1187
1188		/*
1189		 * The page is mapped into the page tables of one or more
1190		 * processes. Try to unmap it here.
1191		 */
1192		if (page_mapped(page)) {
1193			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1194
1195			if (unlikely(PageTransHuge(page)))
1196				flags |= TTU_SPLIT_HUGE_PMD;
1197			if (!try_to_unmap(page, flags)) {
1198				nr_unmap_fail++;
1199				goto activate_locked;
1200			}
1201		}
1202
1203		if (PageDirty(page)) {
1204			/*
1205			 * Only kswapd can writeback filesystem pages
1206			 * to avoid risk of stack overflow. But avoid
1207			 * injecting inefficient single-page IO into
1208			 * flusher writeback as much as possible: only
1209			 * write pages when we've encountered many
1210			 * dirty pages, and when we've already scanned
1211			 * the rest of the LRU for clean pages and see
1212			 * the same dirty pages again (PageReclaim).
1213			 */
1214			if (page_is_file_cache(page) &&
1215					(!current_is_kswapd() || !PageReclaim(page) ||
1216					 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1217				/*
1218				 * Immediately reclaim when written back.
1219				 * Similar in principal to deactivate_page()
1220				 * except we already have the page isolated
1221				 * and know it's dirty
1222				 */
1223				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1224				SetPageReclaim(page);
1225
1226				goto activate_locked;
1227			}
1228
1229			if (references == PAGEREF_RECLAIM_CLEAN)
1230				goto keep_locked;
1231			if (!may_enter_fs)
1232				goto keep_locked;
1233			if (!sc->may_writepage)
1234				goto keep_locked;
1235
1236			/*
1237			 * Page is dirty. Flush the TLB if a writable entry
1238			 * potentially exists to avoid CPU writes after IO
1239			 * starts and then write it out here.
1240			 */
1241			try_to_unmap_flush_dirty();
1242			switch (pageout(page, mapping, sc)) {
1243				case PAGE_KEEP:
1244					goto keep_locked;
1245				case PAGE_ACTIVATE:
1246					goto activate_locked;
1247				case PAGE_SUCCESS:
1248					if (PageWriteback(page))
1249						goto keep;
1250					if (PageDirty(page))
1251						goto keep;
1252
1253					/*
1254					 * A synchronous write - probably a ramdisk.  Go
1255					 * ahead and try to reclaim the page.
1256					 */
1257					if (!trylock_page(page))
1258						goto keep;
1259					if (PageDirty(page) || PageWriteback(page))
1260						goto keep_locked;
1261					mapping = page_mapping(page);
1262				case PAGE_CLEAN:
1263					; /* try to free the page below */
1264			}
1265		}
1266
1267		/*
1268		 * If the page has buffers, try to free the buffer mappings
1269		 * associated with this page. If we succeed we try to free
1270		 * the page as well.
1271		 *
1272		 * We do this even if the page is PageDirty().
1273		 * try_to_release_page() does not perform I/O, but it is
1274		 * possible for a page to have PageDirty set, but it is actually
1275		 * clean (all its buffers are clean).  This happens if the
1276		 * buffers were written out directly, with submit_bh(). ext3
1277		 * will do this, as well as the blockdev mapping.
1278		 * try_to_release_page() will discover that cleanness and will
1279		 * drop the buffers and mark the page clean - it can be freed.
1280		 *
1281		 * Rarely, pages can have buffers and no ->mapping.  These are
1282		 * the pages which were not successfully invalidated in
1283		 * truncate_complete_page().  We try to drop those buffers here
1284		 * and if that worked, and the page is no longer mapped into
1285		 * process address space (page_count == 1) it can be freed.
1286		 * Otherwise, leave the page on the LRU so it is swappable.
1287		 */
1288		if (page_has_private(page)) {
1289			if (!try_to_release_page(page, sc->gfp_mask))
1290				goto activate_locked;
1291			if (!mapping && page_count(page) == 1) {
1292				unlock_page(page);
1293				if (put_page_testzero(page))
1294					goto free_it;
1295				else {
1296					/*
1297					 * rare race with speculative reference.
1298					 * the speculative reference will free
1299					 * this page shortly, so we may
1300					 * increment nr_reclaimed here (and
1301					 * leave it off the LRU).
1302					 */
1303					nr_reclaimed++;
1304					continue;
1305				}
1306			}
1307		}
1308
1309		if (PageAnon(page) && !PageSwapBacked(page)) {
1310			/* follow __remove_mapping for reference */
1311			if (!page_ref_freeze(page, 1))
1312				goto keep_locked;
1313			if (PageDirty(page)) {
1314				page_ref_unfreeze(page, 1);
1315				goto keep_locked;
1316			}
1317
1318			count_vm_event(PGLAZYFREED);
1319			count_memcg_page_event(page, PGLAZYFREED);
1320		} else if (!mapping || !__remove_mapping(mapping, page, true))
1321			goto keep_locked;
1322		/*
1323		 * At this point, we have no other references and there is
1324		 * no way to pick any more up (removed from LRU, removed
1325		 * from pagecache). Can use non-atomic bitops now (and
1326		 * we obviously don't have to worry about waking up a process
1327		 * waiting on the page lock, because there are no references.
1328		 */
1329		__ClearPageLocked(page);
1330free_it:
1331		nr_reclaimed++;
1332
1333		/*
1334		 * Is there need to periodically free_page_list? It would
1335		 * appear not as the counts should be low
1336		 */
1337		if (unlikely(PageTransHuge(page))) {
1338			mem_cgroup_uncharge(page);
1339			(*get_compound_page_dtor(page))(page);
1340		} else
1341			list_add(&page->lru, &free_pages);
1342		continue;
1343
1344activate_locked:
1345		/* Not a candidate for swapping, so reclaim swap space. */
1346		if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1347					PageMlocked(page)))
1348			try_to_free_swap(page);
1349		VM_BUG_ON_PAGE(PageActive(page), page);
1350		if (!PageMlocked(page)) {
1351			SetPageActive(page);
1352			pgactivate++;
1353			count_memcg_page_event(page, PGACTIVATE);
1354		}
1355keep_locked:
1356		unlock_page(page);
1357keep:
1358		list_add(&page->lru, &ret_pages);
1359		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1360	}
1361
1362	mem_cgroup_uncharge_list(&free_pages);
1363	try_to_unmap_flush();
1364	free_unref_page_list(&free_pages);
1365
1366	list_splice(&ret_pages, page_list);
1367	count_vm_events(PGACTIVATE, pgactivate);
1368
1369	if (stat) {
1370		stat->nr_dirty = nr_dirty;
1371		stat->nr_congested = nr_congested;
1372		stat->nr_unqueued_dirty = nr_unqueued_dirty;
1373		stat->nr_writeback = nr_writeback;
1374		stat->nr_immediate = nr_immediate;
1375		stat->nr_activate = pgactivate;
1376		stat->nr_ref_keep = nr_ref_keep;
1377		stat->nr_unmap_fail = nr_unmap_fail;
1378	}
1379	return nr_reclaimed;
1380}
1381
/*
 * Reclaim the clean file-cache pages found on @page_list.
 *
 * Pages that are file cache, not dirty and not __PageMovable are moved to a
 * private list, have PG_active cleared and are handed to shrink_page_list()
 * with force_reclaim set and TTU_IGNORE_ACCESS.  Whatever could not be
 * reclaimed is spliced back onto @page_list, and NR_ISOLATED_FILE of the
 * zone's node is decreased by the number of pages reclaimed.
 *
 * @zone:	zone whose node is used for reclaim and accounting
 * @page_list:	list of pages to pick clean file pages from
 *
 * returns the number of pages reclaimed.
 */
unsigned long reclaim_clean_pages_from_list(struct zone *zone,
		struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
	};
	LIST_HEAD(clean_pages);
	struct page *page, *next;
	unsigned long nr_freed;

	list_for_each_entry_safe(page, next, page_list, lru) {
		/* Only clean, non-movable file cache pages are candidates. */
		if (!page_is_file_cache(page) || PageDirty(page) ||
				__PageMovable(page))
			continue;

		ClearPageActive(page);
		list_move(&page->lru, &clean_pages);
	}

	nr_freed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
			TTU_IGNORE_ACCESS, NULL, true);

	/* Hand the survivors back to the caller's list. */
	list_splice(&clean_pages, page_list);
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_freed);

	return nr_freed;
}
1408
/*
 * Try to take a reference on @page and clear its PageLRU flag so that it can
 * be pulled off its LRU list.  Pages that are being freed elsewhere (refcount
 * already zero) are left alone.
 *
 * page:	page to consider
 * mode:	bitmask of LRU isolation modes (ISOLATE_*)
 *
 * returns 0 on success,
 *	   -EINVAL if the page is not on an LRU, or is unevictable and
 *	   ISOLATE_UNEVICTABLE was not requested,
 *	   -EBUSY if the page is rejected by one of the mode filters or its
 *	   refcount could not be raised.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return -EINVAL;

	/* Compaction should not handle unevictable pages but CMA can do so */
	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
		return -EINVAL;

	/* ISOLATE_PROMOTE only accepts pages that have PG_referenced set. */
	if (!PageReferenced(page) && (mode & ISOLATE_PROMOTE))
		return -EBUSY;

	/*
	 * To minimise LRU disruption, the caller can indicate that it only
	 * wants to isolate pages it will be able to operate on without
	 * blocking - clean pages for the most part.
	 *
	 * ISOLATE_ASYNC_MIGRATE means the caller only wants pages that can
	 * be migrated without blocking.
	 */
	if (mode & ISOLATE_ASYNC_MIGRATE) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return -EBUSY;

		if (PageDirty(page)) {
			/*
			 * A dirty page can only be migrated without blocking
			 * if it has no mapping, or its mapping provides a
			 * ->migratepage callback.
			 */
			struct address_space *mapping = page_mapping(page);

			if (mapping && !mapping->a_ops->migratepage)
				return -EBUSY;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return -EBUSY;

	/* A zero refcount means the page is being freed elsewhere. */
	if (unlikely(!get_page_unless_zero(page)))
		return -EBUSY;

	/*
	 * PageLRU is cleared only now that we hold a reference and therefore
	 * know the page is not being freed elsewhere -- the page release
	 * code relies on it.
	 */
	ClearPageLRU(page);
	return 0;
}
1478
1479
/*
 * Update LRU sizes after isolating pages.  For every zone with a non-zero
 * entry in @nr_zone_taken, subtract that many pages from the size of @lru in
 * @lruvec.  __update_lru_size() must run before mem_cgroup_update_lru_size()
 * because of a sanity check in the latter.
 */
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
		enum lru_list lru, unsigned long *nr_zone_taken)
{
	int zid = 0;

	while (zid < MAX_NR_ZONES) {
		if (nr_zone_taken[zid]) {
			__update_lru_size(lruvec, lru, zid,
					-nr_zone_taken[zid]);
#ifdef CONFIG_MEMCG
			mem_cgroup_update_lru_size(lruvec, lru, zid,
					-nr_zone_taken[zid]);
#endif
		}
		zid++;
	}
}
1500
/*
 * zone_lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * Pages are taken from the tail of the selected LRU list.  Pages whose zone
 * index is above sc->reclaim_idx are set aside and spliced back to the head
 * of the list afterwards; they count towards *@nr_scanned but not towards
 * the @nr_to_scan budget.  Pages for which __isolate_lru_page() returns
 * -EBUSY are rotated to the head of the list.  Any other error is a BUG().
 *
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	Set to the number of pages that were looked at, including
 *		pages skipped because of their zone.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * returns how many pages (as counted by hpage_nr_pages()) were moved
 * onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
	unsigned long skipped = 0;
	unsigned long scan, total_scan, nr_pages;
	LIST_HEAD(pages_skipped);

	scan = 0;
	for (total_scan = 0;
			scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
			total_scan++) {
		struct page *page;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON_PAGE(!PageLRU(page), page);

		if (page_zonenum(page) > sc->reclaim_idx) {
			list_move(&page->lru, &pages_skipped);
			nr_skipped[page_zonenum(page)]++;
			continue;
		}

		/*
		 * Do not count skipped pages because that makes the function
		 * return with no isolated pages if the LRU mostly contains
		 * ineligible pages.  This causes the VM to not reclaim any
		 * pages, triggering a premature OOM.
		 */
		scan++;
		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			nr_taken += nr_pages;
			nr_zone_taken[page_zonenum(page)] += nr_pages;
			list_move(&page->lru, dst);
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	/*
	 * Splice any skipped pages to the start of the LRU list. Note that
	 * this disrupts the LRU order when reclaiming for lower zones but
	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
	 * scanning would soon rescan the same pages to skip and put the
	 * system at risk of premature OOM.
	 */
	if (!list_empty(&pages_skipped)) {
		int zid;

		list_splice(&pages_skipped, src);
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			if (!nr_skipped[zid])
				continue;

			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
			skipped += nr_skipped[zid];
		}
	}
	*nr_scanned = total_scan;
	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
			total_scan, skipped, nr_taken, mode, lru);
	update_lru_sizes(lruvec, lru, nr_zone_taken);
	return nr_taken;
}
1625
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Takes a reference on @page, clears PageLRU and removes the page from the
 * LRU list it is on, all under the zone's LRU lock.  PageLRU is tested once
 * without the lock as a cheap early exit and again with the lock held.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	struct lruvec *lruvec;
	struct zone *zone;
	int ret = -EBUSY;

	VM_BUG_ON_PAGE(!page_count(page), page);
	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

	/* Unlocked check: nothing to do for pages that are not on an LRU. */
	if (!PageLRU(page))
		return ret;

	zone = page_zone(page);

	spin_lock_irq(zone_lru_lock(zone));
	lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
	/* Re-check under the lock; the page may have left the LRU meanwhile. */
	if (PageLRU(page)) {
		int lru = page_lru(page);

		get_page(page);
		ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, lru);
		ret = 0;
	}
	spin_unlock_irq(zone_lru_lock(zone));

	return ret;
}
1675
/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 *
 * Returns non-zero when the node has more isolated pages of the given type
 * (file or anon) than the threshold derived from its inactive list size.
 * kswapd and reclaim for which sane_reclaim() is false are never throttled
 * here.
 */
static int too_many_isolated(struct pglist_data *pgdat, int file,
		struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd() || !sane_reclaim(sc))
		return 0;

	inactive = node_page_state(pgdat,
			file ? NR_INACTIVE_FILE : NR_INACTIVE_ANON);
	isolated = node_page_state(pgdat,
			file ? NR_ISOLATED_FILE : NR_ISOLATED_ANON);

	/*
	 * Callers that allow both __GFP_IO and __GFP_FS get a threshold of
	 * one eighth of the inactive list.  GFP_NOIO/GFP_NOFS callers keep
	 * the full threshold and are thus allowed to isolate more pages, so
	 * they won't get blocked by normal direct-reclaimers, forming a
	 * circular deadlock.
	 */
	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		inactive >>= 3;

	return isolated > inactive;
}
1712
/*
 * Return the pages left on @page_list to their LRU lists.
 *
 * Runs with pgdat->lru_lock held and interrupts disabled; the lock is dropped
 * temporarily around putback_lru_page() for unevictable pages and around the
 * destructor call for compound pages whose last reference was ours.
 *
 * On return @page_list holds the non-compound pages whose refcount dropped to
 * zero here; the caller is expected to free them.
 */
static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	/* Statistics are charged to the lruvec passed in by the caller. */
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	LIST_HEAD(pages_to_free);
	struct page *page;
	int lru;

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		page = lru_to_page(page_list);

		VM_BUG_ON_PAGE(PageLRU(page), page);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			/* putback_lru_page() is called without lru_lock held. */
			spin_unlock_irq(&pgdat->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&pgdat->lru_lock);
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(page, lruvec, lru);

		if (is_active_lru(lru))
			reclaim_stat->recent_rotated[is_file_lru(lru)] +=
				hpage_nr_pages(page);

		/* Drop the isolation reference; done unless it was the last. */
		if (!put_page_testzero(page))
			continue;

		/* Last reference gone: take the page back off the LRU. */
		__ClearPageLRU(page);
		__ClearPageActive(page);
		del_page_from_lru_list(page, lruvec, lru);

		if (likely(!PageCompound(page))) {
			list_add(&page->lru, &pages_to_free);
			continue;
		}

		/* Compound pages are destroyed right away, outside the lock. */
		spin_unlock_irq(&pgdat->lru_lock);
		mem_cgroup_uncharge(page);
		(*get_compound_page_dtor(page))(page);
		spin_lock_irq(&pgdat->lru_lock);
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}
1767
/*
 * If a kernel thread (such as nfsd for loop-back mounts) services
 * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
 * In that case we should only throttle if the backing device it is
 * writing to is congested.  In other cases it is safe to throttle.
 *
 * Returns 1 if the current task may be throttled, 0 otherwise.
 */
static int current_may_throttle(void)
{
	/* Ordinary tasks can always be throttled. */
	if (!(current->flags & PF_LESS_THROTTLE))
		return 1;

	/* No backing device recorded for the task: nothing to exempt. */
	if (current->backing_dev_info == NULL)
		return 1;

	return bdi_write_congested(current->backing_dev_info) ? 1 : 0;
}
1780
1781struct page* vmscan_alloc_pmem_page (struct  page *page, unsigned long private, int **res)
1782{
1783	//gfp_t gfp_mask = GFP_USER | __GFP_PMEM;
1784	gfp_t gfp_mask = __GFP_PMEM;
1785	return alloc_page(gfp_mask);
1786}
1787
1788/*
1789 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1790 * of reclaimed pages
1791 */
1792	static noinline_for_stack unsigned long
1793shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1794		struct scan_control *sc, enum lru_list lru)
1795{
1796	//printk("MC_Debug_4156: shrink_inactive_list\n");
1797	LIST_HEAD(page_list);
1798	LIST_HEAD(demote_list);
1799	unsigned long nr_scanned;
1800	unsigned long nr_reclaimed = 0;
1801	unsigned long nr_taken;
1802	struct reclaim_stat stat = {};
1803	isolate_mode_t isolate_mode = 0;
1804	int file = is_file_lru(lru);
1805	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1806	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1807	bool stalled = false;
1808	struct page *page;
1809	struct page *next;
1810	struct zone *zone;
1811	int zone_pages[MAX_NR_ZONES] = {0};
1812	int i;
1813
1814	while (unlikely(too_many_isolated(pgdat, file, sc))) {
1815		if (stalled)
1816			return 0;
1817
1818		/* wait a bit for the reclaimer. */
1819		msleep(100);
1820		stalled = true;
1821
1822		/* We are about to die and free our memory. Return now. */
1823		if (fatal_signal_pending(current))
1824			return SWAP_CLUSTER_MAX;
1825	}
1826
1827	lru_add_drain();
1828
1829	if (!sc->may_unmap)
1830		isolate_mode |= ISOLATE_UNMAPPED;
1831
1832	spin_lock_irq(&pgdat->lru_lock);
1833
1834	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1835			&nr_scanned, sc, isolate_mode, lru);
1836
1837	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1838	reclaim_stat->recent_scanned[file] += nr_taken;
1839
1840	if (current_is_kswapd()) {
1841		if (global_reclaim(sc))
1842			__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
1843		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
1844				nr_scanned);
1845	} else {
1846		if (global_reclaim(sc))
1847			__count_vm_events(PGSCAN_DIRECT, nr_scanned);
1848		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
1849				nr_scanned);
1850	}
1851	spin_unlock_irq(&pgdat->lru_lock);
1852
1853	if (nr_taken == 0)
1854	{
1855		//printk("MC_Debug_4156: shrink_inactive_list: nr_taken is zero\n");
1856		return 0;
1857	}
1858/*
1859list_for_each_entry_safe(page, next, &page_list, lru) {
1860
1861	cond_resched();
1862	if(zone_idx(page_zone(page)) != ZONE_PMEM)
1863	{
1864		zone = page_zone(page);
1865		zone_pages[zone_idx(zone)]++;
1866		list_del(&page->lru);
1867		list_add(&page->lru, &demote_list);
1868	}
1869
1870}
1871
1872if(!list_empty(&demote_list))
1873{
1874	int ret = migrate_pages(&demote_list, vmscan_alloc_pmem_page, NULL, 0,
1875			MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1876	nr_reclaimed = (ret >= 0 ? nr_taken - ret : 0);
1877
1878	if(ret!=0)
1879	{
1880		list_for_each_entry_safe(page, next, &demote_list, lru) {
1881
1882			cond_resched();
1883			zone_pages[zone_idx(page_zone(page))]--;
1884
1885		}
1886
1887
1888	}
1889
1890	for(i=0;i<MAX_NR_ZONES;i++)
1891	{
1892		if(zone_pages[MAX_NR_ZONES]!=0)
1893		{
1894			__mod_zone_page_state(&pgdat->node_zones[i], NR_DEMOTED, zone_pages[i]);
1895
1896		}
1897	}
1898	//__mod_zone_page_state(zone, NR_DEMOTED, nr_reclaimed);
1899	//printk("MC_Debug_4156: Demoted %lu pages, not migrated %lu pages\n",nr_reclaimed, nr_taken - nr_reclaimed);
1900}
1901else {
1902
1903	//printk("MC_Debug_4156: demote list is emtpy\n");
1904}
1905
1906list_for_each_entry_safe(page, next, &demote_list, lru) {
1907
1908	cond_resched();
1909	list_move(&page->lru, &page_list);
1910
1911}
1912
1913*/
1914nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1915		&stat, false);
1916
1917spin_lock_irq(&pgdat->lru_lock);
1918
1919if (current_is_kswapd()) {
1920	if (global_reclaim(sc))
1921		__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
1922	count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
1923			nr_reclaimed);
1924} else {
1925	if (global_reclaim(sc))
1926		__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
1927	count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
1928			nr_reclaimed);
1929}
1930
1931putback_inactive_pages(lruvec, &page_list);
1932
1933__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1934
1935spin_unlock_irq(&pgdat->lru_lock);
1936
1937mem_cgroup_uncharge_list(&page_list);
1938free_unref_page_list(&page_list);
1939
1940/*
1941 * If reclaim is isolating dirty pages under writeback, it implies
1942 * that the long-lived page allocation rate is exceeding the page
1943 * laundering rate. Either the global limits are not being effective
1944 * at throttling processes due to the page distribution throughout
1945 * zones or there is heavy usage of a slow backing device. The
1946 * only option is to throttle from reclaim context which is not ideal
1947 __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
1948 count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
1949 nr_reclaimed);
1950 }
1951
1952 putback_inactive_pages(lruvec, &page_list);
1953
1954 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1955
1956 spin_unlock_irq(&pgdat->lru_lock);
1957
1958 mem_cgroup_uncharge_list(&page_list);
1959 free_unref_page_list(&page_list);
1960
1961/*
1962 * If reclaim is isolating dirty pages under writeback, it implies
1963 * that the long-lived page allocation rate is exceeding the page
1964 * laundering rate. Either the global limits are not being effective
1965 * at throttling processes due to the page distribution throughout
1966 * zones or there is heavy usage of a slow backing device. The
1967 * only option is to throttle from reclaim context which is not ideal
1968 * as there is no guarantee the dirtying process is throttled in the
1969 * same way balance_dirty_pages() manages.
1970 *
1971 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1972 * of pages under pages flagged for immediate reclaim and stall if any
1973 * are encountered in the nr_immediate check below.
1974 */
1975if (stat.nr_writeback && stat.nr_writeback == nr_taken)
1976	set_bit(PGDAT_WRITEBACK, &pgdat->flags);
1977
1978	/*
1979	 * Legacy memcg will stall in page writeback so avoid forcibly
1980	 * stalling here.
1981	 */
1982	if (sane_reclaim(sc)) {
1983		/*
1984		 * Tag a zone as congested if all the dirty pages scanned were
1985		 * backed by a congested BDI and wait_iff_congested will stall.
1986		 */
1987		if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
1988			set_bit(PGDAT_CONGESTED, &pgdat->flags);
1989
1990		/*
1991		 * If dirty pages are scanned that are not queued for IO, it
1992		 * implies that flushers are not doing their job. This can
1993		 * happen when memory pressure pushes dirty pages to the end of
1994		 * the LRU before the dirty limits are breached and the dirty
1995		 * data has expired. It can also happen when the proportion of
1996		 * dirty pages grows not through writes but through memory
1997		 * pressure reclaiming all the clean cache. And in some cases,
1998		 * the flushers simply cannot keep up with the allocation
1999		 * rate. Nudge the flusher threads in case they are asleep, but
2000		 * also allow kswapd to start writing pages during reclaim.
2001		 */
2002		if (stat.nr_unqueued_dirty == nr_taken) {
2003			wakeup_flusher_threads(WB_REASON_VMSCAN);
2004			set_bit(PGDAT_DIRTY, &pgdat->flags);
2005		}
2006
2007		/*
		 * If kswapd scans pages marked for immediate
2009		 * reclaim and under writeback (nr_immediate), it implies
2010		 * that pages are cycling through the LRU faster than
2011		 * they are written so also forcibly stall.
2012		 */
2013		if (stat.nr_immediate && current_may_throttle())
2014			congestion_wait(BLK_RW_ASYNC, HZ/10);
2015	}
2016
2017/*
2018 * Stall direct reclaim for IO completions if underlying BDIs or zone
2019 * is congested. Allow kswapd to continue until it starts encountering
2020 * unqueued dirty pages or cycling through the LRU too quickly.
2021 */
2022if (!sc->hibernation_mode && !current_is_kswapd() &&
2023		current_may_throttle())
2024wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
2025
2026trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2027		nr_scanned, nr_reclaimed,
2028		stat.nr_dirty,  stat.nr_writeback,
2029		stat.nr_congested, stat.nr_immediate,
2030		stat.nr_activate, stat.nr_ref_keep,
2031		stat.nr_unmap_fail,
2032		sc->priority, file);
2033return nr_reclaimed;
2034}
2035
2036/*
2037 * This moves pages from the active list to the inactive list.
2038 *
2039 * We move them the other way if the page is referenced by one or more
2040 * processes, from rmap.
2041 *
2042 * If the pages are mostly unmapped, the processing is fast and it is
2043 * appropriate to hold zone_lru_lock across the whole operation.  But if
2044 * the pages are mapped, the processing is slow (page_referenced()) so we
2045 * should drop zone_lru_lock around each page.  It's impossible to balance
2046 * this, so instead we remove the pages from the LRU while processing them.
2047 * It is safe to rely on PG_active against the non-LRU pages in here because
2048 * nobody will play with that bit on a non-LRU page.
2049 *
2050 * The downside is that we have to touch page->_refcount against each page.
2051 * But we had to alter page->flags anyway.
2052 *
2053 * Returns the number of pages moved to the given lru.
2054 */
2055
2056static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
2057		struct list_head *list,
2058		struct list_head *pages_to_free,
2059		enum lru_list lru)
2060{
2061	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2062	struct page *page;
2063	int nr_pages;
2064	int nr_moved = 0;
2065
2066	while (!list_empty(list)) {
2067		page = lru_to_page(list);
2068		lruvec = mem_cgroup_page_lruvec(page, pgdat);
2069
2070		VM_BUG_ON_PAGE(PageLRU(page), page);
2071		SetPageLRU(page);
2072
2073		if (is_promote_lru(lru))
2074			TestClearPageReferenced(page);
2075
2076		nr_pages = hpage_nr_pages(page);
2077		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
2078		list_move(&page->lru, &lruvec->lists[lru]);
2079
2080		if (put_page_testzero(page)) {
2081			__ClearPageLRU(page);
2082			__ClearPageActive(page);
2083			del_page_from_lru_list(page, lruvec, lru);
2084
2085			if (unlikely(PageCompound(page))) {
2086				spin_unlock_irq(&pgdat->lru_lock);
2087				mem_cgroup_uncharge(page);
2088				(*get_compound_page_dtor(page))(page);
2089				spin_lock_irq(&pgdat->lru_lock);
2090			} else
2091				list_add(&page->lru, pages_to_free);
2092		} else {
2093			nr_moved += nr_pages;
2094		}
2095	}
2096
2097	if (!is_active_lru(lru)) {
2098		__count_vm_events(PGDEACTIVATE, nr_moved);
2099		count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
2100				nr_moved);
2101	}
2102
2103	return nr_moved;
2104}
2105
2106static void shrink_active_list(unsigned long nr_to_scan,
2107		struct lruvec *lruvec,
2108		struct scan_control *sc,
2109		enum lru_list lru)
2110{
2111	//printk("MC_Debug_4156: sal\n");
2112	unsigned long nr_taken;
2113	unsigned long nr_scanned;
2114	unsigned long vm_flags;
2115	LIST_HEAD(l_hold);	/* The pages which were snipped off */
2116	LIST_HEAD(l_active);
2117	LIST_HEAD(l_inactive);
2118	LIST_HEAD(l_promote);
2119	struct page *page;
2120	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2121	unsigned nr_deactivate, nr_activate;
2122	unsigned nr_rotated = 0;
2123	isolate_mode_t isolate_mode = 0;
2124	int file = is_file_lru(lru);
2125	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2126	int promote_cnt = 0;
2127
2128	lru_add_drain();
2129
2130	if (!sc->may_unmap)
2131		isolate_mode |= ISOLATE_UNMAPPED;
2132
2133	spin_lock_irq(&pgdat->lru_lock);
2134
2135	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2136			&nr_scanned, sc, isolate_mode, lru);
2137
2138	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2139	reclaim_stat->recent_scanned[file] += nr_taken;
2140
2141	__count_vm_events(PGREFILL, nr_scanned);
2142	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2143
2144	spin_unlock_irq(&pgdat->lru_lock);
2145
2146	while (!list_empty(&l_hold)) {
2147		cond_resched();
2148		page = lru_to_page(&l_hold);
2149		list_del(&page->lru);
2150
2151		if (unlikely(!page_evictable(page))) {
2152			putback_lru_page(page);
2153			continue;
2154		}
2155
2156		if (unlikely(buffer_heads_over_limit)) {
2157			if (page_has_private(page) && trylock_page(page)) {
2158				if (page_has_private(page))
2159					try_to_release_page(page, 0);
2160				unlock_page(page);
2161			}
2162		}
2163
2164		if (zone_idx(page_zone(page)) == ZONE_PMEM &&
2165				page_check_references(page, sc) == PAGEREF_ACTIVATE) {
2166
2167			//if(page_zonenum(page) != zone_idx(page_zone(page)))
2168			//	printk("MC_Debug_test415: something wrong\n");
2169			//else
2170			//	printk("MC_Debug_test415: okay\n");
2171
2172			list_add(&page->lru, &l_promote);
2173			promote_cnt++;
2174			continue;
2175		}
2176
2177		if (sc->only_promote) {
2178			list_add(&page->lru, &l_active);
2179			continue;
2180		}
2181
2182		if (page_referenced(page, 0, sc->target_mem_cgroup,
2183					&vm_flags)) {
2184			nr_rotated += hpage_nr_pages(page);
2185			/*
2186			 * Identify referenced, file-backed active pages and
2187			 * give them one more trip around the active list. So
2188			 * that executable code get better chances to stay in
2189			 * memory under moderate memory pressure.  Anon pages
2190			 * are not likely to be evicted by use-once streaming
2191			 * IO, plus JVM can create lots of anon VM_EXEC pages,
2192			 * so we ignore them here.
2193			 */
2194			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
2195				list_add(&page->lru, &l_active);
2196				continue;
2197			}
2198		}
2199
2200		ClearPageActive(page);	/* we are de-activating */
2201		list_add(&page->lru, &l_inactive);
2202	}
2203
2204	/*
2205	 * Move pages back to the lru list.
2206	 */
2207	spin_lock_irq(&pgdat->lru_lock);
2208	/*
2209	 * Count referenced pages from currently used mappings as rotated,
2210	 * even though only some of them are actually re-activated.  This
2211	 * helps balance scan pressure between file and anonymous pages in
2212	 * get_scan_count.
2213	 */
2214	reclaim_stat->recent_rotated[file] += nr_rotated;
2215
2216	//printk("MC_Debug_4156: promote_cnt = %d\n", promote_cnt);
2217
2218	nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
2219	move_active_pages_to_lru(lruvec, &l_promote, &l_hold, lru - LRU_ACTIVE + LRU_PROMOTE); //MC_Debug: check this
2220	nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
2221	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2222	spin_unlock_irq(&pgdat->lru_lock);
2223
2224	mem_cgroup_uncharge_list(&l_hold);
2225	free_unref_page_list(&l_hold);
2226	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2227			nr_deactivate, nr_rotated, sc->priority, file);
2228}
2229
2230struct page* vmscan_alloc_normal_page (struct page *page, unsigned long private, int **res)
2231{
2232	gfp_t gfp_mask = GFP_USER; // | __GFP_PROMOTE;
2233	return alloc_page(gfp_mask);
2234	//struct page *new_page = alloc_page(gfp_mask);
2235	//if (zone_idx(page_zone(new_page)) == ZONE_PMEM) {
2236	//printk("Promoting from PMEM is allocating in PMEM again\n");
2237	//wake_up_interruptible(&page_zone(new_page)->zone_pgdat->kswapd_wait);
2238	//}
2239	//return new_page;
2240}
2241
2242	static noinline_for_stack unsigned long
2243shrink_promote_list(unsigned long nr_to_scan,
2244		struct lruvec *lruvec,
2245		struct scan_control *sc,
2246		enum lru_list lru)
2247{
2248
2249	//return 0;
2250
2251	//printk("MC_Debug_4156: spl\n");
2252	unsigned long nr_taken;
2253	unsigned long nr_scanned;
2254	unsigned long nr_migrated;
2255	LIST_HEAD(l_hold);	/* The pages which were snipped off */
2256	LIST_HEAD(l_free);
2257	//struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2258	isolate_mode_t isolate_mode = 0;
2259	int file = is_file_lru(lru);
2260	int zid;
2261	//struct zone *zone = lruvec_zone(lruvec);
2262	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2263	struct zone *zone;
2264	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2265
2266	/*for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2267	  zone = &lruvec_pgdat(lruvec)->node_zones[zid];
2268	  if (zone_idx(zone) != ZONE_PMEM)
2269	  continue;
2270	  }*/
2271
2272	zone = &lruvec_pgdat(lruvec)->node_zones[ZONE_PMEM];
2273
2274	//printk("MC_Debug_4156: zone: %s\n", zone->name);
2275
2276	lru_add_drain();
2277
2278	if (!sc->may_unmap)
2279		isolate_mode |= ISOLATE_UNMAPPED;
2280	/*
2281	   if (!sc->may_writepage)
2282	   isolate_mode |= ISOLATE_CLEAN;
2283	   */
2284	spin_lock_irq(&pgdat->lru_lock);
2285	//spin_unlock_irq(zone_lru_lock(zone));
2286
2287	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2288			&nr_scanned, sc, isolate_mode, lru);
2289	//if (global_reclaim(sc))
2290	//__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
2291
2292	//reclaim_stat->recent_scanned[file] = nr_taken;
2293
2294	//__count_zone_vm_events(PGREFILL, zone, nr_scanned);
2295	//__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
2296	//__mod_zone_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2297
2298	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2299	reclaim_stat->recent_scanned[file] += nr_taken;
2300
2301	spin_unlock_irq(&pgdat->lru_lock);
2302	//printk("MC_Debug_4156: nr_taken: %lu\n", nr_taken);
2303
2304	if (nr_taken) {
2305		int ret = migrate_pages(&l_hold, vmscan_alloc_normal_page,
2306				NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
2307		nr_migrated = (ret < 0 ? 0 : nr_taken - ret);
2308		__mod_zone_page_state(zone, NR_PROMOTED, nr_migrated);
2309		//printk("MC_Debug_4156: Promoted %lu pages, not migrated %lu pages\n",nr_migrated, nr_taken - nr_migrated);
2310	}
2311
2312	/*
2313	 * Move pages back to the lru list.
2314	 */
2315	spin_lock_irq(&pgdat->lru_lock);
2316
2317	// Return to active list if couldn't promote
2318	move_active_pages_to_lru(lruvec, &l_hold, &l_free,
2319			lru - LRU_PROMOTE + LRU_ACTIVE);
2320	//__mod_zone_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2321
2322	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2323
2324	spin_unlock_irq(&pgdat->lru_lock);
2325
2326	mem_cgroup_uncharge_list(&l_free);
2327	//free_hot_cold_page_list(&l_free, true);
2328	free_unref_page_list(&l_free);
2329
2330	return nr_migrated;
2331
2332
2333
2334}
2335
2336/*
2337 * The inactive anon list should be small enough that the VM never has
2338 * to do too much work.
2339 *
2340 * The inactive file list should be small enough to leave most memory
2341 * to the established workingset on the scan-resistant active list,
2342 * but large enough to avoid thrashing the aggregate readahead window.
2343 *
2344 * Both inactive lists should also be large enough that each inactive
2345 * page has a chance to be referenced again before it is reclaimed.
2346 *
2347 * If that fails and refaulting is observed, the inactive list grows.
2348 *
2349 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
2350 * on this LRU, maintained by the pageout code. An inactive_ratio
2351 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
2352 *
2353 * total     target    max
2354 * memory    ratio     inactive
2355 * -------------------------------------
2356 *   10MB       1         5MB
2357 *  100MB       1        50MB
2358 *    1GB       3       250MB
2359 *   10GB      10       0.9GB
2360 *  100GB      31         3GB
2361 *    1TB     101        10GB
2362 *   10TB     320        32GB
2363 */
2364static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2365		struct mem_cgroup *memcg,
2366		struct scan_control *sc, bool actual_reclaim)
2367{
2368	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2369	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2370	enum lru_list inactive_lru = file * LRU_FILE;
2371	unsigned long inactive, active;
2372	unsigned long inactive_ratio;
2373	unsigned long refaults;
2374	unsigned long gb;
2375
2376	/*
2377	 * If we don't have swap space, anonymous page deactivation
2378	 * is pointless.
2379	 */
2380	if (!file && !total_swap_pages)
2381		return false;
2382
2383	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2384	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2385
2386	if (memcg)
2387		refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2388	else
2389		refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
2390
2391	/*
2392	 * When refaults are being observed, it means a new workingset
2393	 * is being established. Disable active list protection to get
2394	 * rid of the stale workingset quickly.
2395	 */
2396	if (file && actual_reclaim && lruvec->refaults != refaults) {
2397		inactive_ratio = 0;
2398	} else {
2399		gb = (inactive + active) >> (30 - PAGE_SHIFT);
2400		if (gb)
2401			inactive_ratio = int_sqrt(10 * gb);
2402		else
2403			inactive_ratio = 1;
2404	}
2405
2406	if (actual_reclaim)
2407		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2408				lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2409				lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2410				inactive_ratio, file);
2411
2412	return inactive * inactive_ratio < active;
2413}
2414
2415static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2416		struct lruvec *lruvec, struct mem_cgroup *memcg,
2417		struct scan_control *sc)
2418{
2419	if (is_active_lru(lru)) {
2420		if (inactive_list_is_low(lruvec, is_file_lru(lru),
2421					memcg, sc, true) || sc->only_promote)
2422			shrink_active_list(nr_to_scan, lruvec, sc, lru);
2423		return 0;
2424	}
2425	else if (is_promote_lru(lru)) {
2426		return shrink_promote_list(nr_to_scan, lruvec, sc, lru);
2427	}
2428
2429	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2430}
2431
2432static void kpromoted_try_to_sleep(pg_data_t *pgdat, int nr_zones)
2433{
2434	//printk("MC_Debug_4156: kpromoted_try_to_sleep\n");
2435	DEFINE_WAIT(wait);
2436
2437	if (freezing(current) || kthread_should_stop())
2438		return;
2439
2440	prepare_to_wait(&pgdat->kpromoted_wait, &wait, TASK_INTERRUPTIBLE);
2441	if (!kthread_should_stop())
2442		schedule_timeout(HZ*5); //10 secs
2443	finish_wait(&pgdat->kpromoted_wait, &wait);
2444}
2445
/*
 * Result codes for promoting a single page.  The meanings below follow
 * the disabled promote_page() draft kept in this file; nothing else in
 * view uses them.
 */
enum promote_page_error {
	ISOLATE_PAGE_ERROR   = 0,	/* page could not be isolated from its LRU */
	PMEM_ALLOC_ERROR     = 1,	/* no destination page could be allocated */
	MIGRATE_PAGE_ERROR   = 2,	/* migrating to the new page failed */
	PROMOTE_PAGE_SUCCESS = 3,	/* page was promoted */
};
2452
2453
2454/*
2455enum promote_page_error promote_page(struct page *page)
2456{
2457
2458 //fix this method
2459 struct page *new_page;
2460	if(isolate_lru_page(page)==0){
2461
2462		new_page = vmscan_alloc_pmem_page(NULL, NULL, NULL);
2463
2464		if(!new_page)
2465		{
2466			return PMEM_ALLOC_ERROR;
2467		}
2468		else{
2469			mapping = page_mapping(page);
2470
2471			ret = migrate_page(mapping, new_page, page, MIGRATE_SYNC);
2472			if(ret!=MIGRATEPAGE_SUCCESS)
2473			{
2474				__free_page(new_page);
2475				return MIGRATE_PAGE_ERROR;
2476			}
2477			else
2478			{
2479				return PROMOTE_PAGE_SUCCESS;
2480			}
2481		}
2482
2483	}
2484	else{
2485
2486		return ISOLATE_PAGE_ERROR;
2487
2488	}
2489
2490
2491
2492}
2493*/
2494/*
2495 * The background promote daemon, started as a kernel thread
2496 * from the init process.
2497 */
2498
2499static int kpromoted(void *p)
2500{
2501	unsigned long tmpHis[20];
2502	memset(tmpHis, 0, sizeof(tmpHis));
2503	unsigned long total_page_cnt = 0;
2504	unsigned long total_cnt_ref = 0;
2505	unsigned long total_cnt_pageRef = 0;
2506	unsigned long cnt_set_btime = 0;
2507	unsigned long cnt_active_pages = 0;
2508	printk("MC_Debug_4156: kpromoted\n");
2509	pg_data_t *pgdat = (pg_data_t*)p;
2510	LIST_HEAD(promote_list);
2511
2512	struct reclaim_state reclaim_state = {
2513		.reclaimed_slab = 0,
2514	};
2515	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2516	struct task_struct *tsk = current;
2517
2518	struct scan_control sc = {
2519		.nr_to_reclaim = SWAP_CLUSTER_MAX,
2520		//.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2521		//.order = order,
2522		//.nodemask = nodemask,
2523		.priority = DEF_PRIORITY,
2524		.may_writepage = !laptop_mode,
2525		.may_unmap = 1,
2526		.may_swap = 1,
2527		.only_promote = 1,
2528		.reclaim_idx = 4,
2529	};
2530	unsigned long vm_flags;
2531
2532	//lockdep_set_current_reclaim_state(GFP_KERNEL);
2533
2534	if (!cpumask_empty(cpumask))
2535		set_cpus_allowed_ptr(tsk, cpumask);
2536	current->reclaim_state = &reclaim_state;
2537
2538	/*
2539	 * Tell the memory management that we're a "memory allocator",
2540	 * and that if we need more memory we should get access to it
2541	 * regardless (see "__alloc_pages()"). "kswapd" should
2542	 * never get caught in the normal page freeing logic.
2543	 *
2544	 * (Kswapd normally doesn't need memory anyway, but sometimes
2545	 * you need a small amount of memory in order to be able to
2546	 * page out something else, and this flag essentially protects
2547	 * us from recursively trying to free more memory as we're
2548	 * trying to free the first piece of memory in the first place).
2549	 */
2550	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2551	unsigned long window_size=20;
2552	unsigned long cur_scan = 0;
2553	unsigned long got_hit = 0;
2554	int i=0;
2555	unsigned long windowID = 0;
2556	unsigned long tmp;
2557	int tmp_threshold=window_size/2;
2558	struct address_space *mapping;
2559	struct page *new_page;
2560	int ret;
2561	enum promote_page_error promote_ret;
2562	unsigned long start_ns, end_ns;
2563
2564	for ( ; ; ) {
2565		__mod_node_page_state(pgdat, NR_SCAN_ID, 1); //Debug
2566
2567		if(cur_scan%window_size==0)
2568		{
2569			++windowID;
2570			__mod_node_page_state(pgdat, NR_WINDOW_ID, 1); //Debug
2571
2572
2573		}
2574
2575		total_page_cnt = 0;
2576		total_cnt_ref = 0;
2577		total_cnt_pageRef = 0;
2578		cnt_set_btime = 0;
2579		cnt_active_pages = 0;
2580		got_hit = 0;
2581		int lru;
2582		struct mem_cgroup_reclaim_cookie reclaim = {
2583			.pgdat = pgdat,
2584			.priority = DEF_PRIORITY,
2585		};
2586		struct mem_cgroup *root = NULL;
2587		struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, &reclaim);
2588		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2589
2590		start_ns = ktime_get_ns();
2591		struct page *page;
2592
2593		for_each_evictable_lru(lru) {
2594
2595
2596			struct list_head *src = &lruvec->lists[lru];
2597
2598			list_for_each_entry(page, src, lru)
2599			{
2600				total_page_cnt++; //counting total pages;
2601				page->temperature = total_page_cnt;
2602				//page->iden = ++iden_cnt;
2603				if(page->birthTime==0)
2604				{
2605					page->birthTime=jiffies;
2606					cnt_set_btime++;
2607					//page->iden = ++iden_cnt;
2608				}
2609
2610				if(cur_scan%window_size==0)
2611				{
2612					//page->temperature=0;
2613
2614				}
2615
2616				if (PageReferenced(page) || page_referenced(page, 0, memcg,
2617							&vm_flags))
2618				{
2619					//page->temperature++;
2620					got_hit++;
2621					ClearPageReferenced(page);
2622					if( page->temperature > 0 &&  (page->temperature % 5000) == 0)
2623						printk("pageref: %lu true\n",page->temperature);
2624				}
2625				else
2626				{
2627					if( page->temperature > 0 &&  (page->temperature % 5000) == 0)
2628						printk("pageref: %lu false\n",page->temperature);
2629				}
2630
2631			}
2632
2633		}
2634
2635		cur_scan++;
2636
2637		end_ns = ktime_get_ns();
2638		unsigned long duration_ms = (end_ns - start_ns)/1000000;
2639
2640
2641		printk("AMC_Debug_Scan: t: %lu, w: %lu, s: %lu, cntTotalP: %lu, gotHit: %lu\n", duration_ms, windowID, cur_scan, total_page_cnt, got_hit);
2642
2643sleep:
2644		//ret = try_to_freeze();
2645		if (kthread_should_stop())
2646		{
2647			printk("AMC_Debug_Scan: Why should I stop?\n");
2648			break;
2649		}
2650
2651		kpromoted_try_to_sleep(pgdat, pgdat->nr_zones);
2652		}
2653
2654		printk("AMC_Debug_Scan: scan ends here\n");
2655
2656		tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
2657		current->reclaim_state = NULL;
2658		//lockdep_clear_current_reclaim_state();
2659
2660		return 0;
2661	}
2662
2663
2664
2665
2666	enum scan_balance {
2667		SCAN_EQUAL,
2668		SCAN_FRACT,
2669		SCAN_ANON,
2670		SCAN_FILE,
2671	};
2672
2673	/*
2674	 * Determine how aggressively the anon and file LRU lists should be
2675	 * scanned.  The relative value of each set of LRU lists is determined
2676	 * by looking at the fraction of the pages scanned we did rotate back
2677	 * onto the active list instead of evict.
2678	 *
2679	 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
2680	 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
2681	 */
2682	static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2683			struct scan_control *sc, unsigned long *nr,
2684			unsigned long *lru_pages)
2685	{
2686		int swappiness = mem_cgroup_swappiness(memcg);
2687		struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2688		u64 fraction[2];
2689		u64 denominator = 0;	/* gcc */
2690		struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2691		unsigned long anon_prio, file_prio;
2692		enum scan_balance scan_balance;
2693		unsigned long anon, file;
2694		unsigned long ap, fp;
2695		enum lru_list lru;
2696
2697		/* If we have no swap space, do not bother scanning anon pages. */
2698		if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2699			scan_balance = SCAN_FILE;
2700			goto out;
2701		}
2702
2703		/*
2704		 * Global reclaim will swap to prevent OOM even with no
2705		 * swappiness, but memcg users want to use this knob to
2706		 * disable swapping for individual groups completely when
2707		 * using the memory controller's swap limit feature would be
2708		 * too expensive.
2709		 */
2710		if (!global_reclaim(sc) && !swappiness) {
2711			scan_balance = SCAN_FILE;
2712			goto out;
2713		}
2714
2715		/*
2716		 * Do not apply any pressure balancing cleverness when the
2717		 * system is close to OOM, scan both anon and file equally
2718		 * (unless the swappiness setting disagrees with swapping).
2719		 */
2720		if (!sc->priority && swappiness) {
2721			scan_balance = SCAN_EQUAL;
2722			goto out;
2723		}
2724
2725		/*
2726		 * Prevent the reclaimer from falling into the cache trap: as
2727		 * cache pages start out inactive, every cache fault will tip
2728		 * the scan balance towards the file LRU.  And as the file LRU
2729		 * shrinks, so does the window for rotation from references.
2730		 * This means we have a runaway feedback loop where a tiny
2731		 * thrashing file LRU becomes infinitely more attractive than
2732		 * anon pages.  Try to detect this based on file LRU size.
2733		 */
2734		if (global_reclaim(sc)) {
2735			unsigned long pgdatfile;
2736			unsigned long pgdatfree;
2737			int z;
2738			unsigned long total_high_wmark = 0;
2739
2740			pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2741			pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2742				node_page_state(pgdat, NR_INACTIVE_FILE);
2743
2744			for (z = 0; z < MAX_NR_ZONES; z++) {
2745				struct zone *zone = &pgdat->node_zones[z];
2746				if (!managed_zone(zone))
2747					continue;
2748
2749				total_high_wmark += high_wmark_pages(zone);
2750			}
2751
2752			if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2753				/*
2754				 * Force SCAN_ANON if there are enough inactive
2755				 * anonymous pages on the LRU in eligible zones.
2756				 * Otherwise, the small LRU gets thrashed.
2757				 */
2758				if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
2759						lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
2760						>> sc->priority) {
2761					scan_balance = SCAN_ANON;
2762					goto out;
2763				}
2764			}
2765		}
2766
2767		/*
2768		 * If there is enough inactive page cache, i.e. if the size of the
2769		 * inactive list is greater than that of the active list *and* the
2770		 * inactive list actually has some pages to scan on this priority, we
2771		 * do not reclaim anything from the anonymous working set right now.
2772		 * Without the second condition we could end up never scanning an
2773		 * lruvec even if it has plenty of old anonymous pages unless the
2774		 * system is under heavy pressure.
2775		 */
2776		if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
2777				lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2778			scan_balance = SCAN_FILE;
2779			goto out;
2780		}
2781
2782		scan_balance = SCAN_FRACT;
2783
2784		/*
2785		 * With swappiness at 100, anonymous and file have the same priority.
2786		 * This scanning priority is essentially the inverse of IO cost.
2787		 */
2788		anon_prio = swappiness;
2789		file_prio = 200 - anon_prio;
2790
2791		/*
2792		 * OK, so we have swap space and a fair amount of page cache
2793		 * pages.  We use the recently rotated / recently scanned
2794		 * ratios to determine how valuable each cache is.
2795		 *
2796		 * Because workloads change over time (and to avoid overflow)
2797		 * we keep these statistics as a floating average, which ends
2798		 * up weighing recent references more than old ones.
2799		 *
2800		 * anon in [0], file in [1]
2801		 */
2802
2803		anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2804			lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2805		file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2806			lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2807
2808		spin_lock_irq(&pgdat->lru_lock);
2809		if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2810			reclaim_stat->recent_scanned[0] /= 2;
2811			reclaim_stat->recent_rotated[0] /= 2;
2812		}
2813
2814		if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2815			reclaim_stat->recent_scanned[1] /= 2;
2816			reclaim_stat->recent_rotated[1] /= 2;
2817		}
2818
2819		/*
2820		 * The amount of pressure on anon vs file pages is inversely
2821		 * proportional to the fraction of recently scanned pages on
2822		 * each list that were recently referenced and in active use.
2823		 */
2824		ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2825		ap /= reclaim_stat->recent_rotated[0] + 1;
2826
2827		fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2828		fp /= reclaim_stat->recent_rotated[1] + 1;
2829		spin_unlock_irq(&pgdat->lru_lock);
2830
2831		fraction[0] = ap;
2832		fraction[1] = fp;
2833		denominator = ap + fp + 1;
2834out:
2835		*lru_pages = 0;
2836		for_each_evictable_lru(lru) {
2837			int file = is_file_lru(lru);
2838			unsigned long size;
2839			unsigned long scan;
2840
2841			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2842			scan = size >> sc->priority;
2843			/*
2844			 * If the cgroup's already been deleted, make sure to
2845			 * scrape out the remaining cache.
2846			 */
2847			if (!scan && !mem_cgroup_online(memcg))
2848				scan = min(size, SWAP_CLUSTER_MAX);
2849
2850			switch (scan_balance) {
2851				case SCAN_EQUAL:
2852					/* Scan lists relative to size */
2853					break;
2854				case SCAN_FRACT:
2855					/*
2856					 * Scan types proportional to swappiness and
2857					 * their relative recent reclaim efficiency.
2858					 */
2859					scan = div64_u64(scan * fraction[file],
2860							denominator);
2861					break;
2862				case SCAN_FILE:
2863				case SCAN_ANON:
2864					/* Scan one type exclusively */
2865					if ((scan_balance == SCAN_FILE) != file) {
2866						size = 0;
2867						scan = 0;
2868					}
2869					break;
2870				default:
2871					/* Look ma, no brain */
2872					BUG();
2873			}
2874
2875			*lru_pages += size;
2876			nr[lru] = scan;
2877		}
2878	}
2879
2880	/*
2881	 * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
2882	 */
2883	static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2884			struct scan_control *sc, unsigned long *lru_pages)
2885	{
2886		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2887		unsigned long nr[NR_LRU_LISTS];
2888		unsigned long targets[NR_LRU_LISTS];
2889		unsigned long nr_to_scan;
2890		enum lru_list lru;
2891		unsigned long nr_reclaimed = 0;
2892		unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2893		struct blk_plug plug;
2894		bool scan_adjusted;
2895
2896		get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2897
2898		/* Record the original scan target for proportional adjustments later */
2899		memcpy(targets, nr, sizeof(nr));
2900
2901		/*
2902		 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2903		 * event that can occur when there is little memory pressure e.g.
2904		 * multiple streaming readers/writers. Hence, we do not abort scanning
2905		 * when the requested number of pages are reclaimed when scanning at
2906		 * DEF_PRIORITY on the assumption that the fact we are direct
2907		 * reclaiming implies that kswapd is not keeping up and it is best to
2908		 * do a batch of work at once. For memcg reclaim one check is made to
2909		 * abort proportional reclaim if either the file or anon lru has already
2910		 * dropped to zero at the first pass.
2911		 */
2912		scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2913				sc->priority == DEF_PRIORITY);
2914
2915		blk_start_plug(&plug);
2916		while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2917				nr[LRU_INACTIVE_FILE]) {
2918			unsigned long nr_anon, nr_file, percentage;
2919			unsigned long nr_scanned;
2920
2921			for_each_evictable_lru(lru) {
2922				if (nr[lru]) {
2923					nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2924					nr[lru] -= nr_to_scan;
2925
2926					nr_reclaimed += shrink_list(lru, nr_to_scan,
2927							lruvec, memcg, sc);
2928				}
2929			}
2930
2931			cond_resched();
2932
2933			if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2934				continue;
2935
2936			/*
2937			 * For kswapd and memcg, reclaim at least the number of pages
2938			 * requested. Ensure that the anon and file LRUs are scanned
2939			 * proportionally what was requested by get_scan_count(). We
2940			 * stop reclaiming one LRU and reduce the amount scanning
2941			 * proportional to the original scan target.
2942			 */
2943			nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2944			nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2945
2946			/*
2947			 * It's just vindictive to attack the larger once the smaller
2948			 * has gone to zero.  And given the way we stop scanning the
2949			 * smaller below, this makes sure that we only make one nudge
2950			 * towards proportionality once we've got nr_to_reclaim.
2951			 */
2952			if (!nr_file || !nr_anon)
2953				break;
2954
2955			if (nr_file > nr_anon) {
2956				unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2957					targets[LRU_ACTIVE_ANON] + 1;
2958				lru = LRU_BASE;
2959				percentage = nr_anon * 100 / scan_target;
2960			} else {
2961				unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2962					targets[LRU_ACTIVE_FILE] + 1;
2963				lru = LRU_FILE;
2964				percentage = nr_file * 100 / scan_target;
2965			}
2966
2967			/* Stop scanning the smaller of the LRU */
2968			nr[lru] = 0;
2969			nr[lru + LRU_ACTIVE] = 0;
2970
2971			/*
2972			 * Recalculate the other LRU scan count based on its original
2973			 * scan target and the percentage scanning already complete
2974			 */
2975			lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2976			nr_scanned = targets[lru] - nr[lru];
2977			nr[lru] = targets[lru] * (100 - percentage) / 100;
2978			nr[lru] -= min(nr[lru], nr_scanned);
2979
2980			lru += LRU_ACTIVE;
2981			nr_scanned = targets[lru] - nr[lru];
2982			nr[lru] = targets[lru] * (100 - percentage) / 100;
2983			nr[lru] -= min(nr[lru], nr_scanned);
2984
2985			scan_adjusted = true;
2986		}
2987		blk_finish_plug(&plug);
2988		sc->nr_reclaimed += nr_reclaimed;
2989
2990		/*
2991		 * Even if we did not try to evict anon pages at all, we want to
2992		 * rebalance the anon lru active/inactive ratio.
2993		 */
2994		if (inactive_list_is_low(lruvec, false, memcg, sc, true))
2995			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2996					sc, LRU_ACTIVE_ANON);
2997	}
2998
2999	/* Use reclaim/compaction for costly allocs or under memory pressure */
3000	static bool in_reclaim_compaction(struct scan_control *sc)
3001	{
3002		if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
3003				(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
3004				 sc->priority < DEF_PRIORITY - 2))
3005			return true;
3006
3007		return false;
3008	}
3009
3010	/*
3011	 * Reclaim/compaction is used for high-order allocation requests. It reclaims
3012	 * order-0 pages before compacting the zone. should_continue_reclaim() returns
3013	 * true if more pages should be reclaimed such that when the page allocator
3014	 * calls try_to_compact_zone() that it will have enough free pages to succeed.
3015	 * It will give up earlier than that if there is difficulty reclaiming pages.
3016	 */
3017	static inline bool should_continue_reclaim(struct pglist_data *pgdat,
3018			unsigned long nr_reclaimed,
3019			unsigned long nr_scanned,
3020			struct scan_control *sc)
3021	{
3022		unsigned long pages_for_compaction;
3023		unsigned long inactive_lru_pages;
3024		int z;
3025
3026		/* If not in reclaim/compaction mode, stop */
3027		if (!in_reclaim_compaction(sc))
3028			return false;
3029
3030		/* Consider stopping depending on scan and reclaim activity */
3031		if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
3032			/*
3033			 * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
3034			 * full LRU list has been scanned and we are still failing
3035			 * to reclaim pages. This full LRU scan is potentially
3036			 * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
3037			 */
3038			if (!nr_reclaimed && !nr_scanned)
3039				return false;
3040		} else {
3041			/*
3042			 * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
3043			 * fail without consequence, stop if we failed to reclaim
3044			 * any pages from the last SWAP_CLUSTER_MAX number of
3045			 * pages that were scanned. This will return to the
3046			 * caller faster at the risk reclaim/compaction and
3047			 * the resulting allocation attempt fails
3048			 */
3049			if (!nr_reclaimed)
3050				return false;
3051		}
3052
3053		/*
3054		 * If we have not reclaimed enough pages for compaction and the
3055		 * inactive lists are large enough, continue reclaiming
3056		 */
3057		pages_for_compaction = compact_gap(sc->order);
3058		inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
3059		if (get_nr_swap_pages() > 0)
3060			inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
3061		if (sc->nr_reclaimed < pages_for_compaction &&
3062				inactive_lru_pages > pages_for_compaction)
3063			return true;
3064
3065		/* If compaction would go ahead or the allocation would succeed, stop */
3066		for (z = 0; z <= sc->reclaim_idx; z++) {
3067			struct zone *zone = &pgdat->node_zones[z];
3068			if (!managed_zone(zone))
3069				continue;
3070
3071			switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
3072				case COMPACT_SUCCESS:
3073				case COMPACT_CONTINUE:
3074					return false;
3075				default:
3076					/* check next zone */
3077					;
3078			}
3079		}
3080		return true;
3081	}
3082
/*
 * shrink_node() - run reclaim passes over one node.
 *
 * Each pass of the outer loop walks the memcg hierarchy rooted at
 * sc->target_mem_cgroup with mem_cgroup_iter(), calls shrink_node_memcg()
 * for every group that is not skipped, shrinks slab, reports the result to
 * vmpressure() and folds reclaim_state->reclaimed_slab into
 * sc->nr_reclaimed.  Passes repeat while should_continue_reclaim() says so.
 *
 * Returns true if at least one pass increased sc->nr_reclaimed; in that case
 * pgdat->kswapd_failures is reset to 0 as well.
 */
3083	static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
3084	{
3085		struct reclaim_state *reclaim_state = current->reclaim_state;
3086		unsigned long nr_reclaimed, nr_scanned;
3087		bool reclaimable = false;
3088
3089		do {
3090			struct mem_cgroup *root = sc->target_mem_cgroup;
3091			struct mem_cgroup_reclaim_cookie reclaim = {
3092				.pgdat = pgdat,
3093				.priority = sc->priority,
3094			};
3095			unsigned long node_lru_pages = 0;
3096			struct mem_cgroup *memcg;
3097
			/* Snapshot the totals so this pass's deltas can be computed. */
3098			nr_reclaimed = sc->nr_reclaimed;
3099			nr_scanned = sc->nr_scanned;
3100
3101			memcg = mem_cgroup_iter(root, NULL, &reclaim);
3102			do {
3103				unsigned long lru_pages;
3104				unsigned long reclaimed;
3105				unsigned long scanned;
3106
				/*
				 * A group reported by mem_cgroup_low() is skipped
				 * (and the skip recorded in sc) unless
				 * sc->memcg_low_reclaim is set.  The "continue"
				 * jumps to the loop condition, which advances the
				 * iterator.
				 */
3107				if (mem_cgroup_low(root, memcg)) {
3108					if (!sc->memcg_low_reclaim) {
3109						sc->memcg_low_skipped = 1;
3110						continue;
3111					}
3112					mem_cgroup_event(memcg, MEMCG_LOW);
3113				}
3114
				/* Per-group snapshot for the vmpressure() deltas below. */
3115				reclaimed = sc->nr_reclaimed;
3116				scanned = sc->nr_scanned;
3117
3118				shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
3119				node_lru_pages += lru_pages;
3120
3121				if (memcg)
3122					shrink_slab(sc->gfp_mask, pgdat->node_id,
3123							memcg, sc->nr_scanned - scanned,
3124							lru_pages);
3125
3126				/* Record the group's reclaim efficiency */
3127				vmpressure(sc->gfp_mask, memcg, false,
3128						sc->nr_scanned - scanned,
3129						sc->nr_reclaimed - reclaimed);
3130
3131				/*
3132				 * Direct reclaim and kswapd have to scan all memory
3133				 * cgroups to fulfill the overall scan target for the
3134				 * node.
3135				 *
3136				 * Limit reclaim, on the other hand, only cares about
3137				 * nr_to_reclaim pages to be reclaimed and it will
3138				 * retry with decreasing priority if one round over the
3139				 * whole hierarchy is not sufficient.
3140				 */
3141				if (!global_reclaim(sc) &&
3142						sc->nr_reclaimed >= sc->nr_to_reclaim) {
3143					mem_cgroup_iter_break(root, memcg);
3144					break;
3145				}
3146			} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
3147
3148			/*
3149			 * Shrink the slab caches in the same proportion that
3150			 * the eligible LRU pages were scanned.
3151			 */
3152			if (global_reclaim(sc))
3153				shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
3154						sc->nr_scanned - nr_scanned,
3155						node_lru_pages);
3156
			/* Credit slab pages freed on behalf of this task, then reset the counter. */
3157			if (reclaim_state) {
3158				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
3159				reclaim_state->reclaimed_slab = 0;
3160			}
3161
3162			/* Record the subtree's reclaim efficiency */
3163			vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
3164					sc->nr_scanned - nr_scanned,
3165					sc->nr_reclaimed - nr_reclaimed);
3166
			/* Any progress in this pass makes the node count as reclaimable. */
3167			if (sc->nr_reclaimed - nr_reclaimed)
3168				reclaimable = true;
3169
3170		} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
3171					sc->nr_scanned - nr_scanned, sc));
3172
3173		/*
3174		 * Kswapd gives up on balancing particular nodes after too
3175		 * many failures to reclaim anything from them and goes to
3176		 * sleep. On reclaim progress, reset the failure counter. A
3177		 * successful direct reclaim run will revive a dormant kswapd.
3178		 */
3179		if (reclaimable)
3180			pgdat->kswapd_failures = 0;
3181
3182		return reclaimable;
3183	}
3184
3185	/*
3186	 * Returns true if compaction should go ahead for a costly-order request, or
3187	 * the allocation would already succeed without compaction. Return false if we
3188	 * should reclaim first.
3189	 */
3190	static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
3191	{
3192		unsigned long watermark;
3193		enum compact_result suitable;
3194
3195		suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
3196		if (suitable == COMPACT_SUCCESS)
3197			/* Allocation should succeed already. Don't reclaim. */
3198			return true;
3199		if (suitable == COMPACT_SKIPPED)
3200			/* Compaction cannot yet proceed. Do reclaim. */
3201			return false;
3202
3203		/*
3204		 * Compaction is already possible, but it takes time to run and there
3205		 * are potentially other callers using the pages just freed. So proceed
3206		 * with reclaim to make a buffer of free pages available to give
3207		 * compaction a reasonable chance of completing and allocating the page.
3208		 * Note that we won't actually reclaim the whole buffer in one attempt
3209		 * as the target watermark in should_continue_reclaim() is lower. But if
3210		 * we are already above the high+gap watermark, don't reclaim at all.
3211		 */
3212		watermark = high_wmark_pages(zone) + compact_gap(sc->order);
3213
3214		return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
3215	}
3216
3217	/*
3218	 * This is the direct reclaim path, for page-allocating processes.  We only
3219	 * try to reclaim pages from zones which will satisfy the caller's allocation
3220	 * request.
3221	 *
3222	 * If a zone is deemed to be full of pinned pages then just give it a light
3223	 * scan then give up on it.
	 *
	 * The zonelist is walked up to sc->reclaim_idx, restricted to sc->nodemask,
	 * and shrink_node() is called once per distinct node (tracked through
	 * last_pgdat).  sc->gfp_mask may be widened with __GFP_HIGHMEM for the
	 * duration of the walk and is restored before returning; sc->reclaim_idx,
	 * which is recomputed at the same time, is not restored here.
3224	 */
3225	static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
3226	{
3227		struct zoneref *z;
3228		struct zone *zone;
3229		unsigned long nr_soft_reclaimed;
3230		unsigned long nr_soft_scanned;
3231		gfp_t orig_mask;
3232		pg_data_t *last_pgdat = NULL;
3233
3234		/*
3235		 * If the number of buffer_heads in the machine exceeds the maximum
3236		 * allowed level, force direct reclaim to scan the highmem zone as
3237		 * highmem pages could be pinning lowmem pages storing buffer_heads
3238		 */
3239		orig_mask = sc->gfp_mask;
3240		if (buffer_heads_over_limit) {
3241			sc->gfp_mask |= __GFP_HIGHMEM;
3242			sc->reclaim_idx = gfp_zone(sc->gfp_mask);
3243		}
3244
3245		for_each_zone_zonelist_nodemask(zone, z, zonelist,
3246				sc->reclaim_idx, sc->nodemask) {
3247			/*
3248			 * Take care that memory controller (memcg) reclaim has only
3249			 * a small influence on the global LRU.
3250			 */
3251			if (global_reclaim(sc)) {
3252				if (!cpuset_zone_allowed(zone,
3253							GFP_KERNEL | __GFP_HARDWALL))
3254					continue;
3255
3256				/*
3257				 * If we already have plenty of memory free for
3258				 * compaction in this zone, don't free any more.
3259				 * Even though compaction is invoked for any
3260				 * non-zero order, only frequent costly order
3261				 * reclamation is disruptive enough to become a
3262				 * noticeable problem, like transparent huge
3263				 * page allocations.
3264				 */
3265				if (IS_ENABLED(CONFIG_COMPACTION) &&
3266						sc->order > PAGE_ALLOC_COSTLY_ORDER &&
3267						compaction_ready(zone, sc)) {
3268					sc->compaction_ready = true;
3269					continue;
3270				}
3271
3272				/*
3273				 * Shrink each node in the zonelist once. If the
3274				 * zonelist is ordered by zone (not the default) then a
3275				 * node may be shrunk multiple times but in that case
3276				 * the user prefers lower zones being preserved.
3277				 */
3278				if (zone->zone_pgdat == last_pgdat)
3279					continue;
3280
3281				/*
3282				 * This steals pages from memory cgroups over softlimit
3283				 * and returns the number of reclaimed pages and
3284				 * scanned pages. This works for global memory pressure
3285				 * and balancing, not for a memcg's limit.
3286				 */
3287				nr_soft_scanned = 0;
3288				nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
3289						sc->order, sc->gfp_mask,
3290						&nr_soft_scanned);
3291				sc->nr_reclaimed += nr_soft_reclaimed;
3292				sc->nr_scanned += nr_soft_scanned;
3293				/* need some check to avoid more shrink_zone() */
3294			}
3295
3296			/* See comment about same check for global reclaim above */
3297			if (zone->zone_pgdat == last_pgdat)
3298				continue;
3299			last_pgdat = zone->zone_pgdat;
3300			shrink_node(zone->zone_pgdat, sc);
3301		}
3302
3303		/*
3304		 * Restore to original mask to avoid the impact on the caller if we
3305		 * promoted it to __GFP_HIGHMEM.
3306		 */
3307		sc->gfp_mask = orig_mask;
3308	}
3309
3310	static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
3311	{
3312		struct mem_cgroup *memcg;
3313
3314		memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
3315		do {
3316			unsigned long refaults;
3317			struct lruvec *lruvec;
3318
3319			if (memcg)
3320				refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
3321			else
3322				refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
3323
3324			lruvec = mem_cgroup_lruvec(pgdat, memcg);
3325			lruvec->refaults = refaults;
3326		} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
3327	}
3328
3329	/*
3330	 * This is the main entry point to direct page reclaim.
3331	 *
3332	 * If a full scan of the inactive list fails to free enough memory then we
3333	 * are "out of memory" and something needs to be killed.
3334	 *
3335	 * If the caller is !__GFP_FS then the probability of a failure is reasonably
3336	 * high - the zone may be full of dirty or under-writeback pages, which this
3337	 * caller can't do much about.  We kick the writeback threads and take explicit
3338	 * naps in the hope that some of these pages can be written.  But if the
3339	 * allocating task holds filesystem locks which prevent writeout this might not
3340	 * work, and the allocation attempt will fail.
3341	 *
3342	 * returns:	0, if no pages reclaimed
	 * 		1, if nothing was reclaimed but reclaim was cut short
	 * 		   because sc->compaction_ready was set
3343	 * 		else, the number of pages reclaimed
3344	 */
3345	static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3346			struct scan_control *sc)
3347	{
3348		int initial_priority = sc->priority;
3349		pg_data_t *last_pgdat;
3350		struct zoneref *z;
3351		struct zone *zone;
3352retry:
3353		delayacct_freepages_start();
3354
3355		if (global_reclaim(sc))
3356			__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3357
		/*
		 * Scan with sc->priority counting down to 0, stopping early once
		 * the reclaim target is met or compaction is ready.
		 */
3358		do {
3359			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3360					sc->priority);
3361			sc->nr_scanned = 0;
3362			shrink_zones(zonelist, sc);
3363
3364			if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3365				break;
3366
3367			if (sc->compaction_ready)
3368				break;
3369
3370			/*
3371			 * If we're getting trouble reclaiming, start doing
3372			 * writepage even in laptop mode.
3373			 */
3374			if (sc->priority < DEF_PRIORITY - 2)
3375				sc->may_writepage = 1;
3376		} while (--sc->priority >= 0);
3377
		/* Snapshot refaults once per distinct node in the zonelist. */
3378		last_pgdat = NULL;
3379		for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3380				sc->nodemask) {
3381			if (zone->zone_pgdat == last_pgdat)
3382				continue;
3383			last_pgdat = zone->zone_pgdat;
3384			snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3385		}
3386
3387		delayacct_freepages_end();
3388
3389		if (sc->nr_reclaimed)
3390			return sc->nr_reclaimed;
3391
3392		/* Aborted reclaim to try compaction? don't OOM, then */
3393		if (sc->compaction_ready)
3394			return 1;
3395
3396		/* Untapped cgroup reserves?  Don't OOM, retry. */
		/*
		 * The retry restarts from the initial priority with
		 * memcg_low_reclaim set, so groups skipped in the first round
		 * are scanned this time.
		 */
3397		if (sc->memcg_low_skipped) {
3398			sc->priority = initial_priority;
3399			sc->memcg_low_reclaim = 1;
3400			sc->memcg_low_skipped = 0;
3401			goto retry;
3402		}
3403
3404		return 0;
3405	}
3406
3407	static bool allow_direct_reclaim(pg_data_t *pgdat)
3408	{
3409		struct zone *zone;
3410		unsigned long pfmemalloc_reserve = 0;
3411		unsigned long free_pages = 0;
3412		int i;
3413		bool wmark_ok;
3414
3415		if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3416			return true;
3417
3418		for (i = 0; i <= ZONE_NORMAL; i++) {
3419			zone = &pgdat->node_zones[i];
3420			if (!managed_zone(zone))
3421				continue;
3422
3423			if (!zone_reclaimable_pages(zone))
3424				continue;
3425
3426			pfmemalloc_reserve += min_wmark_pages(zone);
3427			free_pages += zone_page_state(zone, NR_FREE_PAGES);
3428		}
3429
3430		/* If there are no reserves (unexpected config) then do not throttle */
3431		if (!pfmemalloc_reserve)
3432			return true;
3433
3434		wmark_ok = free_pages > pfmemalloc_reserve / 2;
3435
3436		/* kswapd must be awake if processes are being throttled */
3437		if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3438			pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
3439					(enum zone_type)ZONE_NORMAL);
3440			wake_up_interruptible(&pgdat->kswapd_wait);
3441		}
3442
3443		return wmark_ok;
3444	}
3445
3446	/*
3447	 * Throttle direct reclaimers if backing storage is backed by the network
3448	 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
3449	 * depleted. kswapd will continue to make progress and wake the processes
3450	 * when the low watermark is reached.
3451	 *
3452	 * Returns true if a fatal signal was delivered during throttling. If this
3453	 * happens, the page allocator should not consider triggering the OOM killer.
3454	 */
3455	static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3456			nodemask_t *nodemask)
3457	{
3458		struct zoneref *z;
3459		struct zone *zone;
3460		pg_data_t *pgdat = NULL;
3461
3462		/*
3463		 * Kernel threads should not be throttled as they may be indirectly
3464		 * responsible for cleaning pages necessary for reclaim to make forward
3465		 * progress. kjournald for example may enter direct reclaim while
3466		 * committing a transaction where throttling it could forcing other
3467		 * processes to block on log_wait_commit().
3468		 */
3469		if (current->flags & PF_KTHREAD)
3470			goto out;
3471
3472		/*
3473		 * If a fatal signal is pending, this process should not throttle.
3474		 * It should return quickly so it can exit and free its memory
3475		 */
3476		if (fatal_signal_pending(current))
3477			goto out;
3478
3479		/*
3480		 * Check if the pfmemalloc reserves are ok by finding the first node
3481		 * with a usable ZONE_NORMAL or lower zone. The expectation is that
3482		 * GFP_KERNEL will be required for allocating network buffers when
3483		 * swapping over the network so ZONE_HIGHMEM is unusable.
3484		 *
3485		 * Throttling is based on the first usable node and throttled processes
3486		 * wait on a queue until kswapd makes progress and wakes them. There
3487		 * is an affinity then between processes waking up and where reclaim
3488		 * progress has been made assuming the process wakes on the same node.
3489		 * More importantly, processes running on remote nodes will not compete
3490		 * for remote pfmemalloc reserves and processes on different nodes
3491		 * should make reasonable progress.
3492		 */
3493		for_each_zone_zonelist_nodemask(zone, z, zonelist,
3494				gfp_zone(gfp_mask), nodemask) {
3495			if (zone_idx(zone) > ZONE_NORMAL)
3496				continue;
3497
3498			/* Throttle based on the first usable node */
3499			pgdat = zone->zone_pgdat;
3500			if (allow_direct_reclaim(pgdat))
3501				goto out;
3502			break;
3503		}
3504
3505		/* If no zone was usable by the allocation flags then do not throttle */
3506		if (!pgdat)
3507			goto out;
3508
3509		/* Account for the throttling */
3510		count_vm_event(PGSCAN_DIRECT_THROTTLE);
3511
3512		/*
3513		 * If the caller cannot enter the filesystem, it's possible that it
3514		 * is due to the caller holding an FS lock or performing a journal
3515		 * transaction in the case of a filesystem like ext[3|4]. In this case,
3516		 * it is not safe to block on pfmemalloc_wait as kswapd could be
3517		 * blocked waiting on the same lock. Instead, throttle for up to a
3518		 * second before continuing.
3519		 */
3520		if (!(gfp_mask & __GFP_FS)) {
3521			wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3522					allow_direct_reclaim(pgdat), HZ);
3523
3524			goto check_pending;
3525		}
3526
3527		/* Throttle until kswapd wakes the process */
3528		wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3529				allow_direct_reclaim(pgdat));
3530
3531check_pending:
3532		if (fatal_signal_pending(current))
3533			return true;
3534
3535out:
3536		return false;
3537	}
3538
3539	unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3540			gfp_t gfp_mask, nodemask_t *nodemask)
3541	{
3542		unsigned long nr_reclaimed;
3543		struct scan_control sc = {
3544			.nr_to_reclaim = SWAP_CLUSTER_MAX,
3545			.gfp_mask = current_gfp_context(gfp_mask),
3546			.reclaim_idx = gfp_zone(gfp_mask),
3547			.order = order,
3548			.nodemask = nodemask,
3549			.priority = DEF_PRIORITY,
3550			.may_writepage = !laptop_mode,
3551			.may_unmap = 1,
3552			.may_swap = 1,
3553		};
3554
3555		/*
3556		 * Do not enter reclaim if fatal signal was delivered while throttled.
3557		 * 1 is returned so that the page allocator does not OOM kill at this
3558		 * point.
3559		 */
3560		if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3561			return 1;
3562
3563		trace_mm_vmscan_direct_reclaim_begin(order,
3564				sc.may_writepage,
3565				sc.gfp_mask,
3566				sc.reclaim_idx);
3567
3568		nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3569
3570		trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3571
3572		return nr_reclaimed;
3573	}
3574
3575#ifdef CONFIG_MEMCG
3576
3577	unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3578			gfp_t gfp_mask, bool noswap,
3579			pg_data_t *pgdat,
3580			unsigned long *nr_scanned)
3581	{
3582		struct scan_control sc = {
3583			.nr_to_reclaim = SWAP_CLUSTER_MAX,
3584			.target_mem_cgroup = memcg,
3585			.may_writepage = !laptop_mode,
3586			.may_unmap = 1,
3587			.reclaim_idx = MAX_NR_ZONES - 1,
3588			.may_swap = !noswap,
3589		};
3590		unsigned long lru_pages;
3591
3592		sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3593			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3594
3595		trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3596				sc.may_writepage,
3597				sc.gfp_mask,
3598				sc.reclaim_idx);
3599
3600		/*
3601		 * NOTE: Although we can get the priority field, using it
3602		 * here is not a good idea, since it limits the pages we can scan.
3603		 * if we don't reclaim here, the shrink_node from balance_pgdat
3604		 * will pick up pages from other mem cgroup's as well. We hack
3605		 * the priority and make it zero.
3606		 */
3607		shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3608
3609		trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3610
3611		*nr_scanned = sc.nr_scanned;
3612		return sc.nr_reclaimed;
3613	}
3614
3615	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3616			unsigned long nr_pages,
3617			gfp_t gfp_mask,
3618			bool may_swap)
3619	{
3620		struct zonelist *zonelist;
3621		unsigned long nr_reclaimed;
3622		int nid;
3623		unsigned int noreclaim_flag;
3624		struct scan_control sc = {
3625			.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3626			.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3627				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3628			.reclaim_idx = MAX_NR_ZONES - 1,
3629			.target_mem_cgroup = memcg,
3630			.priority = DEF_PRIORITY,
3631			.may_writepage = !laptop_mode,
3632			.may_unmap = 1,
3633			.may_swap = may_swap,
3634		};
3635
3636		/*
3637		 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
3638		 * take care of from where we get pages. So the node where we start the
3639		 * scan does not need to be the current node.
3640		 */
3641		nid = mem_cgroup_select_victim_node(memcg);
3642
3643		zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3644
3645		trace_mm_vmscan_memcg_reclaim_begin(0,
3646				sc.may_writepage,
3647				sc.gfp_mask,
3648				sc.reclaim_idx);
3649
3650		noreclaim_flag = memalloc_noreclaim_save();
3651		nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3652		memalloc_noreclaim_restore(noreclaim_flag);
3653
3654		trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3655
3656		return nr_reclaimed;
3657	}
3658#endif
3659
3660	static void age_active_anon(struct pglist_data *pgdat,
3661			struct scan_control *sc)
3662	{
3663		struct mem_cgroup *memcg;
3664
3665		if (!total_swap_pages)
3666			return;
3667
3668		memcg = mem_cgroup_iter(NULL, NULL, NULL);
3669		do {
3670			struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3671
3672			if (inactive_list_is_low(lruvec, false, memcg, sc, true))
3673				shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3674						sc, LRU_ACTIVE_ANON);
3675
3676			memcg = mem_cgroup_iter(NULL, memcg, NULL);
3677		} while (memcg);
3678	}
3679
3680	/*
3681	 * Returns true if there is an eligible zone balanced for the request order
3682	 * and classzone_idx
3683	 */
3684	static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3685	{
3686		int i;
3687		unsigned long mark = -1;
3688		struct zone *zone;
3689
3690		for (i = 0; i <= classzone_idx; i++) {
3691			zone = pgdat->node_zones + i;
3692
3693			if (!managed_zone(zone))
3694				continue;
3695
3696			mark = high_wmark_pages(zone);
3697			if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3698				return true;
3699		}
3700
3701		/*
3702		 * If a node has no populated zone within classzone_idx, it does not
3703		 * need balancing by definition. This can happen if a zone-restricted
3704		 * allocation tries to wake a remote kswapd.
3705		 */
3706		if (mark == -1)
3707			return true;
3708
3709		return false;
3710	}
3711
3712	/* Clear pgdat state for congested, dirty or under writeback. */
3713	static void clear_pgdat_congested(pg_data_t *pgdat)
3714	{
3715		clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3716		clear_bit(PGDAT_DIRTY, &pgdat->flags);
3717		clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3718	}
3719
3720	/*
3721	 * Prepare kswapd for sleeping. This verifies that there are no processes
3722	 * waiting in throttle_direct_reclaim() and that watermarks have been met.
3723	 *
3724	 * Returns true if kswapd is ready to sleep
3725	 */
3726	static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3727	{
3728		/*
3729		 * The throttled processes are normally woken up in balance_pgdat() as
3730		 * soon as allow_direct_reclaim() is true. But there is a potential
3731		 * race between when kswapd checks the watermarks and a process gets
3732		 * throttled. There is also a potential race if processes get
3733		 * throttled, kswapd wakes, a large process exits thereby balancing the
3734		 * zones, which causes kswapd to exit balance_pgdat() before reaching
3735		 * the wake up checks. If kswapd is going to sleep, no process should
3736		 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
3737		 * the wake up is premature, processes will wake kswapd and get
3738		 * throttled again. The difference from wake ups in balance_pgdat() is
3739		 * that here we are under prepare_to_wait().
3740		 */
3741		if (waitqueue_active(&pgdat->pfmemalloc_wait))
3742			wake_up_all(&pgdat->pfmemalloc_wait);
3743
3744		/* Hopeless node, leave it to direct reclaim */
3745		if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3746			return true;
3747
3748		if (pgdat_balanced(pgdat, order, classzone_idx))
3749		/*if (pgdat_balanced(pgdat, order, classzone_idx) &&
3750				(!populated_zone(&pgdat->node_zones[ZONE_PMEM]))) ||
3751										    zone_balanced(&pgdat->node_zones[ZONE_NORMAL], order, 0, classzone_idx)))*/
3752		{
3753			clear_pgdat_congested(pgdat);
3754			return true;
3755		}
3756
3757		return false;
3758	}
3759
3760	/*
3761	 * kswapd shrinks a node of pages that are at or below the highest usable
3762	 * zone that is currently unbalanced.
3763	 *
3764	 * Returns true if kswapd scanned at least the requested number of pages to
3765	 * reclaim or if the lack of progress was due to pages under writeback.
3766	 * This is used to determine if the scanning priority needs to be raised.
3767	 */
3768	static bool kswapd_shrink_node(pg_data_t *pgdat,
3769			struct scan_control *sc)
3770	{
3771		struct zone *zone;
3772		int z;
3773
3774		/* Reclaim a number of pages proportional to the number of zones */
3775		sc->nr_to_reclaim = 0;
3776		for (z = 0; z <= sc->reclaim_idx; z++) {
3777			zone = pgdat->node_zones + z;
3778			if (!managed_zone(zone))
3779				continue;
3780
3781			sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3782		}
3783
3784		/*
3785		 * Historically care was taken to put equal pressure on all zones but
3786		 * now pressure is applied based on node LRU order.
3787		 */
3788		shrink_node(pgdat, sc);
3789
3790		/*
3791		 * Fragmentation may mean that the system cannot be rebalanced for
3792		 * high-order allocations. If twice the allocation size has been
3793		 * reclaimed then recheck watermarks only at order-0 to prevent
3794		 * excessive reclaim. Assume that a process requested a high-order
3795		 * can direct reclaim/compact.
3796		 */
3797		if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3798			sc->order = 0;
3799
3800		return sc->nr_scanned >= sc->nr_to_reclaim;
3801	}
3802
3803	/*
3804	 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
3805	 * that are eligible for use by the caller until at least one zone is
3806	 * balanced.
3807	 *
3808	 * Returns the order kswapd finished reclaiming at.
3809	 *
3810	 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
3811	 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
3812	 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
3813	 * or lower is eligible for reclaim until at least one usable zone is
3814	 * balanced.
	 *
	 * The loop starts at DEF_PRIORITY and ends once sc.priority drops below 1,
	 * the node is found balanced, or kswapd has to freeze or stop.  If the loop
	 * runs out without anything having been reclaimed, pgdat->kswapd_failures
	 * is incremented.
3815	 */
3816	static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3817	{
3818		int i;
3819		unsigned long nr_soft_reclaimed;
3820		unsigned long nr_soft_scanned;
3821		struct zone *zone;
3822		struct scan_control sc = {
3823			.gfp_mask = GFP_KERNEL,
3824			.order = order,
3825			.priority = DEF_PRIORITY,
3826			.may_writepage = !laptop_mode,
3827			.may_unmap = 1,
3828			.may_swap = 1,
3829		};
3830		count_vm_event(PAGEOUTRUN);
3831
3832		do {
			/* Total reclaimed so far; turned into this pass's delta below. */
3833			unsigned long nr_reclaimed = sc.nr_reclaimed;
3834			bool raise_priority = true;
3835
3836			sc.reclaim_idx = classzone_idx;
3837
3838			/*
3839			 * If the number of buffer_heads exceeds the maximum allowed
3840			 * then consider reclaiming from all zones. This has a dual
3841			 * purpose -- on 64-bit systems it is expected that
3842			 * buffer_heads are stripped during active rotation. On 32-bit
3843			 * systems, highmem pages can pin lowmem memory and shrinking
3844			 * buffers can relieve lowmem pressure. Reclaim may still not
3845			 * go ahead if all eligible zones for the original allocation
3846			 * request are balanced to avoid excessive reclaim from kswapd.
3847			 */
3848			if (buffer_heads_over_limit) {
				/* Pick the highest managed zone of the node. */
3849				for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3850					zone = pgdat->node_zones + i;
3851					if (!managed_zone(zone))
3852						continue;
3853
3854					sc.reclaim_idx = i;
3855					break;
3856				}
3857			}
3858
3859			/*
3860			 * Only reclaim if there are no eligible zones. Note that
3861			 * sc.reclaim_idx is not used as buffer_heads_over_limit may
3862			 * have adjusted it.
3863			 */
3864			if (pgdat_balanced(pgdat, sc.order, classzone_idx))
3865				goto out;
3866
3867			/*
3868			 * Do some background aging of the anon list, to give
3869			 * pages a chance to be referenced before reclaiming. All
3870			 * pages are rotated regardless of classzone as this is
3871			 * about consistent aging.
3872			 */
3873			age_active_anon(pgdat, &sc);
3874
3875			/*
3876			 * If we're getting trouble reclaiming, start doing writepage
3877			 * even in laptop mode.
3878			 */
3879			if (sc.priority < DEF_PRIORITY - 2)
3880				sc.may_writepage = 1;
3881
3882			/* Call soft limit reclaim before calling shrink_node. */
			/*
			 * NOTE(review): nr_soft_scanned is collected but not added
			 * to sc.nr_scanned here, unlike in shrink_zones() -- confirm
			 * this is intended.
			 */
3883			sc.nr_scanned = 0;
3884			nr_soft_scanned = 0;
3885			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3886					sc.gfp_mask, &nr_soft_scanned);
3887			sc.nr_reclaimed += nr_soft_reclaimed;
3888
3889			/*
3890			 * There should be no need to raise the scanning priority if
3891			 * enough pages are already being scanned that the high
3892			 * watermark would be met at 100% efficiency.
3893			 */
3894			if (kswapd_shrink_node(pgdat, &sc))
3895				raise_priority = false;
3896
3897			/*
3898			 * If the low watermark is met there is no need for processes
3899			 * to be throttled on pfmemalloc_wait as they should now be
3900			 * able to safely make forward progress. Wake them
3901			 */
3902			if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3903					allow_direct_reclaim(pgdat))
3904				wake_up_all(&pgdat->pfmemalloc_wait);
3905
3906			/* Check if kswapd should be suspending */
3907			if (try_to_freeze() || kthread_should_stop())
3908				break;
3909
3910			/*
3911			 * Raise priority if scanning rate is too low or there was no
3912			 * progress in reclaiming pages
3913			 */
3914			nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3915			if (raise_priority || !nr_reclaimed)
3916				sc.priority--;
3917		} while (sc.priority >= 1);
3918
		/* Not reached via "goto out": a balanced node is never counted as a failure. */
3919		if (!sc.nr_reclaimed)
3920			pgdat->kswapd_failures++;
3921
3922out:
3923		snapshot_refaults(NULL, pgdat);
3924		/*
3925		 * Return the order kswapd stopped reclaiming at as
3926		 * prepare_kswapd_sleep() takes it into account. If another caller
3927		 * entered the allocator slow path while kswapd was awake, order will
3928		 * remain at the higher level.
3929		 */
3930		return sc.order;
3931	}
3932
3933	/*
3934	 * pgdat->kswapd_classzone_idx is the highest zone index that a recent
3935	 * allocation request woke kswapd for. When kswapd has not woken recently,
3936	 * the value is MAX_NR_ZONES which is not a valid index. This compares a
3937	 * given classzone and returns it or the highest classzone index kswapd
3938	 * was recently woke for.
3939	 */
3940	static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3941			enum zone_type classzone_idx)
3942	{
3943		if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3944			return classzone_idx;
3945
3946		return max(pgdat->kswapd_classzone_idx, classzone_idx);
3947	}
3948
3949	static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3950			unsigned int classzone_idx)
3951	{
3952		//printk("MC_Debug_kswapd: kswapd_try_to_sleep\n");
3953		long remaining = 0;
3954		DEFINE_WAIT(wait);
3955
3956		if (freezing(current) || kthread_should_stop())
3957			return;
3958
3959		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3960
3961		/*
3962		 * Try to sleep for a short interval. Note that kcompactd will only be
3963		 * woken if it is possible to sleep for a short interval. This is
3964		 * deliberate on the assumption that if reclaim cannot keep an
3965		 * eligible zone balanced that it's also unlikely that compaction will
3966		 * succeed.
3967		 */
3968		if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3969			/*
3970			 * Compaction records what page blocks it recently failed to
3971			 * isolate pages from and skips them in the future scanning.
3972			 * When kswapd is going to sleep, it is reasonable to assume
3973			 * that pages and compaction may succeed so reset the cache.
3974			 */
3975			reset_isolation_suitable(pgdat);
3976
3977			/*
3978			 * We have freed the memory, now we should compact it to make
3979			 * allocation of the requested order possible.
3980			 */
3981			wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3982
3983			remaining = schedule_timeout(HZ);
3984
3985			/*
3986			 * If woken prematurely then reset kswapd_classzone_idx and
3987			 * order. The values will either be from a wakeup request or
3988			 * the previous request that slept prematurely.
3989			 */
3990			if (remaining) {
3991				pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3992				pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3993			}
3994
3995			finish_wait(&pgdat->kswapd_wait, &wait);
3996			prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3997		}
3998
3999		/*
4000		 * After a short sleep, check if it was a premature sleep. If not, then
4001		 * go fully to sleep until explicitly woken up.
4002		 */
4003		if (!remaining &&
4004				prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
4005			trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
4006			printk("MC_Debug_kswapd: going for long sleep\n");
4007
4008			/*
4009			 * vmstat counters are not perfectly accurate and the estimated
4010			 * value for counters such as NR_FREE_PAGES can deviate from the
4011			 * true value by nr_online_cpus * threshold. To avoid the zone
4012			 * watermarks being breached while under pressure, we reduce the
4013			 * per-cpu vmstat threshold while kswapd is awake and restore
4014			 * them before going back to sleep.
4015			 */
4016			set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
4017
4018			if (!kthread_should_stop())
4019				schedule();
4020
4021			set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
4022		} else {
4023			if (remaining)
4024				count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
4025			else
4026				count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
4027		}
4028		finish_wait(&pgdat->kswapd_wait, &wait);
4029	}
4030
4031	/*
4032	 * The background pageout daemon, started as a kernel thread
4033	 * from the init process.
4034	 *
4035	 * This basically trickles out pages so that we have _some_
4036	 * free memory available even if there is no other activity
4037	 * that frees anything up. This is needed for things like routing
4038	 * etc, where we otherwise might have all activity going on in
4039	 * asynchronous contexts that cannot page things out.
4040	 *
4041	 * If there are applications that are active memory-allocators
4042	 * (most normal use), this basically shouldn't matter.
4043	 */
4044	static int kswapd(void *p)
4045	{
4046
4047		//printk("MC_Debug_kswapd: kswapd\n");
4048		unsigned int alloc_order, reclaim_order;
4049		unsigned int classzone_idx = MAX_NR_ZONES - 1;
4050		pg_data_t *pgdat = (pg_data_t*)p;
4051		struct task_struct *tsk = current;
4052
4053		struct reclaim_state reclaim_state = {
4054			.reclaimed_slab = 0,
4055		};
4056		const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
4057
4058		if (!cpumask_empty(cpumask))
4059			set_cpus_allowed_ptr(tsk, cpumask);
4060		current->reclaim_state = &reclaim_state;
4061
4062		/*
4063		 * Tell the memory management that we're a "memory allocator",
4064		 * and that if we need more memory we should get access to it
4065		 * regardless (see "__alloc_pages()"). "kswapd" should
4066		 * never get caught in the normal page freeing logic.
4067		 *
4068		 * (Kswapd normally doesn't need memory anyway, but sometimes
4069		 * you need a small amount of memory in order to be able to
4070		 * page out something else, and this flag essentially protects
4071		 * us from recursively trying to free more memory as we're
4072		 * trying to free the first piece of memory in the first place).
4073		 */
4074		tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
4075		set_freezable();
4076
4077		pgdat->kswapd_order = 0;
4078		pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
4079		//unsigned long cnt = 0;
4080		for ( ; ; ) {
4081			//printk("MC_Debug_kswapd: inside loop: %lu\n", cnt++);
4082			bool ret;
4083
4084			alloc_order = reclaim_order = pgdat->kswapd_order;
4085			classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
4086
4087kswapd_try_sleep:
4088			kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
4089					classzone_idx);
4090
4091			/* Read the new order and classzone_idx */
4092			alloc_order = reclaim_order = pgdat->kswapd_order;
4093			classzone_idx = kswapd_classzone_idx(pgdat, 0);
4094			pgdat->kswapd_order = 0;
4095			pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
4096
4097			ret = try_to_freeze();
4098			if (kthread_should_stop())
4099				break;
4100
4101			/*
4102			 * We can speed up thawing tasks if we don't call balance_pgdat
4103			 * after returning from the refrigerator
4104			 */
4105			if (ret)
4106				continue;
4107
4108			/*
4109			 * Reclaim begins at the requested order but if a high-order
4110			 * reclaim fails then kswapd falls back to reclaiming for
4111			 * order-0. If that happens, kswapd will consider sleeping
4112			 * for the order it finished reclaiming at (reclaim_order)
4113			 * but kcompactd is woken to compact for the original
4114			 * request (alloc_order).
4115			 */
4116			trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
4117					alloc_order);
4118			fs_reclaim_acquire(GFP_KERNEL);
4119			reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
4120			fs_reclaim_release(GFP_KERNEL);
4121			if (reclaim_order < alloc_order)
4122				goto kswapd_try_sleep;
4123		}
4124
4125		tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
4126		current->reclaim_state = NULL;
4127
4128		return 0;
4129	}
4130
4131	/*
4132	 * A zone is low on free memory, so wake its kswapd task to service it.
4133	 */
4134	void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
4135	{
4136		printk("MC_Debug_kswapd: wakeup_kswapd\n");
4137		pg_data_t *pgdat;
4138
4139		if (!managed_zone(zone))
4140			return;
4141
4142		if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
4143			return;
4144		pgdat = zone->zone_pgdat;
4145		pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
4146				classzone_idx);
4147		pgdat->kswapd_order = max(pgdat->kswapd_order, order);
4148		if (!waitqueue_active(&pgdat->kswapd_wait))
4149			return;
4150
4151		/* Hopeless node, leave it to direct reclaim */
4152		if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
4153			return;
4154
4155		if (pgdat_balanced(pgdat, order, classzone_idx))
4156			return;
4157
4158		trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
4159		wake_up_interruptible(&pgdat->kswapd_wait);
4160	}
4161
#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' pages system-wide and return how many pages
 * were freed.
 *
 * The goal is not to age the LRUs but to keep the overall LRU order intact,
 * reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct scan_control sc = {
		.nr_to_reclaim = nr_to_reclaim,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.priority = DEF_PRIORITY,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.hibernation_mode = 1,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	struct task_struct *tsk = current;
	struct reclaim_state reclaim_state;
	unsigned int saved_flags;
	unsigned long freed;

	/* Enter reclaim context; undone in reverse order below. */
	saved_flags = memalloc_noreclaim_save();
	fs_reclaim_acquire(sc.gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	tsk->reclaim_state = &reclaim_state;

	freed = do_try_to_free_pages(zonelist, &sc);

	tsk->reclaim_state = NULL;
	fs_reclaim_release(sc.gfp_mask);
	memalloc_noreclaim_restore(saved_flags);

	return freed;
}
#endif /* CONFIG_HIBERNATION */
4203
4204	/* It's optimal to keep kswapds on the same CPUs as their memory, but
4205	   not required for correctness.  So if the last cpu in a node goes
4206	   away, we get changed to run anywhere: as the first one comes back,
4207	   restore their cpu bindings. */
4208	static int kswapd_cpu_online(unsigned int cpu)
4209	{
4210		//printk("MC_Debug_kswapd: kswapd_online\n");
4211		int nid;
4212
4213		for_each_node_state(nid, N_MEMORY) {
4214			pg_data_t *pgdat = NODE_DATA(nid);
4215			const struct cpumask *mask;
4216
4217			mask = cpumask_of_node(pgdat->node_id);
4218
4219			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
4220				/* One of our CPUs online: restore mask */
4221				set_cpus_allowed_ptr(pgdat->kswapd, mask);
4222		}
4223		return 0;
4224	}
4225
4226	/*
4227	 * This kswapd start function will be called by init and node-hot-add.
4228	 * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
4229	 */
4230	int kswapd_run(int nid)
4231	{
4232		//printk("MC_Debug_kswapd: kswapd_run\n");
4233		pg_data_t *pgdat = NODE_DATA(nid);
4234		int ret = 0;
4235
4236		if (pgdat->kswapd)
4237			return 0;
4238
4239		pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4240		if (IS_ERR(pgdat->kswapd)) {
4241			/* failure at boot is fatal */
4242			BUG_ON(system_state < SYSTEM_RUNNING);
4243			pr_err("Failed to start kswapd on node %d\n", nid);
4244			ret = PTR_ERR(pgdat->kswapd);
4245			pgdat->kswapd = NULL;
4246		}
4247
4248		if (pgdat->kpromoted)
4249			return 0;
4250
4251		pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
4252		if (IS_ERR(pgdat->kpromoted)) {
4253			/* failure at boot is fatal */
4254			//printk("MC_Debug_4156: kpromoted error: failed to start\n");
4255			BUG_ON(system_state == SYSTEM_BOOTING);
4256			//pr_err("Failed to start kpromoted on node %d\n", nid);
4257			ret = PTR_ERR(pgdat->kpromoted);
4258			pgdat->kpromoted = NULL;
4259		}
4260		return ret;
4261	}
4262
4263	/*
4264	 * Called by memory hotplug when all memory in a node is offlined.  Caller must
4265	 * hold mem_hotplug_begin/end().
4266	 */
4267	void kswapd_stop(int nid)
4268	{
4269		//printk("MC_Debug_kswapd: kswapd_stop\n");
4270		struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4271		struct task_struct *kpromoted = NODE_DATA(nid)->kpromoted;
4272
4273		if (kswapd) {
4274			kthread_stop(kswapd);
4275			NODE_DATA(nid)->kswapd = NULL;
4276		}
4277
4278		if (kpromoted) {
4279			kthread_stop(kpromoted);
4280			NODE_DATA(nid)->kpromoted = NULL;
4281		}
4282	}
4283
4284	static int __init kswapd_init(void)
4285	{
4286
4287		//printk("MC_Debug_kswapd: kswapd_init\n");
4288		int nid, ret;
4289
4290		swap_setup();
4291		for_each_node_state(nid, N_MEMORY)
4292			kswapd_run(nid);
4293		ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
4294				"mm/vmscan:online", kswapd_cpu_online,
4295				NULL);
4296		WARN_ON(ret < 0);
4297		return 0;
4298	}
4299
4300	module_init(kswapd_init)
4301
4302#ifdef CONFIG_NUMA
4303		/*
4304		 * Node reclaim mode
4305		 *
4306		 * If non-zero call node_reclaim when the number of free pages falls below
4307		 * the watermarks.
4308		 */
4309		int node_reclaim_mode __read_mostly;
4310
4311#define RECLAIM_OFF 0
4312#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
4313#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
4314#define RECLAIM_UNMAP (1<<2)	/* Unmap pages during reclaim */
4315
4316	/*
4317	 * Priority for NODE_RECLAIM. This determines the fraction of pages
4318	 * of a node considered for each zone_reclaim. 4 scans 1/16th of
4319	 * a zone.
4320	 */
4321#define NODE_RECLAIM_PRIORITY 4
4322
4323	/*
4324	 * Percentage of pages in a zone that must be unmapped for node_reclaim to
4325	 * occur.
4326	 */
4327	int sysctl_min_unmapped_ratio = 1;
4328
4329	/*
4330	 * If the number of slab pages in a zone grows beyond this percentage then
4331	 * slab reclaim needs to occur.
4332	 */
4333	int sysctl_min_slab_ratio = 5;
4334
4335	static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4336	{
4337		unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4338		unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4339			node_page_state(pgdat, NR_ACTIVE_FILE);
4340
4341		/*
4342		 * It's possible for there to be more file mapped pages than
4343		 * accounted for by the pages on the file LRU lists because
4344		 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
4345		 */
4346		return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4347	}
4348
4349	/* Work out how many page cache pages we can reclaim in this reclaim_mode */
4350	static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4351	{
4352		unsigned long nr_pagecache_reclaimable;
4353		unsigned long delta = 0;
4354
4355		/*
4356		 * If RECLAIM_UNMAP is set, then all file pages are considered
4357		 * potentially reclaimable. Otherwise, we have to worry about
4358		 * pages like swapcache and node_unmapped_file_pages() provides
4359		 * a better estimate
4360		 */
4361		if (node_reclaim_mode & RECLAIM_UNMAP)
4362			nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4363		else
4364			nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4365
4366		/* If we can't clean pages, remove dirty pages from consideration */
4367		if (!(node_reclaim_mode & RECLAIM_WRITE))
4368			delta += node_page_state(pgdat, NR_FILE_DIRTY);
4369
4370		/* Watch for any possible underflows due to delta */
4371		if (unlikely(delta > nr_pagecache_reclaimable))
4372			delta = nr_pagecache_reclaimable;
4373
4374		return nr_pagecache_reclaimable - delta;
4375	}
4376
4377	/*
4378	 * Try to free up some pages from this node through reclaim.
4379	 */
4380	static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4381	{
4382		/* Minimum pages needed in order to stay on node */
4383		const unsigned long nr_pages = 1 << order;
4384		struct task_struct *p = current;
4385		struct reclaim_state reclaim_state;
4386		unsigned int noreclaim_flag;
4387		struct scan_control sc = {
4388			.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4389			.gfp_mask = current_gfp_context(gfp_mask),
4390			.order = order,
4391			.priority = NODE_RECLAIM_PRIORITY,
4392			.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4393			.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4394			.may_swap = 1,
4395			.reclaim_idx = gfp_zone(gfp_mask),
4396		};
4397
4398		cond_resched();
4399		/*
4400		 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
4401		 * and we also need to be able to write out pages for RECLAIM_WRITE
4402		 * and RECLAIM_UNMAP.
4403		 */
4404		noreclaim_flag = memalloc_noreclaim_save();
4405		p->flags |= PF_SWAPWRITE;
4406		fs_reclaim_acquire(sc.gfp_mask);
4407		reclaim_state.reclaimed_slab = 0;
4408		p->reclaim_state = &reclaim_state;
4409
4410		if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4411			/*
4412			 * Free memory by calling shrink zone with increasing
4413			 * priorities until we have enough memory freed.
4414			 */
4415			do {
4416				shrink_node(pgdat, &sc);
4417			} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4418		}
4419
4420		p->reclaim_state = NULL;
4421		fs_reclaim_release(gfp_mask);
4422		current->flags &= ~PF_SWAPWRITE;
4423		memalloc_noreclaim_restore(noreclaim_flag);
4424		return sc.nr_reclaimed >= nr_pages;
4425	}
4426
4427	int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4428	{
4429		int ret;
4430
4431		/*
4432		 * Node reclaim reclaims unmapped file backed pages and
4433		 * slab pages if we are over the defined limits.
4434		 *
4435		 * A small portion of unmapped file backed pages is needed for
4436		 * file I/O otherwise pages read by file I/O will be immediately
4437		 * thrown out if the node is overallocated. So we do not reclaim
4438		 * if less than a specified percentage of the node is used by
4439		 * unmapped file backed pages.
4440		 */
4441		if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4442				node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4443			return NODE_RECLAIM_FULL;
4444
4445		/*
4446		 * Do not scan if the allocation should not be delayed.
4447		 */
4448		if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4449			return NODE_RECLAIM_NOSCAN;
4450
4451		/*
4452		 * Only run node reclaim on the local node or on nodes that do not
4453		 * have associated processors. This will favor the local processor
4454		 * over remote processors and spread off node memory allocations
4455		 * as wide as possible.
4456		 */
4457		if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4458			return NODE_RECLAIM_NOSCAN;
4459
4460		if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4461			return NODE_RECLAIM_NOSCAN;
4462
4463		ret = __node_reclaim(pgdat, gfp_mask, order);
4464		clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4465
4466		if (!ret)
4467			count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4468
4469		return ret;
4470	}
4471#endif
4472
/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Tells whether the page belongs on the active/inactive lists (evictable)
 * or on the unevictable list.
 *
 * A page is not evictable when:
 * (1) its mapping is marked unevictable, or
 * (2) it is part of an mlocked VMA
 *
 * Returns non-zero if the page is evictable, 0 otherwise.
 */
int page_evictable(struct page *page)
{
	if (mapping_unevictable(page_mapping(page)))
		return 0;

	return !PageMlocked(page);
}
4489
#ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 * Each page's node lru_lock is held while that page is examined and moved;
 * the lock is handed over whenever consecutive pages belong to different
 * nodes, and it is dropped before returning.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		pgscanned++;
		if (pagepgdat != pgdat) {
			/* Switch to the lru_lock of this page's node. */
			if (pgdat)
				spin_unlock_irq(&pgdat->lru_lock);
			pgdat = pagepgdat;
			spin_lock_irq(&pgdat->lru_lock);
		}
		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		/* Only pages currently on the unevictable list are moved. */
		if (!PageLRU(page) || !PageUnevictable(page))
			continue;

		if (page_evictable(page)) {
			enum lru_list lru = page_lru_base_type(page);

			VM_BUG_ON_PAGE(PageActive(page), page);
			ClearPageUnevictable(page);
			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
			add_page_to_lru_list(page, lruvec, lru);
			pgrescued++;
		}
	}

	/* pgdat is only set once a lock was taken, i.e. nr_pages > 0. */
	if (pgdat) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		spin_unlock_irq(&pgdat->lru_lock);
	}
}
#endif /* CONFIG_SHMEM */