vmscan.c 72.8 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
22
#include <linux/vmstat.h>
Linus Torvalds's avatar
Linus Torvalds committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
37
#include <linux/delay.h>
38
#include <linux/kthread.h>
39
#include <linux/freezer.h>
40
#include <linux/memcontrol.h>
41
#include <linux/delayacct.h>
42
#include <linux/sysctl.h>
Linus Torvalds's avatar
Linus Torvalds committed
43
44
45
46
47
48

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

49
50
#include "internal.h"

Linus Torvalds's avatar
Linus Torvalds committed
51
52
53
54
/*
 * Per-invocation reclaim parameters and counters, passed down through
 * the shrink_*() call chain.
 */
struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Can pages be written back via ->writepage() during this scan? */
	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;

	/* 0..100 anon-vs-file reclaim bias; presumably mirrors
	 * vm_swappiness — confirm at the call sites that set it. */
	int swappiness;

	/* NOTE(review): appears to flag that no zone could be reclaimed
	 * from; not read within this chunk — verify against callers. */
	int all_unreclaimable;

	/* Allocation order of the request that triggered reclaim;
	 * compared against PAGE_ALLOC_COSTLY_ORDER below. */
	int order;

	/* Which cgroup do we reclaim from */
	struct mem_cgroup *mem_cgroup;

	/* Pluggable isolate pages callback */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active, int file);
};

/* Page at the tail of an LRU list (lists are walked back-to-front). */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
/* Prefetch a field of the next page we will visit while walking an LRU. */
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
/* As above, but prefetch for write. */
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);		\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

/* Registered slab shrinkers; list protected by shrinker_rwsem. */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

/*
 * True when reclaiming for the whole system (global LRU) rather than
 * on behalf of a memory cgroup.
 */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
#else
#define scanning_global_lru(sc)	(1)
#endif

133
134
135
static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
						  struct scan_control *sc)
{
136
	if (!scanning_global_lru(sc))
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
137
138
		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);

139
140
141
	return &zone->reclaim_stat;
}

142
143
144
static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
				   enum lru_list lru)
{
145
	if (!scanning_global_lru(sc))
146
147
		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);

148
149
150
151
	return zone_page_state(zone, NR_LRU_BASE + lru);
}


Linus Torvalds's avatar
Linus Torvalds committed
152
153
154
/*
 * Add a shrinker callback to be called from the vm.
 * The shrinker's scan balance (->nr) starts at zero; the entry is
 * appended to shrinker_list under the writer side of shrinker_rwsem.
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);
Linus Torvalds's avatar
Linus Torvalds committed
163
164
165
166

/*
 * Remove one shrinker registered with register_shrinker().
 * Takes shrinker_rwsem for writing so no shrink_slab() walk can be
 * iterating the list concurrently.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);
Linus Torvalds's avatar
Linus Torvalds committed
174
175
176
177
178
179
180
181
182
183

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem))
		return 1;	/* Assume we'll be able to shrink next time */

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		/* shrink(0, ...) queries the object count without scanning */
		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);

		/*
		 * Scale slab pressure to page pressure:
		 * delta ~= 4 * scanned * max_pass / (seeks * lru_pages)
		 */
		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
			/* overflow of the accumulated scan balance */
			printk(KERN_ERR "%s: nr=%ld\n",
					__func__, shrinker->nr);
			shrinker->nr = max_pass;
		}

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimate number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		/* Scan in SHRINK_BATCH chunks so we can reschedule between */
		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrink)(0, gfp_mask);
			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}

		/* carry the sub-batch remainder over to the next call */
		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *mapping;

	/*
	 * In somebody's page tables, or swapcache (which we are more
	 * reluctant to reclaim than plain pagecache): count as in use.
	 */
	if (page_mapped(page) || PageSwapCache(page))
		return 1;

	mapping = page_mapping(page);

	/* File is mmap'd by somebody? */
	return mapping ? mapping_mapped(mapping) : 0;
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page has exactly two references left (page cache +
	 * our isolation ref), plus one more if buffers are attached.
	 */
	int expected = 2 + !!PagePrivate(page);

	return page_count(page) == expected;
}

static int may_write_to_queue(struct backing_dev_info *bdi)
{
284
	if (current->flags & PF_SWAPWRITE)
Linus Torvalds's avatar
Linus Torvalds committed
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	/* locking pinned the mapping; recheck the page still belongs to it */
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

314
315
316
317
318
319
/* Request for sync pageout. */
enum pageout_io {
	PAGEOUT_IO_ASYNC,	/* start writeback, don't wait for it */
	PAGEOUT_IO_SYNC,	/* wait for already-started writeback too */
};

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

Linus Torvalds's avatar
Linus Torvalds committed
332
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().  The page must be locked on entry; see pageout_t
 * for the lock state on each return value.
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
						enum pageout_io sync_writeback)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 * See swapfile.c:page_queue_congested().
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (PagePrivate(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		/* non-blocking, bounded writeback on behalf of reclaim */
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			/* filesystem refused; put it back on the active list */
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		/*
		 * Wait on writeback if requested to. This happens when
		 * direct reclaiming a large contiguous area and the
		 * first attempt to free a range of pages fails.
		 */
		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
			wait_on_page_writeback(page);

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

416
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.  Caller holds the page lock and
 * the mapping must still own the page.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		/* dirtied while we looked: restore the refs and keep it */
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		/* read the swap entry before the cache drops it */
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swap_free(swap);
	} else {
		__remove_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

Nick Piggin's avatar
Nick Piggin committed
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (!__remove_mapping(mapping, page))
		return 0;

	/*
	 * Unfreezing the refcount with 1 rather than 2 effectively
	 * drops the pagecache ref for us without requiring another
	 * atomic operation.
	 */
	page_unfreeze_refs(page, 1);
	return 1;
}

496
497
498
499
500
501
502
503
504
505
506
507
508
509
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
#ifdef CONFIG_UNEVICTABLE_LRU
void putback_lru_page(struct page *page)
{
	int lru;
	int active = !!TestClearPageActive(page);
510
	int was_unevictable = PageUnevictable(page);
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550

	VM_BUG_ON(PageLRU(page));

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page, NULL)) {
		/*
		 * For evictable pages, we can use the cache.
		 * In event of a race, worst case is we end up with an
		 * unevictable page on [in]active list.
		 * We know how to handle that.
		 */
		lru = active + page_is_file_cache(page);
		lru_cache_add_lru(page, lru);
	} else {
		/*
		 * Put unevictable pages directly on zone's unevictable
		 * list.
		 */
		lru = LRU_UNEVICTABLE;
		add_page_to_unevictable_list(page);
	}

	/*
	 * page's status can change while we move it among lru. If an evictable
	 * page is on unevictable list, it never be freed. To avoid that,
	 * check after we added it to the list, again.
	 */
	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
		if (!isolate_lru_page(page)) {
			put_page(page);
			goto redo;
		}
		/* This means someone else dropped this page from LRU
		 * So, it will be freed or putback to LRU again. There is
		 * nothing to do here.
		 */
	}

551
552
553
554
555
	if (was_unevictable && lru != LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGCULLED);

556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
	put_page(page);		/* drop ref from isolate */
}

#else /* CONFIG_UNEVICTABLE_LRU */

void putback_lru_page(struct page *page)
{
	int lru;
	VM_BUG_ON(PageLRU(page));

	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
	lru_cache_add_lru(page, lru);
	put_page(page);
}
#endif /* CONFIG_UNEVICTABLE_LRU */


Linus Torvalds's avatar
Linus Torvalds committed
573
/*
 * shrink_page_list() returns the number of reclaimed pages.
 *
 * Takes a list of isolated pages, attempts to free each one (unmapping,
 * swap allocation, writeback, buffer stripping, removal from the page
 * cache) and leaves the pages it could not free back on @page_list for
 * the caller to return to the LRU.
 */
static unsigned long shrink_page_list(struct list_head *page_list,
					struct scan_control *sc,
					enum pageout_io sync_writeback)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	unsigned long nr_reclaimed = 0;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		/* somebody else holds the lock: don't wait, keep the page */
		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page, NULL)))
			goto cull_mlocked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		if (PageWriteback(page)) {
			/*
			 * Synchronous reclaim is performed in two passes,
			 * first an asynchronous pass over the list to
			 * start parallel writeback, and a second synchronous
			 * pass to wait for the IO to complete.  Wait here
			 * for any page for which writeback has already
			 * started.
			 */
			if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
				wait_on_page_writeback(page);
			else
				goto keep_locked;
		}

		referenced = page_referenced(page, 1, sc->mem_cgroup);
		/* In active use or really unfreeable?  Activate it. */
		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
					referenced && page_mapping_inuse(page))
			goto activate_locked;

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page))
				goto activate_locked;
			may_enter_fs = 1;
		}

		mapping = page_mapping(page);

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, 0)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sync_writeback)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page) || PageDirty(page))
					goto keep;
				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
				/* fallthrough: the write left the page clean */
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (PagePrivate(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;
		/* batch the actual freeing through a pagevec */
		if (!pagevec_add(&freed_pvec, page)) {
			__pagevec_free(&freed_pvec);
			pagevec_reinit(&freed_pvec);
		}
		continue;

cull_mlocked:
		/* unevictable page: drop its swap slot and put it back */
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}
	/* hand the unreclaimed pages back to the caller */
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_free(&freed_pvec);
	count_vm_events(PGACTIVATE, pgactivate);
	return nr_reclaimed;
}

Andy Whitcroft's avatar
Andy Whitcroft committed
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
/* LRU Isolation modes. */
#define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
#define ISOLATE_ACTIVE 1	/* Isolate active pages. */
#define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 * file:	true [1] when isolating file [!anon] pages
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, int mode, int file)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/*
	 * When checking the active state, we need to be sure we are
	 * dealing with comparible boolean values.  Take the logical not
	 * of each.
	 */
	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
		return ret;

	/* Reject pages from the wrong anon/file LRU, same boolean trick. */
	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
		return ret;

	/*
	 * When this function is being called for lumpy reclaim, we
	 * initially look into all LRU pages, active, inactive and
	 * unevictable; only give shrink_page_list evictable pages.
	 */
	if (PageUnevictable(page))
		return ret;

	ret = -EBUSY;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
		mem_cgroup_del_lru(page);
	}

	return ret;
}

Linus Torvalds's avatar
Linus Torvalds committed
848
849
850
851
852
853
854
855
856
857
858
859
860
861
/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
Andy Whitcroft's avatar
Andy Whitcroft committed
862
863
 * @order:	The caller's attempted allocation order
 * @mode:	One of the LRU isolation modes
864
 * @file:	True [1] if isolating file [!anon] pages
Linus Torvalds's avatar
Linus Torvalds committed
865
866
867
 *
 * returns how many pages were moved onto *@dst.
 */
868
869
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
870
		unsigned long *scanned, int order, int mode, int file)
Linus Torvalds's avatar
Linus Torvalds committed
871
{
872
	unsigned long nr_taken = 0;
873
	unsigned long scan;
Linus Torvalds's avatar
Linus Torvalds committed
874

875
	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
Andy Whitcroft's avatar
Andy Whitcroft committed
876
877
878
879
880
881
		struct page *page;
		unsigned long pfn;
		unsigned long end_pfn;
		unsigned long page_pfn;
		int zone_id;

Linus Torvalds's avatar
Linus Torvalds committed
882
883
884
		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

Nick Piggin's avatar
Nick Piggin committed
885
		VM_BUG_ON(!PageLRU(page));
Nick Piggin's avatar
Nick Piggin committed
886

887
		switch (__isolate_lru_page(page, mode, file)) {
Andy Whitcroft's avatar
Andy Whitcroft committed
888
889
		case 0:
			list_move(&page->lru, dst);
890
			nr_taken++;
Andy Whitcroft's avatar
Andy Whitcroft committed
891
892
893
894
895
896
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;
897

Andy Whitcroft's avatar
Andy Whitcroft committed
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
		default:
			BUG();
		}

		if (!order)
			continue;

		/*
		 * Attempt to take all pages in the order aligned region
		 * surrounding the tag page.  Only take those pages of
		 * the same active state as that tag page.  We may safely
		 * round the target page pfn down to the requested order
		 * as the mem_map is guarenteed valid out to MAX_ORDER,
		 * where that page is in a different zone we will detect
		 * it from its zone id and abort this block scan.
		 */
		zone_id = page_zone_id(page);
		page_pfn = page_to_pfn(page);
		pfn = page_pfn & ~((1 << order) - 1);
		end_pfn = pfn + (1 << order);
		for (; pfn < end_pfn; pfn++) {
			struct page *cursor_page;

			/* The target page is in the block, ignore it. */
			if (unlikely(pfn == page_pfn))
				continue;

			/* Avoid holes within the zone. */
			if (unlikely(!pfn_valid_within(pfn)))
				break;

			cursor_page = pfn_to_page(pfn);
930

Andy Whitcroft's avatar
Andy Whitcroft committed
931
932
933
			/* Check that we have not crossed a zone boundary. */
			if (unlikely(page_zone_id(cursor_page) != zone_id))
				continue;
934
			switch (__isolate_lru_page(cursor_page, mode, file)) {
Andy Whitcroft's avatar
Andy Whitcroft committed
935
936
937
938
939
940
941
942
943
944
			case 0:
				list_move(&cursor_page->lru, dst);
				nr_taken++;
				scan++;
				break;

			case -EBUSY:
				/* else it is being freed elsewhere */
				list_move(&cursor_page->lru, src);
			default:
945
				break;	/* ! on LRU or wrong list */
Andy Whitcroft's avatar
Andy Whitcroft committed
946
947
			}
		}
Linus Torvalds's avatar
Linus Torvalds committed
948
949
950
951
952
953
	}

	*scanned = scan;
	return nr_taken;
}

954
955
956
957
958
static unsigned long isolate_pages_global(unsigned long nr,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
959
					int active, int file)
960
{
961
	int lru = LRU_BASE;
962
	if (active)
963
964
965
966
967
		lru += LRU_ACTIVE;
	if (file)
		lru += LRU_FILE;
	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
								mode, !!file);
968
969
}

Andy Whitcroft's avatar
Andy Whitcroft committed
970
971
972
973
/*
 * clear_active_flags() is a helper for shrink_active_list(), clearing
 * any active bits from the pages in the list.
 */
974
975
static unsigned long clear_active_flags(struct list_head *page_list,
					unsigned int *count)
Andy Whitcroft's avatar
Andy Whitcroft committed
976
977
{
	int nr_active = 0;
978
	int lru;
Andy Whitcroft's avatar
Andy Whitcroft committed
979
980
	struct page *page;

981
982
	list_for_each_entry(page, page_list, lru) {
		lru = page_is_file_cache(page);
Andy Whitcroft's avatar
Andy Whitcroft committed
983
		if (PageActive(page)) {
984
			lru += LRU_ACTIVE;
Andy Whitcroft's avatar
Andy Whitcroft committed
985
986
987
			ClearPageActive(page);
			nr_active++;
		}
988
989
		count[lru]++;
	}
Andy Whitcroft's avatar
Andy Whitcroft committed
990
991
992
993

	return nr_active;
}

994
995
996
997
998
999
1000
1001
1002
1003
1004
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);

		spin_lock_irq(&zone->lru_lock);
		/* Re-check under the lock: the page may have been freed or
		 * pulled off the LRU between the unlocked test and here. */
		if (PageLRU(page) && get_page_unless_zero(page)) {
			int lru = page_lru(page);
			ret = 0;
			ClearPageLRU(page);

			del_page_from_lru_list(zone, page, lru);
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}

Linus Torvalds's avatar
Linus Torvalds committed
1039
/*
1040
1041
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
Linus Torvalds's avatar
Linus Torvalds committed
1042
 */
1043
static unsigned long shrink_inactive_list(unsigned long max_scan,
1044
1045
			struct zone *zone, struct scan_control *sc,
			int priority, int file)
Linus Torvalds's avatar
Linus Torvalds committed
1046
1047
1048
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
1049
	unsigned long nr_scanned = 0;
1050
	unsigned long nr_reclaimed = 0;
1051
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
Linus Torvalds's avatar
Linus Torvalds committed
1052
1053
1054
1055
1056

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
1057
	do {
Linus Torvalds's avatar
Linus Torvalds committed
1058
		struct page *page;
1059
1060
1061
		unsigned long nr_taken;
		unsigned long nr_scan;
		unsigned long nr_freed;
Andy Whitcroft's avatar
Andy Whitcroft committed
1062
		unsigned long nr_active;
1063
		unsigned int count[NR_LRU_LISTS] = { 0, };
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
		int mode = ISOLATE_INACTIVE;

		/*
		 * If we need a large contiguous chunk of memory, or have
		 * trouble getting a small set of contiguous pages, we
		 * will reclaim both active and inactive pages.
		 *
		 * We use the same threshold as pageout congestion_wait below.
		 */
		if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
			mode = ISOLATE_BOTH;
		else if (sc->order && priority < DEF_PRIORITY - 2)
			mode = ISOLATE_BOTH;
Linus Torvalds's avatar
Linus Torvalds committed
1077

1078
		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1079
1080
1081
			     &page_list, &nr_scan, sc->order, mode,
				zone, sc->mem_cgroup, 0, file);
		nr_active = clear_active_flags(&page_list, count);
1082
		__count_vm_events(PGDEACTIVATE, nr_active);
Andy Whitcroft's avatar
Andy Whitcroft committed
1083

1084
1085
1086
1087
1088
1089
1090
1091
1092
		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
						-count[LRU_ACTIVE_FILE]);
		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
						-count[LRU_INACTIVE_FILE]);
		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
						-count[LRU_ACTIVE_ANON]);
		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
						-count[LRU_INACTIVE_ANON]);

1093
		if (scanning_global_lru(sc))
1094
			zone->pages_scanned += nr_scan;
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
1095
1096
1097
1098
1099
1100

		reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
		reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
		reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
		reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];

Linus Torvalds's avatar
Linus Torvalds committed
1101
1102
		spin_unlock_irq(&zone->lru_lock);

1103
		nr_scanned += nr_scan;
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);

		/*
		 * If we are direct reclaiming for contiguous pages and we do
		 * not reclaim everything in the list, try again and wait
		 * for IO to complete. This will stall high-order allocations
		 * but that should be acceptable to the caller
		 */
		if (nr_freed < nr_taken && !current_is_kswapd() &&
					sc->order > PAGE_ALLOC_COSTLY_ORDER) {
			congestion_wait(WRITE, HZ/10);

			/*
			 * The attempt at page out may have made some
			 * of the pages active, mark them inactive again.
			 */
1120
			nr_active = clear_active_flags(&page_list, count);
1121
1122
1123
1124
1125
1126
			count_vm_events(PGDEACTIVATE, nr_active);

			nr_freed += shrink_page_list(&page_list, sc,
							PAGEOUT_IO_SYNC);
		}

1127
		nr_reclaimed += nr_freed;
Nick Piggin's avatar
Nick Piggin committed
1128
1129
		local_irq_disable();
		if (current_is_kswapd()) {
1130
1131
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
1132
		} else if (scanning_global_lru(sc))
1133
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1134

Shantanu Goel's avatar
Shantanu Goel committed
1135
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
Nick Piggin's avatar
Nick Piggin committed
1136

1137
1138
1139
		if (nr_taken == 0)
			goto done;

Nick Piggin's avatar
Nick Piggin committed
1140
		spin_lock(&zone->lru_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1141
1142
1143
1144
		/*
		 * Put back any unfreeable pages.
		 */
		while (!list_empty(&page_list)) {
1145
			int lru;
Linus Torvalds's avatar
Linus Torvalds committed
1146
			page = lru_to_page(&page_list);
Nick Piggin's avatar
Nick Piggin committed
1147
			VM_BUG_ON(PageLRU(page));
Linus Torvalds's avatar
Linus Torvalds committed
1148
			list_del(&page->lru);
1149
1150
1151
1152
1153
1154
1155
1156
1157
			if (unlikely(!page_evictable(page, NULL))) {
				spin_unlock_irq(&zone->lru_lock);
				putback_lru_page(page);
				spin_lock_irq(&zone->lru_lock);
				continue;
			}
			SetPageLRU(page);
			lru = page_lru(page);
			add_page_to_lru_list(zone, page, lru);
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
1158
			if (PageActive(page)) {
1159
				int file = !!page_is_file_cache(page);
1160
				reclaim_stat->recent_rotated[file]++;
1161
			}
Linus Torvalds's avatar
Linus Torvalds committed
1162
1163
1164
1165
1166
1167
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
1168
  	} while (nr_scanned < max_scan);
1169
	spin_unlock(&zone->lru_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1170
done:
1171
	local_irq_enable();
Linus Torvalds's avatar
Linus Torvalds committed
1172
	pagevec_release(&pvec);
1173
	return nr_reclaimed;
Linus Torvalds's avatar
Linus Torvalds committed
1174
1175
}

1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
/*
 * We are about to scan this zone at a certain priority level.  If that priority
 * level is smaller (ie: more urgent) than the previous priority, then note
 * that priority level within the zone.  This is done so that when the next
 * process comes in to scan this zone, it will immediately start out at this
 * priority level rather than having to build up its own scanning priority.
 * Here, this priority affects only the reclaim-mapped threshold.
 */
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
	if (priority < zone->prev_priority)
		zone->prev_priority = priority;
}

Linus Torvalds's avatar
Linus Torvalds committed
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
1207
1208


1209
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1210
			struct scan_control *sc, int priority, int file)
Linus Torvalds's avatar
Linus Torvalds committed
1211
{
1212
	unsigned long pgmoved;
Linus Torvalds's avatar
Linus Torvalds committed
1213
	int pgdeactivate = 0;
1214
	unsigned long pgscanned;
Linus Torvalds's avatar
Linus Torvalds committed
1215
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
1216
	LIST_HEAD(l_inactive);
Linus Torvalds's avatar
Linus Torvalds committed
1217
1218
	struct page *page;
	struct pagevec pvec;
1219
	enum lru_list lru;
1220
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
Linus Torvalds's avatar
Linus Torvalds committed
1221
1222
1223

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
1224
1225
	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
					ISOLATE_ACTIVE, zone,
1226
					sc->mem_cgroup, 1, file);
1227
1228
1229
1230
	/*
	 * zone->pages_scanned is used for detect zone's oom
	 * mem_cgroup remembers nr_scan by itself.
	 */
1231
	if (scanning_global_lru(sc)) {
1232
		zone->pages_scanned += pgscanned;
1233
	}
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
1234
	reclaim_stat->recent_scanned[!!file] += pgmoved;
1235

1236
1237
1238
1239
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
Linus Torvalds's avatar
Linus Torvalds committed
1240
1241
	spin_unlock_irq(&zone->lru_lock);

1242
	pgmoved = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1243
1244
1245
1246
	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);
1247

1248
1249
1250
1251
1252
		if (unlikely(!page_evictable(page, NULL))) {
			putback_lru_page(page);
			continue;
		}

1253
1254
1255
1256
1257
		/* page_referenced clears PageReferenced */
		if (page_mapping_inuse(page) &&
		    page_referenced(page, 0, sc->mem_cgroup))
			pgmoved++;

Linus Torvalds's avatar
Linus Torvalds committed
1258
1259
1260
		list_add(&page->lru, &l_inactive);
	}

1261
1262
1263
1264
1265
1266
	/*
	 * Move the pages to the [file or anon] inactive list.
	 */
	pagevec_init(&pvec, 1);
	lru = LRU_BASE + file * LRU_FILE;

1267
	spin_lock_irq(&zone->lru_lock);
1268
	/*
1269
1270
1271
1272
1273
	 * Count referenced pages from currently used mappings as
	 * rotated, even though they are moved to the inactive list.
	 * This helps balance scan pressure between file and anonymous
	 * pages in get_scan_ratio.
	 */
KOSAKI Motohiro's avatar
KOSAKI Motohiro committed
1274
	reclaim_stat->recent_rotated[!!file] += pgmoved;
1275

1276
	pgmoved = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1277
1278
1279
	while (!list_empty(&l_inactive)) {
		page = lru_to_page(&l_inactive);
		prefetchw_prev_lru_page(page, &l_inactive, flags);
Nick Piggin's avatar
Nick Piggin committed
1280
		VM_BUG_ON(PageLRU(page));
Nick Piggin's avatar
Nick Piggin committed
1281
		SetPageLRU(page);
Nick Piggin's avatar
Nick Piggin committed
1282
		VM_BUG_ON(!PageActive(page));
1283
1284
		ClearPageActive(page);

1285
		list_move(&page->lru, &zone->lru[lru].list);
KAMEZAWA Hiroyuki's avatar
KAMEZAWA Hiroyuki committed
1286
		mem_cgroup_add_lru_list(page, lru);
Linus Torvalds's avatar
Linus Torvalds committed
1287
1288
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
1289
			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
Linus Torvalds's avatar
Linus Torvalds committed
1290
1291
1292
1293
1294
1295
1296
1297
1298
			spin_unlock_irq(&zone->lru_lock);
			pgdeactivate += pgmoved;
			pgmoved = 0;
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
1299
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
Linus Torvalds's avatar
Linus Torvalds committed
1300
1301
1302
1303
1304
1305
	pgdeactivate += pgmoved;
	if (buffer_heads_over_limit) {
		spin_unlock_irq(&zone->lru_lock);
		pagevec_strip(&pvec);
		spin_lock_irq(&zone->lru_lock);
	}
1306
1307
1308
	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	__count_vm_events(PGDEACTIVATE, pgdeactivate);
	spin_unlock_irq(&zone->lru_lock);
1309
1310
	if (vm_swap_full())
		pagevec_swap_free(&pvec);
Linus Torvalds's avatar
Linus Torvalds committed
1311

Nick Piggin's avatar
Nick Piggin committed
1312
	pagevec_release(&pvec);
Linus Torvalds's avatar
Linus Torvalds committed
1313
1314
}

1315
static int inactive_anon_is_low_global(struct zone *zone)
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
{
	unsigned long active, inactive;

	active = zone_page_state(zone, NR_ACTIVE_ANON);
	inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	if (inactive * zone->inactive_ratio < active)
		return 1;

	return 0;
}

1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
	int low;

1340
	if (scanning_global_lru(sc))
1341
1342
		low = inactive_anon_is_low_global(zone);
	else
1343
		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1344
1345
1346
	return low;
}

1347
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1348
1349
	struct zone *zone, struct scan_control *sc, int priority)
{
1350
1351
	int file = is_file_lru(lru);

1352
1353
1354
1355
1356
	if (lru == LRU_ACTIVE_FILE) {
		shrink_active_list(nr_to_scan, zone, sc, priority, file);
		return 0;
	}

1357
	if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1358
		shrink_active_list(nr_to_scan, zone, sc, priority, file);
1359
1360
		return 0;
	}
1361
	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
}

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * percent[0] specifies how much pressure to put on ram/swap backed
 * memory, while percent[1] determines pressure on the file LRUs.
 */
static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
					unsigned long *percent)
{
	unsigned long anon, file, free;
	unsigned long anon_prio, file_prio;
	unsigned long ap, fp;
1379
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1380
1381
1382
1383
1384
1385
1386
1387

	/* If we have no swap space, do not bother scanning anon pages. */
	if (nr_swap_pages <= 0) {