/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;

	int swappiness;

	int all_unreclaimable;

	int order;

	/* Which cgroup do we reclaim from */
	struct mem_cgroup *mem_cgroup;

	/* Pluggable isolate pages callback */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active, int file);
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scan_global_lru(sc)	(!(sc)->mem_cgroup)
#else
#define scan_global_lru(sc)	(1)
#endif

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem))
		return 1;	/* Assume we'll be able to shrink next time */

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);

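		/*
		 * The scan target is proportional to the number of LRU pages
		 * scanned and to this cache's object count (max_pass), and
		 * inversely proportional to the shrinker's seek cost and to
		 * the total number of LRU pages.
		 */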
		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
			printk(KERN_ERR "%s: nr=%ld\n",
					__func__, shrinker->nr);
			shrinker->nr = max_pass;
		}

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimate number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrink)(0, gfp_mask);
			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}

		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *mapping;

	/* Page is in somebody's page tables. */
	if (page_mapped(page))
		return 1;

	/* Be more reluctant to reclaim swapcache than pagecache */
	if (PageSwapCache(page))
		return 1;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	/* File is mmap'd by somebody? */
	return mapping_mapped(mapping);
}

static inline int is_page_cache_freeable(struct page *page)
{
	return page_count(page) - !!PagePrivate(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* Request for sync pageout. */
enum pageout_io {
	PAGEOUT_IO_ASYNC,
	PAGEOUT_IO_SYNC,
};

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
						enum pageout_io sync_writeback)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 * See swapfile.c:page_queue_congested().
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (PagePrivate(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		/*
		 * Wait on writeback if requested to. This happens when
		 * direct reclaiming a large contiguous area and the
		 * first attempt to free a range of pages fails.
		 */
		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
			wait_on_page_writeback(page);

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
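	/* Freeze the refcount at 2: the caller's reference plus the page cache's. */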
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swap_free(swap);
	} else {
		__remove_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		page_unfreeze_refs(page, 1);
		return 1;
	}
	return 0;
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
					struct scan_control *sc,
					enum pageout_io sync_writeback)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	unsigned long nr_reclaimed = 0;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));

		sc->nr_scanned++;

		if (!sc->may_swap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		if (PageWriteback(page)) {
			/*
			 * Synchronous reclaim is performed in two passes,
			 * first an asynchronous pass over the list to
			 * start parallel writeback, and a second synchronous
			 * pass to wait for the IO to complete.  Wait here
			 * for any page for which writeback has already
			 * started.
			 */
			if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
				wait_on_page_writeback(page);
			else
				goto keep_locked;
		}

		referenced = page_referenced(page, 1, sc->mem_cgroup);
		/* In active use or really unfreeable?  Activate it. */
		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
					referenced && page_mapping_inuse(page))
			goto activate_locked;

#ifdef CONFIG_SWAP
		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page))
			if (!add_to_swap(page, GFP_ATOMIC))
				goto activate_locked;
#endif /* CONFIG_SWAP */

		mapping = page_mapping(page);

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, 0)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sync_writeback)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page) || PageDirty(page))
					goto keep;
				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping. 
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (PagePrivate(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		unlock_page(page);
free_it:
		nr_reclaimed++;
		if (!pagevec_add(&freed_pvec, page)) {
			__pagevec_free(&freed_pvec);
			pagevec_reinit(&freed_pvec);
		}
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			remove_exclusive_swap_page_ref(page);
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page));
	}
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_free(&freed_pvec);
	count_vm_events(PGACTIVATE, pgactivate);
	return nr_reclaimed;
}

/* LRU Isolation modes. */
#define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
#define ISOLATE_ACTIVE 1	/* Isolate active pages. */
#define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, int mode, int file)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/*
	 * When checking the active state, we need to be sure we are
	 * dealing with comparable boolean values.  Take the logical not
	 * of each.
	 */
	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
		return ret;

	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
		return ret;

	ret = -EBUSY;
	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 * @order:	The caller's attempted allocation order
 * @mode:	One of the LRU isolation modes
 * @file:	True [1] if isolating file [!anon] pages
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
		unsigned long *scanned, int order, int mode, int file)
{
	unsigned long nr_taken = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		unsigned long pfn;
		unsigned long end_pfn;
		unsigned long page_pfn;
		int zone_id;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode, file)) {
		case 0:
			list_move(&page->lru, dst);
			nr_taken++;
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}

		if (!order)
			continue;

		/*
		 * Attempt to take all pages in the order aligned region
		 * surrounding the tag page.  Only take those pages of
		 * the same active state as that tag page.  We may safely
		 * round the target page pfn down to the requested order
		 * as the mem_map is guaranteed valid out to MAX_ORDER,
		 * where that page is in a different zone we will detect
		 * it from its zone id and abort this block scan.
		 */
		zone_id = page_zone_id(page);
		page_pfn = page_to_pfn(page);
		pfn = page_pfn & ~((1 << order) - 1);
		end_pfn = pfn + (1 << order);
		for (; pfn < end_pfn; pfn++) {
			struct page *cursor_page;

			/* The target page is in the block, ignore it. */
			if (unlikely(pfn == page_pfn))
				continue;

			/* Avoid holes within the zone. */
			if (unlikely(!pfn_valid_within(pfn)))
				break;

			cursor_page = pfn_to_page(pfn);

			/* Check that we have not crossed a zone boundary. */
			if (unlikely(page_zone_id(cursor_page) != zone_id))
				continue;
			switch (__isolate_lru_page(cursor_page, mode, file)) {
			case 0:
				list_move(&cursor_page->lru, dst);
				nr_taken++;
				scan++;
				break;

			case -EBUSY:
				/* else it is being freed elsewhere */
				list_move(&cursor_page->lru, src);
			default:
				break;
			}
		}
	}

	*scanned = scan;
	return nr_taken;
}

static unsigned long isolate_pages_global(unsigned long nr,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
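	/* Pick the global LRU list matching the requested active/file state. */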
	int lru = LRU_BASE;
	if (active)
		lru += LRU_ACTIVE;
	if (file)
		lru += LRU_FILE;
	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
								mode, !!file);
}

/*
 * clear_active_flags() is a helper for shrink_active_list(), clearing
 * any active bits from the pages in the list.
 */
static unsigned long clear_active_flags(struct list_head *page_list,
					unsigned int *count)
{
	int nr_active = 0;
	int lru;
	struct page *page;

	list_for_each_entry(page, page_list, lru) {
		lru = page_is_file_cache(page);
		if (PageActive(page)) {
			lru += LRU_ACTIVE;
			ClearPageActive(page);
			nr_active++;
		}
		count[lru]++;
	}

	return nr_active;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  That flag may need
 * to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);

		spin_lock_irq(&zone->lru_lock);
		if (PageLRU(page) && get_page_unless_zero(page)) {
			int lru = LRU_BASE;
			ret = 0;
			ClearPageLRU(page);

			lru += page_is_file_cache(page) + !!PageActive(page);
			del_page_from_lru_list(zone, page, lru);
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static unsigned long shrink_inactive_list(unsigned long max_scan,
			struct zone *zone, struct scan_control *sc, int file)
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
	unsigned long nr_scanned = 0;
	unsigned long nr_reclaimed = 0;

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
		struct page *page;
		unsigned long nr_taken;
		unsigned long nr_scan;
		unsigned long nr_freed;
		unsigned long nr_active;
		unsigned int count[NR_LRU_LISTS] = { 0, };
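		/*
		 * For order > PAGE_ALLOC_COSTLY_ORDER, isolate active pages as
		 * well (lumpy reclaim) so that a contiguous block of pages can
		 * be freed.
		 */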
		int mode = (sc->order > PAGE_ALLOC_COSTLY_ORDER) ?
					ISOLATE_BOTH : ISOLATE_INACTIVE;

		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
			     &page_list, &nr_scan, sc->order, mode,
				zone, sc->mem_cgroup, 0, file);
		nr_active = clear_active_flags(&page_list, count);
		__count_vm_events(PGDEACTIVATE, nr_active);

		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
						-count[LRU_ACTIVE_FILE]);
		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
						-count[LRU_INACTIVE_FILE]);
		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
						-count[LRU_ACTIVE_ANON]);
		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
						-count[LRU_INACTIVE_ANON]);

		if (scan_global_lru(sc)) {
			zone->pages_scanned += nr_scan;
			zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
			zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
			zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
			zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
		}
		spin_unlock_irq(&zone->lru_lock);

		nr_scanned += nr_scan;
		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);

		/*
		 * If we are direct reclaiming for contiguous pages and we do
		 * not reclaim everything in the list, try again and wait
		 * for IO to complete. This will stall high-order allocations
		 * but that should be acceptable to the caller
		 */
		if (nr_freed < nr_taken && !current_is_kswapd() &&
					sc->order > PAGE_ALLOC_COSTLY_ORDER) {
			congestion_wait(WRITE, HZ/10);

			/*
			 * The attempt at page out may have made some
			 * of the pages active, mark them inactive again.
			 */
			nr_active = clear_active_flags(&page_list, count);
			count_vm_events(PGDEACTIVATE, nr_active);

			nr_freed += shrink_page_list(&page_list, sc,
							PAGEOUT_IO_SYNC);
		}

		nr_reclaimed += nr_freed;
		local_irq_disable();
		if (current_is_kswapd()) {
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
		} else if (scan_global_lru(sc))
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);

		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		if (nr_taken == 0)
			goto done;

		spin_lock(&zone->lru_lock);
		/*
		 * Put back any unfreeable pages.
		 */
		while (!list_empty(&page_list)) {
			page = lru_to_page(&page_list);
			VM_BUG_ON(PageLRU(page));
			SetPageLRU(page);
			list_del(&page->lru);
			add_page_to_lru_list(zone, page, page_lru(page));
			if (PageActive(page) && scan_global_lru(sc)) {
				int file = !!page_is_file_cache(page);
				zone->recent_rotated[file]++;
			}
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
  	} while (nr_scanned < max_scan);
	spin_unlock(&zone->lru_lock);
done:
	local_irq_enable();
	pagevec_release(&pvec);
	return nr_reclaimed;
}

/*
 * We are about to scan this zone at a certain priority level.  If that priority
 * level is smaller (ie: more urgent) than the previous priority, then note
 * that priority level within the zone.  This is done so that when the next
 * process comes in to scan this zone, it will immediately start out at this
 * priority level rather than having to build up its own scanning priority.
 * Here, this priority affects only the reclaim-mapped threshold.
 */
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
	if (priority < zone->prev_priority)
		zone->prev_priority = priority;
}

static inline int zone_is_near_oom(struct zone *zone)
{
	return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */


static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
			struct scan_control *sc, int priority, int file)
{
	unsigned long pgmoved;
	int pgdeactivate = 0;
	unsigned long pgscanned;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_inactive);
	struct page *page;
	struct pagevec pvec;
	enum lru_list lru;

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
					ISOLATE_ACTIVE, zone,
					sc->mem_cgroup, 1, file);
	/*
	 * zone->pages_scanned is used to detect the zone's oom;
	 * mem_cgroup remembers nr_scan by itself.
	 */
	if (scan_global_lru(sc)) {
		zone->pages_scanned += pgscanned;
		zone->recent_scanned[!!file] += pgmoved;
	}

	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
	spin_unlock_irq(&zone->lru_lock);

	pgmoved = 0;
	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		/* page_referenced clears PageReferenced */
		if (page_mapping_inuse(page) &&
		    page_referenced(page, 0, sc->mem_cgroup))
			pgmoved++;

		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Count referenced pages from currently used mappings as
	 * rotated, even though they are moved to the inactive list.
	 * This helps balance scan pressure between file and anonymous
	 * pages in get_scan_ratio.
	 */
	zone->recent_rotated[!!file] += pgmoved;

	/*
	 * Move the pages to the [file or anon] inactive list.
	 */
	pagevec_init(&pvec, 1);

	pgmoved = 0;
	lru = LRU_BASE + file * LRU_FILE;
	spin_lock_irq(&zone->lru_lock);
	while (!list_empty(&l_inactive)) {
		page = lru_to_page(&l_inactive);
		prefetchw_prev_lru_page(page, &l_inactive, flags);
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(!PageActive(page));
		ClearPageActive(page);

		list_move(&page->lru, &zone->lru[lru].list);
		mem_cgroup_move_lists(page, false);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
			spin_unlock_irq(&zone->lru_lock);
			pgdeactivate += pgmoved;
			pgmoved = 0;
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	pgdeactivate += pgmoved;
	if (buffer_heads_over_limit) {
		spin_unlock_irq(&zone->lru_lock);
		pagevec_strip(&pvec);
		spin_lock_irq(&zone->lru_lock);
	}
	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	__count_vm_events(PGDEACTIVATE, pgdeactivate);
	spin_unlock_irq(&zone->lru_lock);
	if (vm_swap_full())
		pagevec_swap_free(&pvec);

	pagevec_release(&pvec);
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
	struct zone *zone, struct scan_control *sc, int priority)
{
	int file = is_file_lru(lru);

	if (lru == LRU_ACTIVE_FILE) {
		shrink_active_list(nr_to_scan, zone, sc, priority, file);
		return 0;
	}

	if (lru == LRU_ACTIVE_ANON &&
	    (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
		shrink_active_list(nr_to_scan, zone, sc, priority, file);
		return 0;
	}
	return shrink_inactive_list(nr_to_scan, zone, sc, file);
}

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * percent[0] specifies how much pressure to put on ram/swap backed
 * memory, while percent[1] determines pressure on the file LRUs.
 */
static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
					unsigned long *percent)
{
	unsigned long anon, file, free;
	unsigned long anon_prio, file_prio;
	unsigned long ap, fp;

	anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
		zone_page_state(zone, NR_INACTIVE_ANON);
	file  = zone_page_state(zone, NR_ACTIVE_FILE) +
		zone_page_state(zone, NR_INACTIVE_FILE);
	free  = zone_page_state(zone, NR_FREE_PAGES);

	/* If we have no swap space, do not bother scanning anon pages. */
	if (nr_swap_pages <= 0) {
		percent[0] = 0;
		percent[1] = 100;
		return;
	}

	/* If we have very few page cache pages, force-scan anon pages. */
	if (unlikely(file + free <= zone->pages_high)) {
		percent[0] = 100;
		percent[1] = 0;
		return;
	}

	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
	if (unlikely(zone->recent_scanned[0] > anon / 4)) {
		spin_lock_irq(&zone->lru_lock);
		zone->recent_scanned[0] /= 2;
		zone->recent_rotated[0] /= 2;
		spin_unlock_irq(&zone->lru_lock);
	}

	if (unlikely(zone->recent_scanned[1] > file / 4)) {
		spin_lock_irq(&zone->lru_lock);
		zone->recent_scanned[1] /= 2;
		zone->recent_rotated[1] /= 2;
		spin_unlock_irq(&zone->lru_lock);
	}

	/*
	 * With swappiness at 100, anonymous and file have the same priority.
	 * This scanning priority is essentially the inverse of IO cost.
	 */
	anon_prio = sc->swappiness;
	file_prio = 200 - sc->swappiness;

	/*
	 *                  anon       recent_rotated[0]
	 * %anon = 100 * ----------- / ----------------- * IO cost
	 *               anon + file      rotate_sum
	 */
	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
	ap /= zone->recent_rotated[0] + 1;

	fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
	fp /= zone->recent_rotated[1] + 1;

	/* Normalize to percentages */
	percent[0] = 100 * ap / (ap + fp + 1);
	percent[1] = 100 - percent[0];
}


/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static unsigned long shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	unsigned long nr_reclaimed = 0;
	unsigned long percent[2];	/* anon @ 0; file @ 1 */
	enum lru_list l;

	get_scan_ratio(zone, sc, percent);

	for_each_lru(l) {
		if (scan_global_lru(sc)) {
			int file = is_file_lru(l);
			int scan;
			/*
			 * Add one to nr_to_scan just to make sure that the
			 * kernel will slowly sift through each list.
			 */
			scan = zone_page_state(zone, NR_LRU_BASE + l);
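			/*
			 * Scan list_size >> priority pages, weighted by the
			 * anon/file percentages from get_scan_ratio().
			 */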
			if (priority) {
				scan >>= priority;
				scan = (scan * percent[file]) / 100;
			}
			zone->lru[l].nr_scan += scan + 1;
			nr[l] = zone->lru[l].nr_scan;
			if (nr[l] >= sc->swap_cluster_max)
				zone->lru[l].nr_scan = 0;
			else
				nr[l] = 0;
		} else {
			/*
			 * This reclaim occurs not because of a zone memory shortage
			 * but because memory controller hits its limit.
			 * Don't modify zone reclaim related data.
			 */
			nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
								priority, l);
		}
	}

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		for_each_lru(l) {
			if (nr[l]) {
				nr_to_scan = min(nr[l],
					(unsigned long)sc->swap_cluster_max);
				nr[l] -= nr_to_scan;

				nr_reclaimed += shrink_list(l, nr_to_scan,
							zone, sc, priority);
			}
		}
	}

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
	else if (!scan_global_lru(sc))
		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

	throttle_vm_writeout(sc->gfp_mask);
	return nr_reclaimed;
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.  Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The zones may be over pages_high but they must go *over* pages_high to
 *    satisfy the `incremental min' zone defense algorithm.
 *
 * Returns the number of reclaimed pages.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
					struct scan_control *sc)
{
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
	unsigned long nr_reclaimed = 0;
	struct zoneref *z;
	struct zone *zone;

	sc->all_unreclaimable = 1;
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
		if (!populated_zone(zone))
			continue;
		/*
		 * Take care that memory controller reclaim has only a small
		 * influence on the global LRU.
		 */
		if (scan_global_lru(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			note_zone_scanning_priority(zone, priority);

			if (zone_is_all_unreclaimable(zone) &&
						priority != DEF_PRIORITY)
				continue;	/* Let kswapd poll it */
			sc->all_unreclaimable = 0;
		} else {
			/*
			 * Ignore cpuset limitation here. We just want to reduce
			 * # of used pages by us regardless of memory shortage.
			 */
			sc->all_unreclaimable = 0;
			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
							priority);
		}

		nr_reclaimed += shrink_zone(priority, zone, sc);
	}

	return nr_reclaimed;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick pdflush and take explicit naps in the
 * hope that some of these pages can be written.  But if the allocating task
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc)
{
	int priority;
	unsigned long ret = 0;
	unsigned long total_scanned = 0;
	unsigned long nr_reclaimed = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long lru_pages = 0;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);

	delayacct_freepages_start();

	if (scan_global_lru(sc))
		count_vm_event(ALLOCSTALL);
	/*
	 * mem_cgroup will not do shrink_slab.
	 */
	if (scan_global_lru(sc)) {
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {

			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
