/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
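
/*
 * The same knob is exposed at run time as /proc/sys/kernel/randomize_va_space:
 * 0 disables randomization, 1 randomizes stack/VDSO/mmap placement, and 2
 * additionally randomizes the heap (brk) base.  See
 * Documentation/sysctl/kernel.txt.
 */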

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

EXPORT_SYMBOL(zero_pfn);

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);
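
/*
 * zero_pfn is the frame number of the shared ZERO_PAGE: read faults on
 * anonymous mappings can map it read-only, so private memory is only
 * allocated when a page is first written to.
 */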


#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */
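
/*
 * Usage sketch for the counters above: a fault path typically bumps the
 * cheap per-task counter with inc_mm_counter_fast(mm, MM_ANONPAGES) (or
 * MM_FILEPAGES) and then calls check_sync_rss_stat(current), which folds
 * the cached deltas back into mm->rss_stat once TASK_RSS_EVENTS_THRESH
 * events have accumulated.
 */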

#ifdef HAVE_GENERIC_MMU_GATHER

static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @fullmm argument is used when @mm is without
 *	users and we're going to destroy the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
	tlb->mm = mm;

	/* Is it from 0 to ~0? */
	tlb->fullmm     = !(start | (end+1));
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
	tlb->page_size = 0;

	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	if (!tlb->end)
		return;

	tlb_flush(tlb);
	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif
	__tlb_reset_range(tlb);
}

static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}
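
/*
 * Typical calling sequence for the generic mmu_gather machinery (a sketch;
 * the unmap helpers live elsewhere in mm/):
 *
 *	struct mmu_gather tlb;
 *
 *	tlb_gather_mmu(&tlb, mm, start, end);
 *	unmap_vmas(&tlb, vma, start, end);
 *	free_pgtables(&tlb, vma, floor, ceiling);
 *	tlb_finish_mmu(&tlb, start, end);
 */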

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *	handling the additional races in SMP caused by other CPUs caching valid
 *	mappings in their TLBs. Returns the number of free page slots left.
 *	When out of page slots we must call tlb_flush_mmu().
 *	Returns true if the caller should flush.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);
	VM_WARN_ON(tlb->page_size != page_size);

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max, page);

	return false;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	/*
	 * When there's less than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}
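
/*
 * In short: page-table pages handed to tlb_remove_table() are batched (up
 * to MAX_TABLE_BATCH per batch page) and freed from an RCU callback, so
 * lockless walkers that merely disable interrupts keep seeing valid tables
 * until a grace period, or the IPI fallback above, has completed.
 */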

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	atomic_long_dec(&tlb->mm->nr_ptes);
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;
	/*
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}
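
/*
 * Worked example for the "- 1" comparisons above: end == 0 and ceiling == 0
 * both mean "top of the address space", so comparing "end - 1" against
 * "ceiling - 1" (which wraps 0 to ~0UL) treats them as the largest possible
 * values rather than the smallest, and a ceiling of 0 never prevents a
 * page table from being freed.
 */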

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		 current->comm,
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
		 mapping ? mapping->a_ops->readpage : NULL);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;

check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
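
/*
 * Illustrative sketch of the remap_pfn_range() linearity rule documented
 * above vm_normal_page(); the helper name is hypothetical and nothing in
 * this file uses it.  In a COW VM_PFNMAP mapping without pte_special, a
 * pfn that still satisfies this rule is treated as special (no struct page).
 */
static inline bool pfn_matches_linear_map(struct vm_area_struct *vma,
					  unsigned long addr, unsigned long pfn)
{
	unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

	return pfn == vma->vm_pgoff + off;
}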

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd)
{
	unsigned long pfn = pmd_pfn(pmd);

	/*
	 * There is no pmd_special() but there may be special pmds, e.g.
	 * in a direct-access (dax) mapping, so let's just replicate the
	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
	 */
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}
#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (likely(!non_swap_entry(entry))) {
			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
							&src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			rss[MM_SWAPENTS]++;
		} else if (is_migration_entry(entry)) {
			page = migration_entry_to_page(entry);

			rss[mm_counter(page)]++;

			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(*src_pte))
					pte = pte_swp_mksoft_dirty(pte);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, false);
		rss[mm_counter(page)]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
	return 0;
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
{
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};

again:
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap(orig_src_pte);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
			int err;
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	bool is_cow;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
			!vma->anon_vma)
		return 0;

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(vma->vm_flags);
	mmun_start = addr;
	mmun_end   = end;
	if (is_cow)
		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
						    mmun_end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}
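
/*
 * Context: copy_page_range() is called from the fork() path (dup_mmap())
 * once per vma, so the early return above for file-backed mappings without
 * an anon_vma is what keeps fork() cheap; those pages are simply refaulted
 * in the child.
 */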

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	swp_entry_t entry;

	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page_rmapping(page))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;

			if (!PageAnon(page)) {
				if (pte_dirty(ptent)) {
					/*
					 * oom_reaper cannot tear down dirty
					 * pages
					 */
					if (unlikely(details && details->ignore_dirty))
						continue;
					force_flush = 1;
					set_page_dirty(page);
				}
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
			}
			rss[mm_counter(page)]--;
			page_remove_rmap(page, false);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			if (unlikely(__tlb_remove_page(tlb, page))) {
				force_flush = 1;
				addr += PAGE_SIZE;
				break;
			}
			continue;
		}
		/* only check swap_entries if explicitly asked for in details */
		if (unlikely(details && !details->check_swap_entries))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (!non_swap_entry(entry))
			rss[MM_SWAPENTS]--;
		else if (is_migration_entry(entry)) {
			struct page *page;

			page = migration_entry_to_page(entry);
			rss[mm_counter(page)]--;
		}
		if (unlikely(!free_swap_and_cache(entry)))
			print_bad_pte(vma, addr, ptent, NULL);
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu_free(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,