// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

9
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
10
11
#include <linux/mman.h>
#include <linux/mm.h>
12
#include <linux/sched/user.h>
13
14
15
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
16
#include <linux/pagevec.h>
Linus Torvalds's avatar
Linus Torvalds committed
17
18
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
Alexey Dobriyan's avatar
Alexey Dobriyan committed
19
#include <linux/sched.h>
20
#include <linux/export.h>
21
22
23
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
24
25
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
26
27

#include "internal.h"
Linus Torvalds's avatar
Linus Torvalds committed
28

29
bool can_do_mlock(void)
Alexey Dobriyan's avatar
Alexey Dobriyan committed
30
{
Jiri Slaby's avatar
Jiri Slaby committed
31
	if (rlimit(RLIMIT_MEMLOCK) != 0)
32
		return true;
33
	if (capable(CAP_IPC_LOCK))
34
35
		return true;
	return false;
Alexey Dobriyan's avatar
Alexey Dobriyan committed
36
37
}
EXPORT_SYMBOL(can_do_mlock);
Linus Torvalds's avatar
Linus Torvalds committed
38

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 *
 * When lazy mlocking via vmscan, it is important to ensure that the
 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
 * may have mlocked a page that is being munlocked. So lazy mlock must take
 * the mmap_sem for read, and verify that the vma really is locked
 * (see mm/rmap.c).
 */

/*
 *  LRU accounting for clear_page_mlock()
 */
59
void clear_page_mlock(struct page *page)
60
{
61
	if (!TestClearPageMlocked(page))
62
63
		return;

David Rientjes's avatar
David Rientjes committed
64
65
	mod_zone_page_state(page_zone(page), NR_MLOCK,
			    -hpage_nr_pages(page));
Nick Piggin's avatar
Nick Piggin committed
66
	count_vm_event(UNEVICTABLE_PGCLEARED);
67
68
69
70
71
72
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
73
74
75
76
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
77
		 * We lost the race. the page already moved to evictable list.
78
		 */
79
		if (PageUnevictable(page))
Nick Piggin's avatar
Nick Piggin committed
80
			count_vm_event(UNEVICTABLE_PGSTRANDED);
81
82
83
84
85
86
87
88
89
	}
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU, isolate and putback to move to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
90
	/* Serialize with page migration */
91
92
	BUG_ON(!PageLocked(page));

93
94
95
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

Nick Piggin's avatar
Nick Piggin committed
96
	if (!TestSetPageMlocked(page)) {
David Rientjes's avatar
David Rientjes committed
97
98
		mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
Nick Piggin's avatar
Nick Piggin committed
99
100
101
102
		count_vm_event(UNEVICTABLE_PGMLOCKED);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
103
104
}

105
106
107
108
109
110
111
112
113
/*
 * Isolate a page from LRU with optional get_page() pin.
 * Assumes lru_lock already held and page already pinned.
 *
 * @page:    page to take off its LRU list
 * @getpage: when true, take an extra reference with get_page() first
 *
 * Returns true if the page was on an LRU list and has been removed from it,
 * false if PageLRU was not set (nothing is changed in that case).
 */
static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
{
	struct lruvec *lruvec;

	if (!PageLRU(page))
		return false;

	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (getpage)
		get_page(page);
	ClearPageLRU(page);
	del_page_from_lru_list(page, lruvec, page_lru(page));
	return true;
}

125
126
127
128
129
130
131
132
133
134
135
136
137
/*
 * Finish munlock after successful page isolation
 *
 * Page must be locked. This is a wrapper for try_to_munlock()
 * and putback_lru_page() with munlock accounting.
 */
static void __munlock_isolated_page(struct page *page)
{
	/*
	 * Optimization: if the page was mapped just once, that's our mapping
	 * and we don't need to check all the other vmas.
	 */
	if (page_mapcount(page) > 1)
138
		try_to_munlock(page);
139
140

	/* Did try_to_unlock() succeed or punt? */
141
	if (!PageMlocked(page))
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
		count_vm_event(UNEVICTABLE_PGMUNLOCKED);

	putback_lru_page(page);
}

/*
 * Accounting for page isolation fail during munlock
 *
 * Performs accounting when page isolation fails in munlock. There is nothing
 * else to do because it means some other task has already removed the page
 * from the LRU. putback_lru_page() will take care of removing the page from
 * the unevictable list, if necessary. vmscan [page_referenced()] will move
 * the page back to the unevictable list if some other vma has it mlocked.
 */
static void __munlock_isolation_failed(struct page *page)
{
	if (PageUnevictable(page))
159
		__count_vm_event(UNEVICTABLE_PGSTRANDED);
160
	else
161
		__count_vm_event(UNEVICTABLE_PGMUNLOCKED);
162
163
}

164
165
/**
 * munlock_vma_page - munlock a vma page
Mike Rapoport's avatar
Mike Rapoport committed
166
 * @page: page to be unlocked, either a normal page or THP page head
167
168
169
 *
 * returns the size of the page as a page mask (0 for normal page,
 *         HPAGE_PMD_NR - 1 for THP head page)
170
 *
171
172
173
174
175
176
177
178
179
180
 * called from munlock()/munmap() path with page supposedly on the LRU.
 * When we munlock a page, because the vma where we found the page is being
 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
 * page locked so that we can leave it on the unevictable lru list and not
 * bother vmscan with it.  However, to walk the page's rmap list in
 * try_to_munlock() we must isolate the page from the LRU.  If some other
 * task has removed the page from the LRU, we won't be able to do that.
 * So we clear the PageMlocked as we might not get another chance.  If we
 * can't isolate the page, we leave it for putback_lru_page() and vmscan
 * [page_referenced()/try_to_unmap()] to deal with.
181
 */
182
unsigned int munlock_vma_page(struct page *page)
183
{
Kirill A. Shutemov's avatar
Kirill A. Shutemov committed
184
	int nr_pages;
185
	struct zone *zone = page_zone(page);
186

187
	/* For try_to_munlock() and to serialize with page migration */
188
189
	BUG_ON(!PageLocked(page));

190
191
	VM_BUG_ON_PAGE(PageTail(page), page);

192
	/*
193
194
195
	 * Serialize with any parallel __split_huge_page_refcount() which
	 * might otherwise copy PageMlocked to part of the tail pages before
	 * we clear it in the head page. It also stabilizes hpage_nr_pages().
196
	 */
197
	spin_lock_irq(zone_lru_lock(zone));
198

199
200
201
	if (!TestClearPageMlocked(page)) {
		/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
		nr_pages = 1;
202
		goto unlock_out;
203
	}
204

205
	nr_pages = hpage_nr_pages(page);
206
207
208
	__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);

	if (__munlock_isolate_lru_page(page, true)) {
209
		spin_unlock_irq(zone_lru_lock(zone));
210
211
212
213
214
215
		__munlock_isolated_page(page);
		goto out;
	}
	__munlock_isolation_failed(page);

unlock_out:
216
	spin_unlock_irq(zone_lru_lock(zone));
217
218

out:
219
	return nr_pages - 1;
220
221
}

222
223
224
225
226
227
228
229
230
231
/*
 * convert get_user_pages() return value to posix mlock() error
 *
 * -EFAULT becomes -ENOMEM and -ENOMEM becomes -EAGAIN; every other value
 * is passed through unchanged.
 */
static int __mlock_posix_error_return(long retval)
{
	switch (retval) {
	case -EFAULT:
		return -ENOMEM;
	case -ENOMEM:
		return -EAGAIN;
	default:
		return retval;
	}
}

234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/*
 * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
 *
 * The fast path is available only for evictable pages with single mapping.
 * Then we can bypass the per-cpu pvec and get better performance.
 * when mapcount > 1 we need try_to_munlock() which can fail.
 * when !page_evictable(), we need the full redo logic of putback_lru_page to
 * avoid leaving evictable page in unevictable list.
 *
 * In case of success, @page is added to @pvec and @pgrescued is incremented
 * in case that the page was previously unevictable. @page is also unlocked.
 *
 * Returns true when the page was queued on @pvec, false when the caller has
 * to take the slow path (the page is then left locked and untouched).
 */
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
		int *pgrescued)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (page_mapcount(page) <= 1 && page_evictable(page)) {
		pagevec_add(pvec, page);
		if (TestClearPageUnevictable(page))
			(*pgrescued)++;
		unlock_page(page);
		return true;
	}

	return false;
}

/*
 * Batched LRU putback for pages that qualified for the fast path
 *
 * Puts every page in @pvec back on the LRU in one go, bypassing the per-cpu
 * pvec.  Some of the pages might have become unevictable in the meantime;
 * that is acceptable.
 *
 * @pvec:      pages to put back; each one is counted as munlocked
 * @pgrescued: number of those pages to count as rescued
 */
static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
{
	const unsigned int nr_munlocked = pagevec_count(pvec);

	count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_munlocked);

	/*
	 * No explicit put_page() here: __pagevec_lru_add() ends up calling
	 * release_pages() for us.
	 */
	__pagevec_lru_add(pvec);

	count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
}

280
281
282
283
284
285
286
287
/*
 * Munlock a batch of pages from the same zone
 *
 * The work is split to two main phases. First phase clears the Mlocked flag
 * and attempts to isolate the pages, all under a single zone lru lock.
 * The second phase finishes the munlock only for pages where isolation
 * succeeded.
 *
288
 * Note that the pagevec may be modified during the process.
289
290
291
292
293
 */
static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
	int i;
	int nr = pagevec_count(pvec);
294
	int delta_munlocked = -nr;
295
296
	struct pagevec pvec_putback;
	int pgrescued = 0;
297

298
	pagevec_init(&pvec_putback);
299

300
	/* Phase 1: page isolation */
301
	spin_lock_irq(zone_lru_lock(zone));
302
303
304
305
306
	for (i = 0; i < nr; i++) {
		struct page *page = pvec->pages[i];

		if (TestClearPageMlocked(page)) {
			/*
307
308
			 * We already have pin from follow_page_mask()
			 * so we can spare the get_page() here.
309
			 */
310
311
312
313
			if (__munlock_isolate_lru_page(page, false))
				continue;
			else
				__munlock_isolation_failed(page);
314
315
		} else {
			delta_munlocked++;
316
		}
317
318
319
320
321
322
323
324
325

		/*
		 * We won't be munlocking this page in the next phase
		 * but we still need to release the follow_page_mask()
		 * pin. We cannot do it under lru_lock however. If it's
		 * the last pin, __page_cache_release() would deadlock.
		 */
		pagevec_add(&pvec_putback, pvec->pages[i]);
		pvec->pages[i] = NULL;
326
	}
327
	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
328
	spin_unlock_irq(zone_lru_lock(zone));
329

330
331
332
	/* Now we can release pins of pages that we are not munlocking */
	pagevec_release(&pvec_putback);

333
	/* Phase 2: page munlock */
334
335
336
337
338
	for (i = 0; i < nr; i++) {
		struct page *page = pvec->pages[i];

		if (page) {
			lock_page(page);
339
340
			if (!__putback_lru_fast_prepare(page, &pvec_putback,
					&pgrescued)) {
341
342
343
344
345
				/*
				 * Slow path. We don't want to lose the last
				 * pin before unlock_page()
				 */
				get_page(page); /* for putback_lru_page() */
346
347
				__munlock_isolated_page(page);
				unlock_page(page);
348
				put_page(page); /* from follow_page_mask() */
349
			}
350
351
		}
	}
352

353
354
355
356
	/*
	 * Phase 3: page putback for pages that qualified for the fast path
	 * This will also call put_page() to return pin from follow_page_mask()
	 */
357
358
	if (pagevec_count(&pvec_putback))
		__putback_lru_fast(&pvec_putback, pgrescued);
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
}

/*
 * Fill up pagevec for __munlock_pagevec using pte walk
 *
 * The function expects that the struct page corresponding to @start address is
 * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
 *
 * The rest of @pvec is filled by subsequent pages within the same pmd and same
 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
 * pages also get pinned.
 *
 * Returns the address of the next page that should be scanned. This equals
 * @start + PAGE_SIZE when no page could be added by the pte walk.
 */
static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
375
376
			struct vm_area_struct *vma, struct zone *zone,
			unsigned long start, unsigned long end)
377
378
379
380
381
382
{
	pte_t *pte;
	spinlock_t *ptl;

	/*
	 * Initialize pte walk starting at the already pinned page where we
383
384
	 * are sure that there is a pte, as it was pinned under the same
	 * mmap_sem write op.
385
386
	 */
	pte = get_locked_pte(vma->vm_mm, start,	&ptl);
387
388
	/* Make sure we do not cross the page table boundary */
	end = pgd_addr_end(start, end);
389
	end = p4d_addr_end(start, end);
390
391
	end = pud_addr_end(start, end);
	end = pmd_addr_end(start, end);
392
393
394
395
396
397
398
399
400
401
402
403

	/* The page next to the pinned page is the first we will try to get */
	start += PAGE_SIZE;
	while (start < end) {
		struct page *page = NULL;
		pte++;
		if (pte_present(*pte))
			page = vm_normal_page(vma, start, *pte);
		/*
		 * Break if page could not be obtained or the page's node+zone does not
		 * match
		 */
404
		if (!page || page_zone(page) != zone)
405
			break;
406

407
408
409
410
411
412
413
		/*
		 * Do not use pagevec for PTE-mapped THP,
		 * munlock_vma_pages_range() will handle them.
		 */
		if (PageTransCompound(page))
			break;

414
415
416
417
418
419
420
421
422
423
424
		get_page(page);
		/*
		 * Increase the address that will be returned *before* the
		 * eventual break due to pvec becoming full by adding the page
		 */
		start += PAGE_SIZE;
		if (pagevec_add(pvec, page) == 0)
			break;
	}
	pte_unmap_unlock(pte, ptl);
	return start;
425
426
}

427
/*
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
 * munlock_vma_pages_range() - munlock all pages in the vma range.'
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 *  For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 *
 * We don't save and restore VM_LOCKED here because pages are
 * still on lru.  In unmap path, pages might be scanned by reclaim
 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
 * free them.  This will result in freeing mlocked pages.
444
 */
445
void munlock_vma_pages_range(struct vm_area_struct *vma,
Hugh Dickins's avatar
Hugh Dickins committed
446
			     unsigned long start, unsigned long end)
447
{
Eric B Munson's avatar
Eric B Munson committed
448
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
Hugh Dickins's avatar
Hugh Dickins committed
449

450
	while (start < end) {
451
		struct page *page;
452
		unsigned int page_mask = 0;
453
		unsigned long page_increm;
454
455
		struct pagevec pvec;
		struct zone *zone;
456

457
		pagevec_init(&pvec);
Hugh Dickins's avatar
Hugh Dickins committed
458
459
460
461
462
463
464
		/*
		 * Although FOLL_DUMP is intended for get_dump_page(),
		 * it just so happens that its special treatment of the
		 * ZERO_PAGE (returning an error instead of doing get_page)
		 * suits munlock very well (and if somehow an abnormal page
		 * has sneaked into the range, we won't oops here: great).
		 */
465
		page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
466

467
468
469
470
471
472
473
474
475
		if (page && !IS_ERR(page)) {
			if (PageTransTail(page)) {
				VM_BUG_ON_PAGE(PageMlocked(page), page);
				put_page(page); /* follow_page_mask() */
			} else if (PageTransHuge(page)) {
				lock_page(page);
				/*
				 * Any THP page found by follow_page_mask() may
				 * have gotten split before reaching
476
477
				 * munlock_vma_page(), so we need to compute
				 * the page_mask here instead.
478
479
480
481
482
483
484
485
486
487
488
489
				 */
				page_mask = munlock_vma_page(page);
				unlock_page(page);
				put_page(page); /* follow_page_mask() */
			} else {
				/*
				 * Non-huge pages are handled in batches via
				 * pagevec. The pin from follow_page_mask()
				 * prevents them from collapsing by THP.
				 */
				pagevec_add(&pvec, page);
				zone = page_zone(page);
490

491
492
493
494
495
496
497
				/*
				 * Try to fill the rest of pagevec using fast
				 * pte walk. This will also update start to
				 * the next page to process. Then munlock the
				 * pagevec.
				 */
				start = __munlock_pagevec_fill(&pvec, vma,
498
						zone, start, end);
499
500
501
				__munlock_pagevec(&pvec, zone);
				goto next;
			}
Hugh Dickins's avatar
Hugh Dickins committed
502
		}
503
		page_increm = 1 + page_mask;
504
		start += page_increm * PAGE_SIZE;
505
next:
Hugh Dickins's avatar
Hugh Dickins committed
506
507
		cond_resched();
	}
508
509
510
511
512
513
514
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
515
 * populate the ptes.
516
517
518
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
Linus Torvalds's avatar
Linus Torvalds committed
519
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
520
	unsigned long start, unsigned long end, vm_flags_t newflags)
Linus Torvalds's avatar
Linus Torvalds committed
521
{
522
	struct mm_struct *mm = vma->vm_mm;
Linus Torvalds's avatar
Linus Torvalds committed
523
	pgoff_t pgoff;
524
	int nr_pages;
Linus Torvalds's avatar
Linus Torvalds committed
525
	int ret = 0;
526
	int lock = !!(newflags & VM_LOCKED);
527
	vm_flags_t old_flags = vma->vm_flags;
Linus Torvalds's avatar
Linus Torvalds committed
528

529
	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
530
531
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma))
532
533
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;
534

Linus Torvalds's avatar
Linus Torvalds committed
535
536
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
537
538
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
Linus Torvalds's avatar
Linus Torvalds committed
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
557
558
559
560
561
562
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
563
564
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
565
566
	mm->locked_vm += nr_pages;

Linus Torvalds's avatar
Linus Torvalds committed
567
568
569
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
570
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
Linus Torvalds's avatar
Linus Torvalds committed
571
572
	 */

573
	if (lock)
Hugh Dickins's avatar
Hugh Dickins committed
574
		vma->vm_flags = newflags;
575
	else
Hugh Dickins's avatar
Hugh Dickins committed
576
		munlock_vma_pages_range(vma, start, end);
Linus Torvalds's avatar
Linus Torvalds committed
577
578

out:
579
	*prev = vma;
Linus Torvalds's avatar
Linus Torvalds committed
580
581
582
	return ret;
}

583
584
/*
 * Apply @flags (VM_LOCKED / VM_LOCKONFAULT bits, or 0 to unlock) to every
 * vma covering [@start, @start + @len) in current->mm.
 *
 * @start must be page aligned and @len a multiple of the page size.
 *
 * Returns 0 on success (including @len == 0), -EINVAL if @start + @len
 * wraps, -ENOMEM if the range is not fully covered by vmas, or the error
 * returned by mlock_fixup().
 */
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct * vma, * prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		/* mlock_fixup() may have merged: resume after what prev covers. */
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			/* Hole in the requested range. */
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

632
633
634
635
636
637
638
/*
 * Go through vma areas and sum size of mlocked
 * vma pages, as return value.
 * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
 * is also counted.
 * Return value: previously mlocked page counts
 */
639
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
640
641
642
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
643
	unsigned long count = 0;
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		vma = mm->mmap;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <=  vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

671
/*
 * Common implementation of mlock() and mlock2().
 *
 * Rounds the request out to whole pages, checks it against RLIMIT_MEMLOCK
 * (skipped for CAP_IPC_LOCK), applies @flags to the covered vmas under
 * mmap_sem held for write and finally populates the range.
 *
 * Returns 0 on success, -EPERM if can_do_mlock() refuses, -EINTR if the
 * mmap_sem wait is killed, -ENOMEM if the limit would be exceeded, the
 * error from apply_vma_lock_flags(), or the __mm_populate() error converted
 * by __mlock_posix_error_return().
 */
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the regions requested intersect with
		 * previously mlocked areas, that part area in "mm->locked_vm"
		 * should not be counted to new mlock increment count. So check
		 * and adjust locked count if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	up_write(&current->mm->mmap_sem);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

716
717
718
719
720
/*
 * mlock(2) entry point: forwards @start and @len to do_mlock(), asking for
 * VM_LOCKED only.
 */
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	const vm_flags_t lock_flags = VM_LOCKED;

	return do_mlock(start, len, lock_flags);
}

721
722
/*
 * mlock2(2) entry point.
 *
 * MLOCK_ONFAULT is the only flag accepted; any other bit yields -EINVAL.
 * The range is locked with VM_LOCKED, plus VM_LOCKONFAULT when
 * MLOCK_ONFAULT was requested.
 */
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

734
/*
 * munlock(2) entry point.
 *
 * Rounds the request out to whole pages and clears the lock flags on the
 * covered vmas with mmap_sem held for write.
 *
 * Returns the apply_vma_lock_flags() result, or -EINTR if the mmap_sem
 * wait is killed.
 */
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	up_write(&current->mm->mmap_sem);

	return ret;
}

749
750
751
752
753
754
755
756
757
758
/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
759
static int apply_mlockall_flags(int flags)
Linus Torvalds's avatar
Linus Torvalds committed
760
761
{
	struct vm_area_struct * vma, * prev = NULL;
762
	vm_flags_t to_add = 0;
Linus Torvalds's avatar
Linus Torvalds committed
763

764
765
	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
766
		current->mm->def_flags |= VM_LOCKED;
767

768
769
770
771
772
773
774
775
776
777
778
779
		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}
Linus Torvalds's avatar
Linus Torvalds committed
780
781

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
782
		vm_flags_t newflags;
Linus Torvalds's avatar
Linus Torvalds committed
783

784
785
		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;
Linus Torvalds's avatar
Linus Torvalds committed
786
787
788

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
789
		cond_resched();
Linus Torvalds's avatar
Linus Torvalds committed
790
791
792
793
794
	}
out:
	return 0;
}

795
/*
 * mlockall(2) entry point.
 *
 * @flags must be a non-empty combination of MCL_CURRENT, MCL_FUTURE and
 * MCL_ONFAULT.  MCL_ONFAULT on its own is rejected: it only modifies
 * MCL_CURRENT/MCL_FUTURE, and letting it through would make
 * apply_mlockall_flags() clear every existing lock without locking anything.
 *
 * Returns 0 on success, -EINVAL for bad flags, -EPERM if can_do_mlock()
 * refuses, -EINTR if the mmap_sem wait is killed, and -ENOMEM when
 * MCL_CURRENT is requested, total_vm exceeds RLIMIT_MEMLOCK and the caller
 * lacks CAP_IPC_LOCK.
 */
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	up_write(&current->mm->mmap_sem);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

823
SYSCALL_DEFINE0(munlockall)
Linus Torvalds's avatar
Linus Torvalds committed
824
825
826
{
	int ret;

827
828
	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;
829
	ret = apply_mlockall_flags(0);
Linus Torvalds's avatar
Linus Torvalds committed
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
	up_write(&current->mm->mmap_sem);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct user_struct *user)
{
	unsigned long lock_limit, locked;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
Jiri Slaby's avatar
Jiri Slaby committed
846
	lock_limit = rlimit(RLIMIT_MEMLOCK);
847
848
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
Linus Torvalds's avatar
Linus Torvalds committed
849
850
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
851
852
	if (!allowed &&
	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
Linus Torvalds's avatar
Linus Torvalds committed
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
		goto out;
	get_uid(user);
	user->locked_shm += locked;
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

/*
 * user_shm_unlock - give back @size bytes of locked shm charged to @user
 *
 * Subtracts @size, rounded up to whole pages, from user->locked_shm under
 * shmlock_user_lock and then calls free_uid() on @user.
 */
void user_shm_unlock(size_t size, struct user_struct *user)
{
	unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	spin_lock(&shmlock_user_lock);
	user->locked_shm -= nr_pages;
	spin_unlock(&shmlock_user_lock);

	free_uid(user);
}
869
870
871
872

#ifdef CONFIG_IPIPE
int __ipipe_pin_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
873
874
	unsigned int gup_flags = 0;
	int ret, len;
875
876
877
878
879
880
881
882
883
884
885

	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return 0;

	if (!((vma->vm_flags & VM_DONTEXPAND) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm))) {
		ret = populate_vma_page_range(vma, vma->vm_start, vma->vm_end,
					      NULL);
		return ret < 0 ? ret : 0;
	}

886
887
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;
888
	len = DIV_ROUND_UP(vma->vm_end, PAGE_SIZE) - vma->vm_start/PAGE_SIZE;
889
	ret = get_user_pages_locked(vma->vm_start, len, gup_flags, NULL, NULL);
890
891
892
893
894
	if (ret < 0)
		return ret;
	return ret == len ? 0 : -EFAULT;
}
#endif