/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

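/*
 * Walk the existing page tables for @addr and return its pmd, or
 * NULL if the pgd, pud or pmd level is not present (or is bad).
 */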
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none_or_clear_bad(pmd))
		return NULL;

	return pmd;
}

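/*
 * Make sure page tables exist at the destination: allocate pud, pmd
 * and pte levels for @addr as needed, returning NULL on failure.
 */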
static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
		return NULL;

	return pmd;
}

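/*
 * Shift the ptes in [old_addr, old_end) under old_pmd across to
 * new_pmd: each pte is cleared and flushed at the old address, then
 * re-installed at the new one, with both pte locks held and the mmu
 * notifiers told about the invalidated source range.
 */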
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr)
{
	struct address_space *mapping = NULL;
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	unsigned long old_start;

	old_start = old_addr;
	mmu_notifier_invalidate_range_start(vma->vm_mm,
					    old_start, old_end);
	if (vma->vm_file) {
		/*
		 * Subtle point from Rajesh Venkatasubramanian: before
		 * moving file-based ptes, we must lock truncate_pagecache
		 * out, since it might clean the dst vma before the src vma,
		 * and we propagate stale pages into the dst afterward.
		 */
		mapping = vma->vm_file->f_mapping;
		spin_lock(&mapping->i_mmap_lock);
		if (new_vma->vm_truncate_count &&
		    new_vma->vm_truncate_count != vma->vm_truncate_count)
			new_vma->vm_truncate_count = 0;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_sem prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	new_pte = pte_offset_map_nested(new_pmd, new_addr);
	new_ptl = pte_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(*old_pte))
			continue;
		pte = ptep_clear_flush(vma, old_addr, old_pte);
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap_nested(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
	if (mapping)
		spin_unlock(&mapping->i_mmap_lock);
	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}

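/* Cap on the extent moved per move_ptes() call, to keep latency down */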
#define LATENCY_LIMIT	(64 * PAGE_SIZE)

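/*
 * Move the page tables for [old_addr, old_addr+len) over to new_addr,
 * one pmd-sized extent (at most LATENCY_LIMIT bytes) at a time, with a
 * cond_resched() between extents.  Returns the number of bytes moved,
 * which is less than len if a destination pmd could not be allocated.
 */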
unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len)
{
	unsigned long extent, next, old_end;
	pmd_t *old_pmd, *new_pmd;

	old_end = old_addr + len;
	flush_cache_range(vma, old_addr, old_end);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		if (next - 1 > old_end)
			next = old_end;
		extent = next - old_addr;
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
		if (!new_pmd)
			break;
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;
		if (extent > LATENCY_LIMIT)
			extent = LATENCY_LIMIT;
		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
				new_vma, new_pmd, new_addr);
	}

	return len + old_addr - old_end;	/* how much done */
}

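/*
 * Move a vma: set up new_vma with copy_vma(), move the page tables
 * across, then unmap the old range.  If the page-table move fails
 * partway, the entries are moved back and the new range is unmapped
 * instead, so -ENOMEM is returned as the address.
 */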
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long excess = 0;
	unsigned long hiwater_vm;
	int split = 0;
	int err;

	/*
	 * We'd prefer to avoid failure later on in do_munmap,
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped.  But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
						MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
	if (moved_len < old_len) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since the page tables are still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = -ENOMEM;
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT) {
		vma->vm_flags &= ~VM_ACCOUNT;
		excess = vma->vm_end - vma->vm_start - old_len;
		if (old_addr > vma->vm_start &&
		    old_addr + old_len < vma->vm_end)
			split = 1;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	mm->total_vm += new_len >> PAGE_SHIFT;
	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

	if (do_munmap(mm, old_addr, old_len) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		vm_unacct_memory(excess >> PAGE_SHIFT);
		excess = 0;
	}
	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (excess) {
		vma->vm_flags |= VM_ACCOUNT;
		if (split)
			vma->vm_next->vm_flags |= VM_ACCOUNT;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		if (new_len > old_len)
			mlock_vma_pages_range(new_vma, new_addr + old_len,
						       new_addr + new_len);
	}

	return new_addr;
}

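/*
 * Find and validate the vma for a resize of [addr, addr+old_len) to
 * new_len: refuse hugetlb vmas, ranges crossing the end of the vma,
 * growing VM_DONTEXPAND/VM_PFNMAP mappings, exceeding RLIMIT_MEMLOCK
 * on locked vmas, or failing the commit checks.  Any VM_ACCOUNT charge
 * is returned via *p; errors come back as an ERR_PTR().
 */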
static struct vm_area_struct *vma_to_resize(unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long *p)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = find_vma(mm, addr);

	if (!vma || vma->vm_start > addr)
		goto Efault;

	if (is_vm_hugetlb_page(vma))
		goto Einval;

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		goto Efault;

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
		if (new_len > old_len)
			goto Efault;
	}

	if (vma->vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		locked = mm->locked_vm << PAGE_SHIFT;
		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
		locked += new_len - old_len;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			goto Eagain;
	}

	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
		goto Enomem;

	if (vma->vm_flags & VM_ACCOUNT) {
		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
		if (security_vm_enough_memory(charged))
			goto Efault;
		*p = charged;
	}

	return vma;

Efault:	/* very odd choice for most of the cases, but... */
	return ERR_PTR(-EFAULT);
Einval:
	return ERR_PTR(-EINVAL);
Enomem:
	return ERR_PTR(-ENOMEM);
Eagain:
	return ERR_PTR(-EAGAIN);
}

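/*
 * MREMAP_FIXED case: validate that the destination does not overlap
 * the source, unmap whatever occupies the destination (and any tail
 * being shrunk off the source), then move the vma into place.
 */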
static unsigned long mremap_to(unsigned long addr,
	unsigned long old_len, unsigned long new_addr,
	unsigned long new_len)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;
	unsigned long map_flags;

	if (new_addr & ~PAGE_MASK)
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Check if the location we're moving into overlaps the
	 * old location at all, and fail if it does.
	 */
	if ((new_addr <= addr) && (new_addr+new_len) > addr)
		goto out;

	if ((addr <= new_addr) && (addr+old_len) > new_addr)
		goto out;

	ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
	if (ret)
		goto out;

	ret = do_munmap(mm, new_addr, new_len);
	if (ret)
		goto out;

	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len);
		if (ret && old_len != new_len)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	map_flags = MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;
	ret = arch_mmap_check(new_addr, new_len, map_flags);
	if (ret)
		goto out1;
	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (ret & ~PAGE_MASK)
		goto out1;

	ret = move_vma(vma, addr, old_len, new_len, new_addr);
	if (!(ret & ~PAGE_MASK))
		goto out;
out1:
	vm_unacct_memory(charged);

out:
	return ret;
}

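/*
 * Can the vma grow in place by delta bytes?  Only if the new end does
 * not wrap, does not run into the next vma, and both arch_mmap_check()
 * and a MAP_FIXED get_unmapped_area() accept the enlarged range.
 */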
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;
	unsigned long max_addr = TASK_SIZE;
	if (vma->vm_next)
		max_addr = vma->vm_next->vm_start;
	if (max_addr < end || end < vma->vm_end)
		return 0;
	if (arch_mmap_check(vma->vm_start, end - vma->vm_start, MAP_FIXED))
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
unsigned long do_mremap(unsigned long addr,
	unsigned long old_len, unsigned long new_len,
	unsigned long flags, unsigned long new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
		goto out;

	if (addr & ~PAGE_MASK)
		goto out;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for the DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		goto out;

	if (flags & MREMAP_FIXED) {
		if (flags & MREMAP_MAYMOVE)
			ret = mremap_to(addr, old_len, new_addr, new_len);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_munmap does all the needed commit accounting
	 */
	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len);
		if (ret && old_len != new_len)
			goto out;
		ret = addr;
		goto out;
	}

	/*
	 * Ok, we need to grow..
	 */
	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* old_len exactly to the end of the area..
	 */
	if (old_len == vma->vm_end - addr) {
		/* can we just expand the current mapping? */
		if (vma_expandable(vma, new_len - old_len)) {
			int pages = (new_len - old_len) >> PAGE_SHIFT;

			vma_adjust(vma, vma->vm_start,
				addr + new_len, vma->vm_pgoff, NULL);

			mm->total_vm += pages;
			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				mlock_vma_pages_range(vma, addr + old_len,
						   addr + new_len);
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it..
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;
		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff, map_flags);
		if (new_addr & ~PAGE_MASK) {
			ret = new_addr;
			goto out;
		}

		ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
		if (ret)
			goto out;
		ret = move_vma(vma, addr, old_len, new_len, new_addr);
	}
out:
	if (ret & ~PAGE_MASK)
		vm_unacct_memory(charged);
	return ret;
}

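/*
 * The mremap(2) entry point: all the work is done by do_mremap()
 * under a write lock on current->mm->mmap_sem.
 */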
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	unsigned long ret;

	down_write(&current->mm->mmap_sem);
	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
	up_write(&current->mm->mmap_sem);
	return ret;
}