/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred       Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *		  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
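
/*
 * Illustrative sketch (user space, not part of this file): the policies
 * above are selected through the set_mempolicy()/mbind() system calls
 * implemented further down.  Node masks are passed as bitmaps with one
 * bit per node, e.g. for nodes 0 and 1:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	mbind(addr, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_STRICT);
 *
 * The first call sets the process policy for future allocations; the
 * second sets a VMA policy for the mapping covering [addr, addr + len).
 */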

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

static void gather_stats(struct page *, void *);
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private);
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			spin_unlock(ptl);
			migrate_page_add(vma, page, private, flags);
			spin_lock(ptl);
		}
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (
		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
		return 0;
	return 1;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	cpuset_update_current_mems_allowed();
	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
		return -EINVAL;
	return mpol_check_policy(mode, nodes);
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				*nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
			unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

/*
 * page migration
 */

/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
			struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int rc = 1;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
out:
	spin_unlock(&mapping->i_mmap_lock);
	return rc;
}

/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared by others and not writable.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
	    mapping_writably_mapped(page->mapping) ||
	    single_mm_mapping(vma->vm_mm, page->mapping)) {
		int rc = isolate_lru_page(page);

		if (rc == 1)
			list_add(&page->lru, pagelist);
		/*
		 * If the isolate attempt was not successful then we just
		 * encountered an unswappable page. Something must be wrong.
		 */
		WARN_ON(rc == 0);
	}
}

static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);

	return n;
}

/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}

long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
			mode, nmask ? nodes_addr(*nmask)[0] : -1);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}

/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
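
/*
 * Illustrative sketch (user space, an assumption rather than kernel code):
 * when a node mask is requested, sys_get_mempolicy() above requires
 * maxnode >= MAX_NUMNODES, so callers pass a generously sized bitmap:
 *
 *	unsigned long mask[64];
 *	int mode;
 *	get_mempolicy(&mode, mask, 8 * sizeof(mask), NULL, 0);
 *
 * With addr == NULL and flags == 0 this reports the process policy;
 * copy_nodes_to_user() zero fills any part of the user buffer beyond the
 * kernel's node mask.
 */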

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy * get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;