hugetlb.c 46.7 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
David Gibson's avatar
David Gibson committed
13
#include <linux/pagemap.h>
14
#include <linux/mempolicy.h>
15
#include <linux/cpuset.h>
16
#include <linux/mutex.h>
17

David Gibson's avatar
David Gibson committed
18
19
20
21
#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
22
#include "internal.h"
Linus Torvalds's avatar
Linus Torvalds committed
23
24

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26
static unsigned long surplus_huge_pages;
27
static unsigned long nr_overcommit_huge_pages;
Linus Torvalds's avatar
Linus Torvalds committed
28
unsigned long max_huge_pages;
29
unsigned long sysctl_overcommit_huge_pages;
Linus Torvalds's avatar
Linus Torvalds committed
30
31
32
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
33
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34
35
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
36
static int hugetlb_next_nid;
37

38
39
40
41
/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);
42

43
44
45
/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
46
47
48
49
50
51
52
53
54
55
 *
 * The region data structures are protected by a combination of the mmap_sem
 * and the hugetlb_instantiation_mutex.  To access or modify a region the
 * caller must either hold the mmap_sem for write, or the mmap_sem for read
 * and the hugetlb_instantiation_mutex:
 *
 * 	down_write(&mm->mmap_sem);
 * or
 * 	down_read(&mm->mmap_sem);
 * 	mutex_lock(&hugetlb_instantiation_mutex);
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
 */
/*
 * One contiguous range on a region list.  The region helpers treat the
 * range as half-open: its length is (to - from), and a zero-length entry
 * has from == to.
 */
struct file_region {
	struct list_head link;	/* linkage into the owning region list */
	long from;		/* first offset covered by this region */
	long to;		/* one past the last offset covered */
};

/*
 * Record the range [f, t) on the region list at 'head', merging it with
 * every existing region it touches or overlaps.  Nothing is allocated
 * here: the first matching list entry is reused to hold the merged range
 * and any further entries swallowed by it are unlinked and freed.
 * Always returns 0.
 *
 * NOTE(review): there is no check for the initial walk running off the
 * end of the list; presumably callers always run region_chg() on the same
 * range first, which guarantees an entry exists -- confirm.
 */
static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	/*
	 * Starting the walk from rg->link.prev makes the first entry visited
	 * rg itself; the _safe variant is needed because entries are freed.
	 */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	/* The reused entry now describes the whole merged range. */
	nrg->from = f;
	nrg->to = t;
	return 0;
}

/*
 * Work out how much additional coverage adding the range [f, t) to the
 * region list at 'head' would create, without recording the range itself.
 * When no existing region touches the range, a zero-length placeholder
 * entry is inserted at 'f' (see the comment in the body).
 *
 * Returns the change in coverage, or -ENOMEM if the placeholder entry
 * cannot be allocated.
 */
static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		/* Insert immediately before the entry the walk stopped at. */
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

/*
 * Remove everything at or beyond 'end' from the region list at 'head': a
 * region that straddles 'end' is shortened to stop there, and every later
 * region is unlinked and freed.  Returns the total length removed.
 */
static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	/* Walked off the end: nothing reaches 'end', so nothing to drop. */
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		/* The trimmed region stays; dropping starts at its successor. */
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/*
 * Count how much of the range [f, t) is covered by the regions on the
 * list at 'head'.  Regions ending at or before 'f' are skipped, and the
 * walk stops at the first region starting at or beyond 't'.
 *
 * Returns the summed length of the overlapping parts.
 */
static long region_count(struct list_head *head, long f, long t)
{
	struct file_region *rg;
	long chg = 0;

	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		/*
		 * Region bounds and the f/t arguments are all long; keep the
		 * clamped bounds long as well so large offsets are not
		 * truncated on 64-bit.
		 */
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}

	return chg;
}

202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/*
 * Convert the address within this vma to the page offset within
 * the mapping, in base page units.
 *
 * vm_pgoff is already expressed in base (PAGE_SIZE) pages -- the huge page
 * variant below relies on the same unit -- so it is added unshifted; only
 * the byte distance from vm_start is converted to pages.
 */
static pgoff_t vma_page_offset(struct vm_area_struct *vma,
				unsigned long address)
{
	return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

/*
 * Translate an address inside this vma into an index into the mapping,
 * counted in pagecache pages -- which are huge pages here.
 */
static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
					unsigned long address)
{
	/* Huge pages between the start of the vma and the address. */
	pgoff_t vma_index = (address - vma->vm_start) >> HPAGE_SHIFT;
	/* The vma's starting offset, scaled down to huge pages. */
	pgoff_t start_index = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);

	return vma_index + start_index;
}

224
225
226
227
228
229
230
/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
231
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
232

233
234
235
236
237
238
239
240
241
/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
242
243
244
245
246
247
248
249
250
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * a reservation, i.e. where pages have been instantiated.
251
 */
252
253
254
255
256
257
258
259
260
261
262
/* Read the vma's vm_private_data pointer back as an integer word. */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	void *raw = vma->vm_private_data;

	return (unsigned long)raw;
}

/* Store an integer word into the vma's vm_private_data pointer. */
static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	void *raw = (void *)value;

	vma->vm_private_data = raw;
}

263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* Reference-counted reservation map: a kref plus a list of regions. */
struct resv_map {
	struct kref refs;		/* reference count; resv_map_release() receives this */
	struct list_head regions;	/* list head handed to the region_*() helpers */
};

/*
 * Allocate a reservation map with its reference count initialised and an
 * empty region list.  Returns NULL if the allocation fails.
 */
struct resv_map *resv_map_alloc(void)
{
	struct resv_map *map;

	map = kmalloc(sizeof(*map), GFP_KERNEL);
	if (map) {
		kref_init(&map->refs);
		INIT_LIST_HEAD(&map->regions);
	}

	return map;
}

/*
 * Given the kref embedded in a reservation map, empty the map's region
 * list and free the map itself.
 */
void resv_map_release(struct kref *ref)
{
	struct resv_map *map;

	map = container_of(ref, struct resv_map, refs);

	/* Clear out any active regions before the map goes away. */
	region_truncate(&map->regions, 0);
	kfree(map);
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
290
291
292
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_SHARED))
293
294
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
295
296
297
	return 0;
}

298
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
299
300
301
302
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	VM_BUG_ON(vma->vm_flags & VM_SHARED);

303
304
	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
305
306
307
308
309
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
310
311
312
	VM_BUG_ON(vma->vm_flags & VM_SHARED);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
313
314
315
316
317
}

/* Return 1 if any bit of 'flag' is set in the vma's private data, else 0. */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	unsigned long word;

	VM_BUG_ON(!is_vm_hugetlb_page(vma));

	word = get_vma_private_data(vma);
	return (word & flag) ? 1 : 0;
}

/* Decrement the reserved pages in the hugepage pool by one */
static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
{
325
326
327
	if (vma->vm_flags & VM_NORESERVE)
		return;

328
329
330
	if (vma->vm_flags & VM_SHARED) {
		/* Shared mappings always use reserves */
		resv_huge_pages--;
331
	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
332
333
334
335
		/*
		 * Only the process that called mmap() has reserves for
		 * private mappings.
		 */
336
		resv_huge_pages--;
337
338
339
	}
}

340
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
341
342
343
344
345
346
347
348
349
350
351
352
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON(!is_vm_hugetlb_page(vma));
	if (!(vma->vm_flags & VM_SHARED))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static int vma_has_private_reserves(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHARED)
		return 0;
353
	if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
354
355
356
357
		return 0;
	return 1;
}

358
359
360
361
362
363
364
static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
365
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
366
367
368
369
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
370
			   unsigned long addr, struct vm_area_struct *vma)
371
372
373
374
375
376
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
377
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
378
379
380
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
381
382
383
384
385
386
387
388
/*
 * Put a huge page on the free list of the node it belongs to and bump the
 * global and per-node free counters.
 */
static void enqueue_huge_page(struct page *page)
{
	const int node = page_to_nid(page);

	free_huge_pages_node[node]++;
	free_huge_pages++;
	list_add(&page->lru, &hugepage_freelists[node]);
}

389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
static struct page *dequeue_huge_page(void)
{
	int nid;
	struct page *page = NULL;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
408
				unsigned long address, int avoid_reserve)
Linus Torvalds's avatar
Linus Torvalds committed
409
{
410
	int nid;
Linus Torvalds's avatar
Linus Torvalds committed
411
	struct page *page = NULL;
412
	struct mempolicy *mpol;
413
	nodemask_t *nodemask;
414
	struct zonelist *zonelist = huge_zonelist(vma, address,
415
					htlb_alloc_mask, &mpol, &nodemask);
416
417
	struct zone *zone;
	struct zoneref *z;
Linus Torvalds's avatar
Linus Torvalds committed
418

419
420
421
422
423
424
425
426
427
	/*
	 * A child process with MAP_PRIVATE mappings created by their parent
	 * have no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
	if (!vma_has_private_reserves(vma) &&
			free_huge_pages - resv_huge_pages == 0)
		return NULL;

428
429
430
431
	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
		return NULL;

432
433
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
434
435
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
436
437
438
439
440
441
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
442
443
444

			if (!avoid_reserve)
				decrement_hugepage_resv_vma(vma);
445

Ken Chen's avatar
Ken Chen committed
446
			break;
447
		}
Linus Torvalds's avatar
Linus Torvalds committed
448
	}
449
	mpol_cond_put(mpol);
Linus Torvalds's avatar
Linus Torvalds committed
450
451
452
	return page;
}

453
454
455
456
457
458
459
460
461
462
463
464
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1<< PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
465
	arch_release_hugepage(page);
466
467
468
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

469
470
static void free_huge_page(struct page *page)
{
471
	int nid = page_to_nid(page);
472
	struct address_space *mapping;
473

474
	mapping = (struct address_space *) page_private(page);
475
	set_page_private(page, 0);
476
	BUG_ON(page_count(page));
477
478
479
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
480
481
482
483
484
485
486
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
487
	spin_unlock(&hugetlb_lock);
488
	if (mapping)
489
		hugetlb_put_quota(mapping, 1);
490
491
}

492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 *
 * 'delta' must be -1 or +1.  The node the previous call stopped at is kept
 * in a function-local static, so each call resumes the rotation from the
 * node after it.
 *
 * Returns 1 if an adjustment was made, 0 if no online node qualified.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		/* Advance to the next online node, wrapping at the end. */
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/*
		 * Note: 'continue' in a do/while jumps to the loop condition,
		 * so a rejected node still counts towards the full rotation.
		 */
		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	/* Remember where we stopped so the next call starts after it. */
	prev_nid = nid;
	return ret;
}

527
static struct page *alloc_fresh_huge_page_node(int nid)
Linus Torvalds's avatar
Linus Torvalds committed
528
529
{
	struct page *page;
530

531
	page = alloc_pages_node(nid,
532
533
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
534
		HUGETLB_PAGE_ORDER);
Linus Torvalds's avatar
Linus Torvalds committed
535
	if (page) {
536
537
		if (arch_prepare_hugepage(page)) {
			__free_pages(page, HUGETLB_PAGE_ORDER);
538
			return NULL;
539
		}
540
		set_compound_page_dtor(page, free_huge_page);
541
		spin_lock(&hugetlb_lock);
Linus Torvalds's avatar
Linus Torvalds committed
542
		nr_huge_pages++;
543
		nr_huge_pages_node[nid]++;
544
		spin_unlock(&hugetlb_lock);
545
		put_page(page); /* free it into the hugepage allocator */
Linus Torvalds's avatar
Linus Torvalds committed
546
	}
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580

	return page;
}

static int alloc_fresh_huge_page(void)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
		if (page)
			ret = 1;
		/*
		 * Use a helper variable to find the next node and then
		 * copy it back to hugetlb_next_nid afterwards:
		 * otherwise there's a window in which a racer might
		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
		 * But we don't need to use a spin_lock here: it really
		 * doesn't matter if occasionally a racer chooses the
		 * same nid as we do.  Move nid forward in the mask even
		 * if we just successfully allocated a hugepage so that
		 * the next caller gets hugepages on the next node.
		 */
		next_nid = next_node(hugetlb_next_nid, node_online_map);
		if (next_nid == MAX_NUMNODES)
			next_nid = first_node(node_online_map);
		hugetlb_next_nid = next_nid;
	} while (!page && hugetlb_next_nid != start_nid);

581
582
583
584
585
	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

586
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
587
588
}

589
590
591
592
static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;
593
	unsigned int nid;
594

595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		nr_huge_pages++;
		surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

628
629
	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
					__GFP_REPEAT|__GFP_NOWARN,
630
					HUGETLB_PAGE_ORDER);
631
632

	spin_lock(&hugetlb_lock);
633
	if (page) {
634
635
636
637
638
639
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON(page_count(page));
640
		nid = page_to_nid(page);
641
		set_compound_page_dtor(page, free_huge_page);
642
643
644
645
646
		/*
		 * We incremented the global counters already
		 */
		nr_huge_pages_node[nid]++;
		surplus_huge_pages_node[nid]++;
647
		__count_vm_event(HTLB_BUDDY_PGALLOC);
648
649
650
	} else {
		nr_huge_pages--;
		surplus_huge_pages--;
651
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
652
	}
653
	spin_unlock(&hugetlb_lock);
654
655
656
657

	return page;
}

658
659
660
661
662
663
664
665
666
667
668
669
/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	needed = (resv_huge_pages + delta) - free_huge_pages;
670
671
	if (needed <= 0) {
		resv_huge_pages += delta;
672
		return 0;
673
	}
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accomodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
711
712
713
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
714
715
	 */
	needed += allocated;
716
	resv_huge_pages += delta;
717
718
	ret = 0;
free:
719
	/* Free the needed pages to the hugetlb pool */
720
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
721
722
		if ((--needed) < 0)
			break;
723
		list_del(&page->lru);
724
725
726
727
728
729
730
731
		enqueue_huge_page(page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
732
			/*
733
734
735
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.  This must be done with hugetlb_lock
736
737
738
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
739
			free_huge_page(page);
740
		}
741
		spin_lock(&hugetlb_lock);
742
743
744
745
746
747
748
749
750
751
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 */
752
static void return_unused_surplus_pages(unsigned long unused_resv_pages)
753
754
755
756
757
{
	static int nid = -1;
	struct page *page;
	unsigned long nr_pages;

758
759
760
761
762
763
764
765
	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

766
767
768
	/* Uncommit the reservation */
	resv_huge_pages -= unused_resv_pages;

769
770
	nr_pages = min(unused_resv_pages, surplus_huge_pages);

771
	while (remaining_iterations-- && nr_pages) {
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			surplus_huge_pages--;
			surplus_huge_pages_node[nid]--;
			nr_pages--;
789
			remaining_iterations = num_online_nodes();
790
791
792
793
		}
	}
}

794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation.  Where it does not we will need to logically increase
 * reservation and actually increase quota before an allocation can occur.
 * Where any new reservation would be required the reservation change is
 * prepared, but not committed.  Once the page has been quota'd, allocated
 * and instantiated the change should be committed via vma_commit_reservation.
 * No action is required on failure.
 */
static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;

	if (vma->vm_flags & VM_SHARED) {
		pgoff_t idx = vma_pagecache_offset(vma, addr);
		return region_chg(&inode->i_mapping->private_list,
							idx, idx + 1);

813
814
	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		return 1;
815

816
817
818
819
820
821
822
823
824
825
	} else  {
		int err;
		pgoff_t idx = vma_pagecache_offset(vma, addr);
		struct resv_map *reservations = vma_resv_map(vma);

		err = region_chg(&reservations->regions, idx, idx + 1);
		if (err < 0)
			return err;
		return 0;
	}
826
827
828
829
830
831
832
833
834
835
}
static void vma_commit_reservation(struct vm_area_struct *vma,
							unsigned long addr)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;

	if (vma->vm_flags & VM_SHARED) {
		pgoff_t idx = vma_pagecache_offset(vma, addr);
		region_add(&inode->i_mapping->private_list, idx, idx + 1);
836
837
838
839
840
841
842

	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		pgoff_t idx = vma_pagecache_offset(vma, addr);
		struct resv_map *reservations = vma_resv_map(vma);

		/* Mark this page used in the map. */
		region_add(&reservations->regions, idx, idx + 1);
843
844
845
	}
}

846
static struct page *alloc_huge_page(struct vm_area_struct *vma,
847
				    unsigned long addr, int avoid_reserve)
Linus Torvalds's avatar
Linus Torvalds committed
848
{
849
	struct page *page;
850
851
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
852
	unsigned int chg;
853
854
855
856
857

	/*
	 * Processes that did not create the mapping will have no reserves and
	 * will not have accounted against quota. Check that the quota can be
	 * made before satisfying the allocation
858
859
	 * MAP_NORESERVE mappings may also need pages and quota allocated
	 * if no reserve mapping overlaps.
860
	 */
861
862
863
864
	chg = vma_needs_reservation(vma, addr);
	if (chg < 0)
		return ERR_PTR(chg);
	if (chg)
865
866
		if (hugetlb_get_quota(inode->i_mapping, chg))
			return ERR_PTR(-ENOSPC);
Linus Torvalds's avatar
Linus Torvalds committed
867
868

	spin_lock(&hugetlb_lock);
869
	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
Linus Torvalds's avatar
Linus Torvalds committed
870
	spin_unlock(&hugetlb_lock);
871

Ken Chen's avatar
Ken Chen committed
872
	if (!page) {
873
		page = alloc_buddy_huge_page(vma, addr);
Ken Chen's avatar
Ken Chen committed
874
		if (!page) {
875
			hugetlb_put_quota(inode->i_mapping, chg);
Ken Chen's avatar
Ken Chen committed
876
877
878
			return ERR_PTR(-VM_FAULT_OOM);
		}
	}
879

880
881
	set_page_refcounted(page);
	set_page_private(page, (unsigned long) mapping);
882

883
884
	vma_commit_reservation(vma, addr);

885
	return page;
886
887
}

Linus Torvalds's avatar
Linus Torvalds committed
888
889
890
891
/*
 * Boot-time setup of the huge page pool: initialise the per-node free
 * lists, pick the first online node as the starting allocation node, and
 * try to allocate max_huge_pages huge pages.  The pool counters are then
 * set to the number actually obtained.  Does nothing when HPAGE_SHIFT is 0.
 * Always returns 0.
 */
static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	hugetlb_next_nid = first_node(node_online_map);

	/* Stop at the first failure; 'i' ends up as the number allocated. */
	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	/* free_huge_pages is unsigned long, so it is printed with %lu. */
	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

/*
 * Parse the value of the "hugepages=" boot parameter into max_huge_pages.
 * Anything that does not parse as an unsigned number leaves the pool size
 * at 0.  Always returns 1.
 */
static int __init hugetlb_setup(char *s)
{
	int matched = sscanf(s, "%lu", &max_huge_pages);

	if (matched < 1)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

918
919
920
921
922
923
924
925
926
927
928
static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

Linus Torvalds's avatar
Linus Torvalds committed
929
930
931
932
#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
933
934
	int i;

Linus Torvalds's avatar
Linus Torvalds committed
935
936
937
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
938
939
			if (count >= nr_huge_pages)
				return;
Linus Torvalds's avatar
Linus Torvalds committed
940
941
942
943
944
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
945
			free_huge_pages_node[page_to_nid(page)]--;
Linus Torvalds's avatar
Linus Torvalds committed
946
947
948
949
950
951
952
953
954
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

955
#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
Linus Torvalds's avatar
Linus Torvalds committed
956
957
static unsigned long set_max_huge_pages(unsigned long count)
{
958
	unsigned long min_count, ret;
Linus Torvalds's avatar
Linus Torvalds committed
959

960
961
962
963
	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
964
965
966
967
968
969
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
970
	 */
Linus Torvalds's avatar
Linus Torvalds committed
971
	spin_lock(&hugetlb_lock);
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
997
998
999
1000
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since