/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

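/*
 * Clear a huge page one base page at a time, with a cond_resched()
 * between base pages so we do not hog the CPU for a whole HPAGE_SIZE.
 */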
static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

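/*
 * Put a huge page back on the free list of the node it belongs to and
 * update the global and per-node free counts.  Called with hugetlb_lock
 * held.
 */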
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

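/*
 * Take a free huge page off the freelists, walking the zonelist chosen
 * by huge_zonelist() for @vma and skipping zones the current cpuset
 * does not allow.  Returns NULL if no suitable page is free.  Called
 * with hugetlb_lock held.
 */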
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						htlb_alloc_mask);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	return page;
}

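/*
 * Compound page destructor, invoked when the last reference to a huge
 * page is dropped: the page goes back onto the hugepage freelists
 * rather than to the buddy allocator.
 */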
static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

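/*
 * Grow the pool by one huge page taken from the buddy allocator,
 * spreading allocations round-robin across the online nodes.  The new
 * page reaches the freelists via its compound destructor.  Returns 1 on
 * success, 0 on failure.
 */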
static int alloc_fresh_huge_page(void)
{
	static int prev_nid;
	struct page *page;
	int nid;

	/*
	 * Copy static prev_nid to local nid, work on that, then copy it
	 * back to prev_nid afterwards: otherwise there's a window in which
	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
	 * But we don't need to use a spin_lock here: it really doesn't
	 * matter if occasionally a racer chooses the same nid as we do.
	 */
	nid = next_node(prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	prev_nid = nid;

	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

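/*
 * Allocate a huge page for a fault in @vma.  Shared (VM_MAYSHARE)
 * mappings consume one of the pages reserved for them; private mappings
 * may only take pages that are not needed to back existing
 * reservations.
 */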
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

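/*
 * Sum a per-node counter array over the nodes in the current task's
 * cpuset.
 */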
static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1<< PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

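/*
 * When shrinking the pool on a highmem system, free huge pages that
 * live in lowmem first so that the scarcer low memory is returned to
 * the kernel before any highmem pages are touched.
 */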
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

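/*
 * Grow or shrink the pool towards @count huge pages.  Growing stops
 * early if the buddy allocator cannot supply more pages; shrinking
 * never goes below the number of reserved pages.  Returns the resulting
 * pool size.
 */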
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	}
}


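/*
 * Duplicate the huge page mappings of @vma from @src into @dst at fork.
 * For private COW mappings both copies are write protected, so the
 * first write in either mm goes through hugetlb_cow().
 */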
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

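/*
 * Tear down the huge ptes in [start, end).  Pages are gathered on a
 * local list and only released once the TLB has been flushed.  The
 * caller must hold the mapping's i_mmap_lock (see unmap_hugepage_range
 * below).
 */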
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

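/*
 * Handle a write fault on a huge page mapped read-only.  If we are the
 * only user of the page, just make the pte writable; otherwise allocate
 * a fresh huge page, copy the data and switch the mapping over.  Called
 * with mm->page_table_lock held; the lock is dropped around the copy.
 */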
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

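/*
 * Fault in a huge page that has no pte yet: look it up in the file's
 * page cache, allocating and zeroing a fresh page (and, for shared
 * mappings, inserting it into the page cache) if it is not there, then
 * install the pte.  Racing truncation is handled by rechecking i_size
 * under page_table_lock and backing out if the index is now beyond EOF.
 */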
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

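/*
 * Back end for get_user_pages() on hugetlb VMAs: walk the huge ptes
 * covering the range starting at *position, faulting in anything that
 * is missing, take a reference on each requested base page and fill in
 * pages[] and vmas[].  Returns the updated index i, or -EFAULT if
 * nothing could be mapped.
 */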
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

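/*
 * Reserved ranges of a hugetlbfs file are tracked, in units of huge
 * pages, as a list of file_region entries hanging off the inode's
 * mapping->private_list.  region_chg() reports how many extra pages a
 * range would need, region_add() commits the range, and
 * region_truncate() drops everything past a given offset and returns
 * the number of reserved pages released.
 */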
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher, extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position, but make it zero
	 * size so that we are guaranteed to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

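/*
 * Adjust the global reservation by @delta huge pages, failing with
 * -ENOMEM if the free pool cannot cover the resulting reservation.
 */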
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

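/*
 * Reserve enough huge pages to cover the range [from, to) of the file,
 * on top of whatever is already reserved, and record the range in the
 * inode's region list so that it is not accounted twice.
 */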
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpusets are configured, they break strict hugetlb page
	 * reservation because the accounting is done on a global variable.
	 * Such a reservation is essentially meaningless in the presence of
	 * cpusets: it is never checked against page availability for the
	 * current cpuset, so an application can still be OOM-killed for
	 * lack of a free huge page in the cpuset the task runs in.
	 * Enforcing strict accounting per cpuset is almost impossible (or
	 * too ugly) because cpusets are fluid: tasks and memory nodes can
	 * be moved between them dynamically.
	 *
	 * Changing the semantics of shared hugetlb mappings under cpusets
	 * is undesirable. However, to preserve some of the semantics, we
	 * fall back to checking against the current free page count as a
	 * best effort, hopefully minimizing the impact of the semantic
	 * change that cpusets introduce.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}