/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

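/*
 * Take a free huge page from a node allowed by the VMA's memory policy
 * and the current cpuset.  Must be called with hugetlb_lock held.
 */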
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address,
						htlb_alloc_mask);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;	/* don't dequeue (and leak) a second page */
		}
	}
	return page;
}

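/*
 * Compound page destructor: the final put_page() on a huge page lands here
 * and returns the page to the free pool.
 */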
static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

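/*
 * Allocate a brand new huge page from the buddy allocator, spreading
 * allocations round-robin across the online nodes, and add it to the pool.
 */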
static int alloc_fresh_huge_page(void)
{
	static int prev_nid;
	struct page *page;
	int nid;

	/*
	 * Copy static prev_nid to local nid, work on that, then copy it
	 * back to prev_nid afterwards: otherwise there's a window in which
	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
	 * But we don't need to use a spin_lock here: it really doesn't
	 * matter if occasionally a racer chooses the same nid as we do.
	 */
	nid = next_node(prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	prev_nid = nid;

	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

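/*
 * Dequeue a huge page for this VMA.  Shared mappings draw on their
 * reservation; private mappings may only use pages beyond the reserve.
 */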
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

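/* Sum a per-node counter over the nodes allowed by the current cpuset. */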
static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
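/*
 * Return a huge page to the buddy allocator: reset the per-page flags and
 * the compound destructor, then free the whole HUGETLB_PAGE_ORDER block.
 * Called with hugetlb_lock held.
 */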
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
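/*
 * Shrink the pool by freeing non-highmem ("low") pages from every node's
 * freelist until only "count" huge pages remain.
 */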
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

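/*
 * Grow or shrink the huge page pool to "count" pages, never shrinking
 * below the reserved count.  Returns the resulting pool size.
 */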
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

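/*
 * Build the huge PTE for a page: writable mappings get a dirty, writable
 * entry, others a write-protected one; the result is marked young and huge.
 */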
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	}
}


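/*
 * Duplicate a VMA's huge PTEs at fork time.  For private COW mappings the
 * parent's PTEs are write-protected and both processes share the pages,
 * each holding a reference.
 */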
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

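/*
 * Tear down every huge PTE in [start, end), gathering the pages on a local
 * list and releasing them only after the TLB flush.  The caller must hold
 * the file's i_mmap_lock.
 */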
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been set up, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

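/*
 * Break copy-on-write for a huge page: if we are the only user just make
 * the PTE writable, otherwise allocate a new huge page, copy the old one
 * into it and switch the PTE over.  Called with page_table_lock held.
 */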
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

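/*
 * Fault in a huge page that has no PTE yet: find it in the page cache or
 * allocate and zero a fresh one (adding it to the cache for shared
 * mappings), then install the PTE, breaking COW early for private write
 * faults.
 */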
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

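/*
 * Top-level huge page fault handler: allocate the huge PTE if necessary,
 * then hand off to hugetlb_no_page() or hugetlb_cow() under the
 * instantiation mutex.
 */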
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

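/*
 * get_user_pages() backend for hugetlb VMAs: walk the requested range,
 * faulting pages in as needed, and fill the pages[]/vmas[] arrays one
 * base page at a time.
 */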
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_MAJOR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

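/*
 * Apply a new protection to every huge PTE in [address, end), unsharing
 * any shared PMDs on the way, and flush the TLB for the range.
 */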
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

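/*
 * Huge page reservations are tracked as a list of file_region ranges (in
 * units of huge pages) on the inode's i_mapping->private_list.
 * region_chg() reports how many pages a new reservation would add,
 * region_add() commits it, and region_truncate() drops reservations past
 * a given offset.
 */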
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

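/*
 * Charge "delta" huge pages against the pool, failing unless the
 * reservation can be backed entirely by currently free huge pages.
 */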
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

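/*
 * Reserve enough huge pages to back the file range [from, to) so that
 * later faults on the mapping cannot fail for lack of free huge pages.
 */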
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * a reservation is completely rubbish in the presence of cpusets
	 * because it is not checked against page availability for the
	 * current cpuset. An application can still be OOM'ed by the kernel
	 * for lack of free huge pages in the cpuset that the task is in.
	 * Attempting to enforce strict accounting with cpusets is almost
	 * impossible (or too ugly), because cpusets are so fluid that
	 * tasks or memory nodes can be dynamically moved between them.
	 *
	 * The change of semantics for shared hugetlb mappings with cpusets
	 * is undesirable. However, in order to preserve some of the
	 * semantics, we fall back to checking against current free page
	 * availability as a best attempt, hopefully minimizing the impact
	 * of the semantic changes that cpusets introduce.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}