/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

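/*
 * Take a free huge page from the first node in the VMA's huge page zonelist
 * that is allowed by the current cpuset and has pages on its free list.
 * Must be called with hugetlb_lock held.
 */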
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

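/*
 * Allocate a fresh huge page from the buddy allocator, spreading allocations
 * round-robin across the online nodes, and release it into the huge page
 * free lists.  Returns 1 on success, 0 on failure.
 */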
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = next_node(nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

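/*
 * Hand out a free huge page for a mapping.  Shared (VM_MAYSHARE) mappings
 * consume one of the pages reserved for them; private mappings may only
 * take pages that are not needed to back existing reservations.
 */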
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

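/* Sum a per-node counter over the nodes allowed by the current cpuset. */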
static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
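/*
 * Return a huge page to the buddy allocator: drop it from the pool counters,
 * clear the hugepage-specific page state and free the HUGETLB_PAGE_ORDER
 * block.
 */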
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;	/* reset the compound page destructor */
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

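/*
 * Build the huge PTE for a page: writable mappings get a dirty, writable
 * entry, everything else a write-protected one.  The entry is marked young
 * and huge.
 */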
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
		lazy_mmu_prot_update(entry);
	}
}


int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by the per-file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for a valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

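/*
 * Handle a copy-on-write fault on a huge page.  If nobody else holds a
 * reference to the old page it is simply made writable; otherwise a new huge
 * page is allocated, the contents are copied (with page_table_lock dropped
 * around the copy) and the PTE is replaced if it has not changed meanwhile.
 */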
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

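/*
 * Fault in a huge page that has no PTE yet: find it in the backing file's
 * page cache, or allocate and zero a fresh huge page (adding it to the page
 * cache for shared mappings), then install the huge PTE under
 * page_table_lock.
 */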
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

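/*
 * Apply a new page protection to every huge PTE mapped in [address, end)
 * and flush the TLB for the range.
 */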
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

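/*
 * Huge page reservations are tracked as a list of [from, to) file regions
 * hanging off the inode's mapping->private_list.  region_chg() reports how
 * many extra pages a new reservation would need, region_add() records the
 * reservation, and region_truncate() drops reservations beyond a given
 * offset.
 */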
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher, then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpusets are configured, they break the strict hugetlb page
	 * reservation because the accounting is done on a global variable.
	 * Such a reservation is of little use with cpusets, since it is never
	 * checked against the page availability of the current task's cpuset:
	 * the application can still be OOM-killed by the kernel if the cpuset
	 * it runs in has no free huge pages left. Enforcing strict accounting
	 * per cpuset is next to impossible (or too ugly), because cpusets are
	 * fluid - tasks and memory nodes can be moved between cpusets at any
	 * time.
	 *
	 * This change of semantics for shared hugetlb mappings with cpusets is
	 * undesirable. However, in order to preserve some of the semantics, we
	 * fall back to checking against the current free page availability as
	 * a best attempt, hopefully minimizing the impact of the semantic
	 * change that cpusets introduce.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}