pgtable.c 19.4 KB
Newer Older
1
/*
2
 *    Copyright IBM Corp. 2007,2009
3
4
5
6
7
8
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
9
#include <linux/gfp.h>
10
11
12
13
14
15
16
17
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
18
#include <linux/rcupdate.h>
19
#include <linux/slab.h>
20
21
22
23
24
25

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
26
#include <asm/mmu_context.h>
27
28
29

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
30
#define FRAG_MASK	0x0f
31
32
#else
#define ALLOC_ORDER	2
33
#define FRAG_MASK	0x03
34
35
#endif

36
37
38
39
40
41
42
43
44
45
46
47
unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

48
unsigned long *crst_table_alloc(struct mm_struct *mm)
49
50
51
52
53
54
55
56
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

57
58
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
59
	free_pages((unsigned long) table, ALLOC_ORDER);
60
61
}

62
63
64
65
66
67
68
69
#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
70
	table = crst_table_alloc(mm);
71
72
	if (!table)
		return -ENOMEM;
73
	spin_lock_bh(&mm->page_table_lock);
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
92
		mm->task_size = mm->context.asce_limit;
93
94
		table = NULL;
	}
95
	spin_unlock_bh(&mm->page_table_lock);
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
130
		mm->task_size = mm->context.asce_limit;
131
132
133
134
135
136
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

137
138
139
140
141
142
143
144
145
#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
146
{
147
148
149
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
150

151
152
153
154
155
156
157
158
159
160
161
162
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
163
164
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
165
166
167
168
169
170
171
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
172
}
173
EXPORT_SYMBOL_GPL(gmap_alloc);
174

175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
208
 */
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;


	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

static int gmap_alloc_table(struct gmap *gmap,
			       unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	down_read(&gmap->mm->mmap_sem);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	up_read(&gmap->mm->mmap_sem);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @addr: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
306
			goto out;
307
308
309
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
310
			goto out;
311
312
313
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
314
			goto out;
315
316
317
318
319
320
321
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
322
out:
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_mmap_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 *
 * Returns 0 if the mmap succeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;

}
EXPORT_SYMBOL_GPL(gmap_fault);

void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
483
484
485
{
	struct page *page;
	unsigned long *table;
486
	struct gmap_pgtable *mp;
487
488
489
490

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
491
492
493
494
495
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
496
	pgtable_page_ctor(page);
497
498
499
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
500
501
502
503
504
505
506
507
508
509
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
510
	struct gmap_pgtable *mp;
511
512

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
513
514
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
515
516
	pgtable_page_ctor(page);
	atomic_set(&page->_mapcount, -1);
517
	kfree(mp);
518
519
520
	__free_page(page);
}

521
522
523
524
525
#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
526
	return NULL;
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
					  unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
555
{
556
	struct page *page;
557
	unsigned long *table;
558
	unsigned int mask, bit;
559

560
	if (mm_has_pgste(mm))
561
		return page_table_alloc_pgste(mm, vmaddr);
562
	/* Allocate fragments of a 4K page as 1K/2K page table */
563
	spin_lock_bh(&mm->context.list_lock);
564
	mask = FRAG_MASK;
565
566
567
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
568
569
570
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
571
	}
572
	if ((mask & FRAG_MASK) == FRAG_MASK) {
573
		spin_unlock_bh(&mm->context.list_lock);
574
575
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
576
			return NULL;
577
		pgtable_page_ctor(page);
578
		atomic_set(&page->_mapcount, 1);
579
		table = (unsigned long *) page_to_phys(page);
580
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
581
		spin_lock_bh(&mm->context.list_lock);
582
		list_add(&page->lru, &mm->context.pgtable_list);
583
584
585
586
587
588
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
589
	}
590
	spin_unlock_bh(&mm->context.list_lock);
591
592
593
	return table;
}

594
void page_table_free(struct mm_struct *mm, unsigned long *table)
595
596
{
	struct page *page;
597
	unsigned int bit, mask;
598

599
600
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
601
		return page_table_free_pgste(table);
602
	}
603
	/* Free 1K/2K page table fragment of a 4K page */
604
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
605
606
607
608
609
610
611
612
613
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
614
		pgtable_page_dtor(page);
615
		atomic_set(&page->_mapcount, -1);
616
617
618
619
		__free_page(page);
	}
}

620
621
622
#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static void __page_table_free_rcu(void *table, unsigned bit)
623
{
624
	struct page *page;
625

626
627
628
	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
629
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
630
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
631
		pgtable_page_dtor(page);
632
		atomic_set(&page->_mapcount, -1);
633
634
635
		__free_page(page);
	}
}
636

637
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
638
{
639
	struct mm_struct *mm;
640
	struct page *page;
641
	unsigned int bit, mask;
642

643
644
	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
645
		gmap_unmap_notifier(mm, table);
646
647
648
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
649
	}
650
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
651
652
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
653
654
655
656
657
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
658
	spin_unlock_bh(&mm->context.list_lock);
659
660
661
662
663
664
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
665
666
667
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;
668
669
670
671
672

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
673
674
}

675
676
#endif

677
678
679
680
681
682
/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
683
	struct mm_struct *mm, *old_mm;
684

685
	/* Do we have switched amode? If no, we cannot do sie */
686
	if (user_mode == HOME_SPACE_MODE)
687
688
		return -EINVAL;

689
	/* Do we have pgstes? if yes, we are done */
690
	if (mm_has_pgste(tsk->mm))
691
		return 0;
692

693
694
	/* lets check if we are allowed to replace the mm */
	task_lock(tsk);
695
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
696
697
698
699
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
700
701
702
703
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);
704

705
706
	/* we copy the mm and let dup_mm create the page tables with_pgstes */
	tsk->mm->context.alloc_pgste = 1;
707
	mm = dup_mm(tsk);
708
	tsk->mm->context.alloc_pgste = 0;
709
	if (!mm)
710
711
		return -ENOMEM;

712
	/* Now lets check again if something happened */
713
714
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
715
716
717
718
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
719
720
721
722
723
724
725
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
726
727
728
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
729
730
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
731
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
732
733
	preempt_enable();
	task_unlock(tsk);
734
735
	mmput(old_mm);
	return 0;
736
737
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
738

739
#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
740
741
742
743
744
745
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
746
747
748
749
750
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
751
752
	return cc == 0;
}
753
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */