/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

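/* Handle the "vmalloc=<size>" early parameter to resize the vmalloc area. */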
static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

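/* Allocate a region or segment (crst) table: 2^ALLOC_ORDER pages. */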
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
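/*
 * Upgrade the address space of an mm one region table level at a time
 * until the requested limit is reached: a new top level table is
 * allocated, the old top level linked below it, and the asce switched
 * to the larger address space.
 */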
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
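	/* Free the unused table if another cpu raised the limit first. */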
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

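/*
 * Downgrade the address space of an mm to a lower region table level
 * and free the no longer needed upper level tables.
 */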
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

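/*
 * Typical gmap life cycle (illustrative sketch, not lifted from a
 * real caller):
 *
 *	gmap = gmap_alloc(mm);
 *	gmap_map_segment(gmap, from, to, len);
 *	gmap_enable(gmap);
 *	...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */
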
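/*
 * Remove a segment table entry from the guest address space and drop
 * the rmap that links it to the parent page table. Returns 1 if a
 * valid entry was invalidated and a TLB flush is needed, 0 otherwise.
 */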
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}

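/* Flush the TLB entries of the guest address space, by IDTE if available. */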
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	/* Load primary space page table origin. */
	S390_lowcore.user_asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
				 _ASCE_USER_BITS | __pa(gmap->table);
	asm volatile("lctlg 1,1,%0\n" : : "m" (S390_lowcore.user_asce) );
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	/* Load primary space page table origin. */
	S390_lowcore.user_asce =
		gmap->mm->context.asce_bits | __pa(gmap->mm->pgd);
	asm volatile("lctlg 1,1,%0\n" : : "m" (S390_lowcore.user_asce) );
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

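/*
 * Allocate and initialize a new lower level table and link it into
 * the given gmap table entry, unless another cpu has installed one
 * in the meantime.
 */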
static int gmap_alloc_table(struct gmap *gmap,
			       unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	down_read(&gmap->mm->mmap_sem);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	up_read(&gmap->mm->mmap_sem);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

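/**
 * gmap_fault - resolve a fault on a guest address
 * @address: guest address in the gmap address space
 * @gmap: pointer to the guest address space structure
 *
 * Returns the corresponding address in the parent mm on success,
 * -EFAULT if the address is not mapped, or -ENOMEM on allocation
 * failure.
 */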
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, vmaddr, segment;
	struct mm_struct *mm;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct vm_area_struct *vma;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	current->thread.gmap_addr = address;
	mm = gmap->mm;
	/* Walk the gmap address space page table */
	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return -EFAULT;
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);

	/* Convert the gmap address to an mm address. */
	segment = *table;
	if (likely(!(segment & _SEGMENT_ENTRY_INV))) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		vma = find_vma(mm, vmaddr);
		if (!vma || vma->vm_start > vmaddr)
			return -EFAULT;

		/* Walk the parent mm page table */
		pgd = pgd_offset(mm, vmaddr);
		pud = pud_alloc(mm, pgd, vmaddr);
		if (!pud)
			return -ENOMEM;
		pmd = pmd_alloc(mm, pud, vmaddr);
		if (!pmd)
			return -ENOMEM;
		if (!pmd_present(*pmd) &&
		    __pte_alloc(mm, vma, pmd, vmaddr))
			return -ENOMEM;
		/* pmd now points to a valid segment table entry. */
		rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
		if (!rmap)
			return -ENOMEM;
		/* Link gmap segment table entry location to page table. */
		page = pmd_page(*pmd);
		mp = (struct gmap_pgtable *) page->index;
		rmap->entry = table;
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*table = pmd_val(*pmd) & PAGE_MASK;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(gmap_fault);

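/*
 * Called when a page table is removed from the parent mm: invalidate
 * every gmap segment table entry that refers to it and flush the TLB.
 */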
void gmap_unmap_notifier(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

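/*
 * Allocate a full 4K page table with pgstes: the lower half of the
 * page holds the 256 pte entries, the upper half the corresponding
 * page status table entries (pgstes). _mapcount is set to 3 to mark
 * both fragments as used, so the page never joins the fragment list.
 */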
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unmap_notifier(struct mm_struct *mm,
					  unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

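/*
 * Atomically xor bits into *v with a cmpxchg loop and return the new
 * value. Used to flip the fragment bits kept in page->_mapcount.
 */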
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
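/*
 * A 4K page holds up to four 1K page tables (31 bit) or two 2K page
 * tables (64 bit). The low bits of page->_mapcount track which
 * fragments are in use; pages with a free fragment are kept on
 * mm->context.pgtable_list.
 */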
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

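/*
 * Defer the freeing of a page table until after the RCU grace period.
 * The pending fragment is encoded in the low bits of the table
 * address handed to tlb_remove_table: FRAG_MASK marks a pgste page
 * table, bit << 4 a 1K/2K fragment, 0 a crst table.
 * __tlb_remove_table decodes this again once the grace period is over.
 */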
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_unmap_notifier(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long) _table & PAGE_MASK);
	unsigned type = (unsigned long) _table & ~PAGE_MASK;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

#endif

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If not, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
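	/* LRA sets cc 0 if the address translates, i.e. the page is present. */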
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */