/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>		/* test_thread_flag(), ...	*/
#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
#include <linux/module.h>		/* search_exception_tables	*/
#include <linux/bootmem.h>		/* max_low_pfn			*/
#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
#include <linux/perf_event.h>		/* perf_sw_event		*/
#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
#include <linux/prefetch.h>		/* prefetchw			*/
#include <linux/context_tracking.h>	/* exception_enter(), ...	*/
#include <linux/uaccess.h>		/* faulthandler_disabled()	*/

#include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
#include <asm/traps.h>			/* dotraplinkage, ...		*/
#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
#include <asm/vsyscall.h>		/* emulate_vsyscall		*/
#include <asm/vm86.h>			/* struct vm86			*/
#include <asm/mmu_context.h>		/* vma_pkey()			*/

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Page fault error code bits:
 *
 *   bit 0 ==	 0: no page found	1: protection fault
 *   bit 1 ==	 0: read access		1: write access
 *   bit 2 ==	 0: kernel-mode access	1: user-mode access
 *   bit 3 ==				1: use of reserved bit detected
 *   bit 4 ==				1: fault was an instruction fetch
 *   bit 5 ==				1: protection keys block access
 */
enum x86_pf_error_code {

	PF_PROT		=		1 << 0,
	PF_WRITE	=		1 << 1,
	PF_USER		=		1 << 2,
	PF_RSVD		=		1 << 3,
	PF_INSTR	=		1 << 4,
	PF_PK		=		1 << 5,
};

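/*
 * Example (illustrative, not from the original source): a user-mode write
 * to a present but read-only page arrives with PF_PROT|PF_WRITE|PF_USER
 * (0x7), while a user-mode read of an unmapped page arrives with just
 * PF_USER (0x4).
 */
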
/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}

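/*
 * A note on the hard-coded 14 above: it is the #PF vector (X86_TRAP_PF).
 * A registered kprobe's fault handler gets a chance to resolve the fault
 * before the normal page-fault path runs.
 */
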
/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
		return 0;

	while (instr < max_instr) {
		unsigned char opcode;

		if (probe_kernel_address(instr, opcode))
			break;

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	return prefetch;
}

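/*
 * For illustration (byte values from the x86 opcode map): the prefetch
 * instructions this code looks for encode as 0F 0D /r (3DNow!
 * PREFETCH/PREFETCHW) and 0F 18 /0..3 (PREFETCHNTA/T0/T1/T2); e.g.
 * "0f 0d 08" is prefetchw (%rax). A spurious #PF whose decoded opcode
 * matches one of these is ignored rather than delivered.
 */
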
/*
 * A protection key fault means that the PKRU value did not allow
 * access to some PTE.  Userspace can figure out what PKRU was
 * from the XSAVE state, and this function fills out a field in
 * siginfo so userspace can discover which protection key was set
 * on the PTE.
 *
 * If we get here, we know that the hardware signaled a PF_PK
 * fault and that there was a VMA once we got in the fault
 * handler.  It does *not* guarantee that the VMA we find here
 * was the one that we faulted on.
 *
 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
 * 2. T1   : set PKRU to deny access to pkey=4, touches page
 * 3. T1   : faults...
 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
 * 5. T1   : enters fault handler, takes mmap_sem, etc...
 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
 *	     faulted on a pte with its pkey=4.
 */
static void fill_sig_info_pkey(int si_code, siginfo_t *info,
		struct vm_area_struct *vma)
{
	/* This is effectively an #ifdef */
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return;

	/* Fault not from Protection Keys: nothing to do */
	if (si_code != SEGV_PKUERR)
		return;
	/*
	 * force_sig_info_fault() is called from a number of
	 * contexts, some of which have a VMA and some of which
	 * do not.  The PF_PK handling happens after we have a
	 * valid VMA, so we should never reach this without a
	 * valid VMA.
	 */
	if (!vma) {
		WARN_ONCE(1, "PKU fault with no VMA passed in");
		info->si_pkey = 0;
		return;
	}
	/*
	 * si_pkey should be thought of as a strong hint, but not
	 * absolutely guaranteed to be 100% accurate because of
	 * the race explained above.
	 */
	info->si_pkey = vma_pkey(vma);
}

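/*
 * Illustrative sketch (not part of this file): a userspace SIGSEGV
 * handler, built with a libc that exposes si_pkey, could consume the
 * field filled in above roughly like so:
 *
 *	static void segv_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == SEGV_PKUERR)
 *			fprintf(stderr, "access blocked by pkey %d\n",
 *				(int)si->si_pkey);
 *	}
 */
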
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
		     struct task_struct *tsk, struct vm_area_struct *vma,
		     int fault)
{
	unsigned lsb = 0;
	siginfo_t info;

	info.si_signo	= si_signo;
	info.si_errno	= 0;
	info.si_code	= si_code;
	info.si_addr	= (void __user *)address;
	if (fault & VM_FAULT_HWPOISON_LARGE)
		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
	if (fault & VM_FAULT_HWPOISON)
		lsb = PAGE_SHIFT;
	info.si_addr_lsb = lsb;

	fill_sig_info_pkey(si_code, &info, vma);

	force_sig_info(si_signo, &info, tsk);
}

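/*
 * Worked example for si_addr_lsb above: for a poisoned 4K page, lsb is
 * PAGE_SHIFT (12); for a poisoned 2M huge page, hstate_index_to_shift()
 * yields 21. Userspace can thus tell how much of the address range
 * around si_addr was taken out by the memory failure.
 */
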
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */
	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;

	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

	return pmd_k;
}

void vmalloc_sync_all(void)
{
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;
			pmd_t *ret;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			ret = vmalloc_sync_one(page_address(page), address);
			spin_unlock(pgt_lock);

			if (!ret)
				break;
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_huge(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

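/*
 * Illustrative scenario for the vmalloc_fault() path above: vmalloc()
 * installs new mappings only in init_mm.pgd. If an interrupt handler
 * then touches that mapping while running on a process page table that
 * was allocated earlier, the first access faults and vmalloc_fault()
 * copies the missing PMD entry from the reference page table.
 */
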
/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.vm86->screen_bitmap |= 1 << bit;
#endif
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3());
	pgd_t *pgd = &base[pgd_index(address)];
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#endif
	pmd = pmd_offset(pud_offset(pgd, address), address);
	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	printk("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	WARN_ON_ONCE(in_nmi());

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush:
	 */
	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;

	if (pgd_none(*pgd)) {
		set_pgd(pgd, *pgd_ref);
		arch_flush_lazy_mmu_mode();
	} else {
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
	}

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared:
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;

	if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
		BUG();

	if (pud_huge(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;

	if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
		BUG();

	if (pmd_huge(*pmd))
		return 0;

	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;

	pte = pte_offset_kernel(pmd, address);

	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that:
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd_t *pgd = base + pgd_index(address);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	printk("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (bad_address(pud))
		goto bad;

	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	printk("PTE %lx", pte_val(*pte));
out:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64-bit RIP register on C stepping K8.
 *
 * Many BIOSes that weren't tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here.
 *
 * Note that we only handle kernel faults here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;

	/*
	 * Pentium F0 0F C7 C8 bug workaround:
	 */
	if (boot_cpu_has_bug(X86_BUG_F00F)) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
static const char smep_warning[] = KERN_CRIT
"unable to execute userspace code (SMEP?) (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
				(pgd_flags(*pgd) & _PAGE_USER) &&
				(__read_cr4() & X86_CR4_SMEP))
			printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
	}

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip);

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2		= address;
	tsk->thread.trap_nr	= X86_TRAP_PF;
	tsk->thread.error_code	= error_code;

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int signal, int si_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	int sig;
	/* No context means no VMA to pass down */
	struct vm_area_struct *vma = NULL;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to a faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			tsk->thread.trap_nr = X86_TRAP_PF;
			tsk->thread.error_code = error_code | PF_USER;
			tsk->thread.cr2 = address;

			/* XXX: hwpoison faults will set the wrong code. */
			force_sig_info_fault(signal, si_code, address,
					     tsk, vma, 0);
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch fixup_exception would have
	 *   handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(tsk))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2		= address;
	tsk->thread.trap_nr	= X86_TRAP_PF;
	tsk->thread.error_code	= error_code;

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

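/*
 * Example of the fixup_exception() path in no_context() above: a
 * get_user() on a bad user pointer faults in kernel mode, the faulting
 * instruction's address is found in __ex_table, and execution resumes
 * at its fixup, which makes get_user() return -EFAULT instead of
 * oopsing.
 */
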
/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, struct vm_area_struct *vma,
		       int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here:
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space:
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

#ifdef CONFIG_X86_64
		/*
		 * Instruction fetch faults in the vsyscall page might need
		 * emulation.
		 */
		if (unlikely((error_code & PF_INSTR) &&
			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
			if (emulate_vsyscall(regs, address))
				return;
		}
#endif

		/*
		 * To avoid leaking information about the kernel page table
		 * layout, pretend that user-mode accesses to kernel addresses
		 * are always protection faults.
		 */
		if (address >= TASK_SIZE_MAX)
			error_code |= PF_PROT;

		if (likely(show_unhandled_signals))
			show_signal_msg(regs, error_code, address, tsk);

		tsk->thread.cr2		= address;
		tsk->thread.error_code	= error_code;
		tsk->thread.trap_nr	= X86_TRAP_PF;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);

		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address, SIGSEGV, si_code);
}

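/*
 * The si_code values used below, for reference: SEGV_MAPERR means the
 * address had no mapping at all (e.g. a NULL dereference), SEGV_ACCERR
 * means a mapping exists but its permissions forbid the access (e.g. a
 * write to a PROT_READ region), and SEGV_PKUERR means a protection key
 * blocked it.
 */
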
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address, struct vm_area_struct *vma)
{
	__bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, struct vm_area_struct *vma, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
}

static inline bool bad_area_access_from_pkeys(unsigned long error_code,
		struct vm_area_struct *vma)
{
	/* This code is always called on the current mm */
	bool foreign = false;

	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return false;
	if (error_code & PF_PK)
		return true;
	/* this checks permission keys on the VMA: */
	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
				(error_code & PF_INSTR), foreign))
		return true;
	return false;
}

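/*
 * Example (illustrative): a thread that sets the PKRU access-disable
 * bit for pkey 4 and then reads a page mapped with that key takes a
 * fault with PF_PK set, which the check above turns into SEGV_PKUERR
 * rather than SEGV_ACCERR.
 */
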
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address, struct vm_area_struct *vma)
{
	/*
	 * This OSPKE check is not strictly necessary at runtime.
	 * But, doing it this way allows compiler optimizations
	 * if pkeys are compiled out.
	 */
	if (bad_area_access_from_pkeys(error_code, vma))
		__bad_area(regs, error_code, address, vma, SEGV_PKUERR);
	else
		__bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  struct vm_area_struct *vma, unsigned int fault)
{
	struct task_struct *tsk = current;
	int code = BUS_ADRERR;

	/* Kernel mode? Handle exceptions or die: */
	if (!(error_code & PF_USER)) {
		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	tsk->thread.cr2		= address;
	tsk->thread.error_code	= error_code;
	tsk->thread.trap_nr	= X86_TRAP_PF;

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
		printk(KERN_ERR
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		code = BUS_MCEERR_AR;
	}
#endif
	force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
}

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
	       unsigned long address, struct vm_area_struct *vma,
	       unsigned int fault)
{
	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
		no_context(regs, error_code, address, 0, 0);
		return;
	}

	if (fault & VM_FAULT_OOM) {
		/* Kernel mode? Handle exceptions or die: */
		if (!(error_code & PF_USER)) {
			no_context(regs, error_code, address,
				   SIGSEGV, SEGV_MAPERR);
			return;
		}

		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we
		 * got oom-killed):
		 */
		pagefault_out_of_memory();
	} else {