/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/jump_label.h>

#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>

#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define mce_log_get_idx_check(p) \
({ \
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
			 !lockdep_is_held(&mce_chrdev_read_mutex), \
			 "suspicious mce_log_get_idx_check() usage"); \
	smp_load_acquire(&(p)); \
})

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT		100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

/* User mode helper program triggered by machine check event */
static unsigned long		mce_need_notify;
static char			mce_helper[128];
static char			*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int			cpu_missing;

/*
 * MCA banks polled by the period polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	m->tsc = rdtsc();
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

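/*
 * mce_log() feeds each record to two consumers: the trace event and
 * gen-pool path used by in-kernel notifiers, and the legacy mcelog
 * ring buffer read through /dev/mcelog. The ring slot is claimed with
 * a lockless cmpxchg on mcelog.next, so this is safe from MCE context.
 */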
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	if (!mce_gen_pool_add(mce))
		irq_work_queue(&mce_irq_work);

	wmb();
	for (;;) {
		entry = mce_log_get_idx_check(mcelog.next);
		for (;;) {

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &mce_need_notify);
}

void mce_inject_log(struct mce *m)
{
	mutex_lock(&mce_chrdev_read_mutex);
	mce_log(m);
	mutex_unlock(&mce_chrdev_read_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

void mce_register_decode_chain(struct notifier_block *nb)
{
	/* Ensure SRAO notifier has the highest priority in the decode chain. */
	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
		nb->priority -= 1;

	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

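/*
 * Per-bank MSR address helpers: the plain variants map a bank number to
 * the architectural MCA control/status/addr/misc MSRs, while the smca_*
 * variants map to the AMD Scalable MCA (SMCA) register layout. msr_ops
 * below defaults to the legacy mapping.
 */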
static inline u32 ctl_reg(int bank)
{
	return MSR_IA32_MCx_CTL(bank);
}

static inline u32 status_reg(int bank)
{
	return MSR_IA32_MCx_STATUS(bank);
}

static inline u32 addr_reg(int bank)
{
	return MSR_IA32_MCx_ADDR(bank);
}

static inline u32 misc_reg(int bank)
{
	return MSR_IA32_MCx_MISC(bank);
}

static inline u32 smca_ctl_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_CTL(bank);
}

static inline u32 smca_status_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_STATUS(bank);
}

static inline u32 smca_addr_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_ADDR(bank);
}

static inline u32 smca_misc_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_MISC(bank);
}

struct mca_msr_regs msr_ops = {
	.ctl	= ctl_reg,
	.status	= status_reg,
	.addr	= addr_reg,
	.misc	= misc_reg
};

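/*
 * Dump a single MCE record to the console and hand it to the registered
 * decoders. The field layout is parsed by external tools such as mcelog,
 * so existing fields must stay stable.
 */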
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
				m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	if (mce_flags.smca) {
		if (m->synd)
			pr_cont("SYND %llx ", m->synd);
		if (m->ipid)
			pr_cont("IPID %llx ", m->ipid);
	}

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error
	 * (if the CPU has an implementation for that).
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking: machine check CPU died");
}

static void mce_panic(const char *msg, struct mce *final, char *exp)
{
	int apei_err = 0;
	struct llist_node *pending;
	struct mce_evt_llist *l;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_panicked) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_panicked) > 1)
			return;
	}
	pending = mce_gen_pool_prepare_records();
	/* First print corrected ones that are still unlogged */
	llist_for_each_entry(l, pending, llnode) {
		struct mce *m = &l->mce;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	llist_for_each_entry(l, pending, llnode) {
		struct mce *m = &l->mce;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || mce_cmp(m, final)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == msr_ops.status(bank))
		return offsetof(struct mce, status);
	if (msr == msr_ops.addr(bank))
		return offsetof(struct mce, addr);
	if (msr == msr_ops.misc(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_gen_pool_empty() && keventd_up())
		schedule_work(&mce_work);
}

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&mce_irq_work);
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;

	/* Checks after this one are Intel-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 1;

	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;
	return 1;
}

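/*
 * SRAO (Software Recoverable Action Optional) notifier: for action-optional
 * memory errors that carry a usable address, poison the affected page via
 * memory_failure() outside of the machine check exception itself.
 */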
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned long pfn;

	if (!mce)
		return NOTIFY_DONE;

	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
		pfn = mce->addr >> PAGE_SHIFT;
		memory_failure(pfn, MCE_VECTOR, 0);
	}

	return NOTIFY_OK;
}
static struct notifier_block mce_srao_nb = {
	.notifier_call	= srao_decode_notifier,
	.priority = INT_MAX,
};

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(msr_ops.misc(i));

	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(msr_ops.addr(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}

		/*
		 * Extract [55:<lsb>] where lsb is the least significant
		 * *valid* bit of the address bits.
		 */
		if (mce_flags.smca) {
			u8 lsb = (m->addr >> 56) & 0x3f;

			m->addr &= GENMASK_ULL(55, lsb);
		}
	}

	if (mce_flags.smca) {
		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

		if (m->status & MCI_STATUS_SYNDV)
			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
	}
}

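/*
 * Decide whether an MCE record describes a memory error, using the
 * vendor-specific encoding of the MCi_STATUS error code (extended error
 * code on AMD, compound MCACOD patterns on Intel).
 */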
static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;

		return (xec == 0x0 || xec == 0x8);
	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just gives more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;
	}

	return false;
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However, this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In that case it is likely it will
 * not fully execute the machine check handler either.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	bool error_seen = false;
	struct mce m;
	int severity;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(msr_ops.status(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;


		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		error_seen = true;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;

		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
			if (m.status & MCI_STATUS_ADDRV)
				m.severity = severity;

		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);
		else if (mce_usable_address(&m)) {
			/*
			 * Although we skipped logging this, we still want
			 * to take action. Add to the pool so the registered
			 * notifiers will see it.
			 */
			if (!mce_gen_pool_add(&m))
				mce_schedule_work();
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(msr_ops.status(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();

	return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	int i, ret = 0;
	char *tmp;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(msr_ops.status(i));
		if (m->status & MCI_STATUS_VAL) {
			__set_bit(i, validp);
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
		}

		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
			*msg = tmp;
			ret = 1;
		}
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t, const char *msg)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1)
			mce_panic(msg, NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign.  The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, it panics. Only then does it let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are examined.
 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    &nmsg, true);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

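/* Sum of the per-CPU no_way_out votes, gathered while CPUs rendezvous in mce_start(). */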
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * Rely on the implied barrier below, such that global_nwo
	 * is updated before mce_callin.
	 */
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout,
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout,
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU unable to finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU did not finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(msr_ops.status(i), 0);
	}
}

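/*
 * Try to contain an uncorrected memory error by offlining the affected
 * page. MF_MUST_KILL is added when the saved RIP is not valid, since
 * returning to the interrupted context would not be safe.
 */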
static int do_memory_failure(struct mce *m)
{
	int flags = MF_ACTION_REQUIRED;
	int ret;

	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
	if (!(m->mcgstatus & MCG_STATUS_RIPV))
		flags |= MF_MUST_KILL;
	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
	if (ret)
		pr_err("Memory error not recovered");
	return ret;
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However, some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mca_config *cfg = &mca_cfg;
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;

	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order = -1;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";

	/*
	 * MCEs are always local on AMD. Whether they are local on Intel is
	 * determined by MCG_STATUS_LMCES.
	 */
	int lmce = 1;

	/* If this CPU is offline, just bail out. */
	if (cpu_is_offline(smp_processor_id())) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
		if (mcgstatus & MCG_STATUS_RIPV) {
			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
			return;
		}
	}

	ist_enter(regs);

	this_cpu_inc(mce_exception_count);

	if (!cfg->banks)
		goto out;

	mce_gather_info(&m, regs);