/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
Arnd Bergmann's avatar
Arnd Bergmann committed
14
#include <linux/smp_lock.h>
Linus Torvalds's avatar
Linus Torvalds committed
15
16
17
18
19
20
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
21
#include <linux/capability.h>
22
23
#include <linux/cpu.h>
#include <linux/percpu.h>
24
25
#include <linux/poll.h>
#include <linux/thread_info.h>
26
#include <linux/ctype.h>
27
#include <linux/kmod.h>
28
#include <linux/kdebug.h>
29
30
#include <linux/kobject.h>
#include <linux/sysfs.h>
31
#include <linux/ratelimit.h>
32
#include <asm/processor.h>
Linus Torvalds's avatar
Linus Torvalds committed
33
34
35
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
36
#include <asm/smp.h>
37
#include <asm/idle.h>
Linus Torvalds's avatar
Linus Torvalds committed
38
39

#define MISC_MCELOG_MINOR 227
40

41
42
/*
 * Incremented on entry to do_machine_check() and decremented on exit, so a
 * non-zero value means a machine check exception is being handled.
 */
atomic_t mce_entry;

Linus Torvalds's avatar
Linus Torvalds committed
43
44
/*
 * When non-zero, mce_available() reports that machine checks are unavailable.
 * Set by the "nomce" / "mce=off" boot options and when mce_cap_init() fails.
 */
static int mce_dont_init;

45
46
47
48
49
50
51
/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
Linus Torvalds's avatar
Linus Torvalds committed
52
53
static int tolerant = 1;
static int banks;
54
static u64 *bank;
55
static unsigned long notify_user;
56
static int rip_msr;
57
static int mce_bootlog = -1;
58
59
60
61
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };
Linus Torvalds's avatar
Linus Torvalds committed
62

63
64
/* Wait queue used by mce_poll(); woken by mce_notify_user(). */
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

65
66
67
68
69
/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * Every bit is initially set, i.e. all banks are polled by default.
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

70
71
72
73
74
75
76
77
/*
 * Prepare a struct mce for a new event: clear every field, then record
 * the current time stamp counter and the CPU this code is running on.
 */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(*m));
	rdtscll(m->tsc);
	m->cpu = smp_processor_id();
}

Linus Torvalds's avatar
Linus Torvalds committed
78
79
80
81
82
83
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

84
static struct mce_log mcelog = {
Linus Torvalds's avatar
Linus Torvalds committed
85
86
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
87
};
Linus Torvalds's avatar
Linus Torvalds committed
88
89
90
91

void mce_log(struct mce *mce)
{
	unsigned next, entry;
92
	atomic_inc(&mce_events);
Linus Torvalds's avatar
Linus Torvalds committed
93
	mce->finished = 0;
Mike Waychison's avatar
Mike Waychison committed
94
	wmb();
Linus Torvalds's avatar
Linus Torvalds committed
95
96
	for (;;) {
		entry = rcu_dereference(mcelog.next);
97
98
99
100
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
101
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102
103
104
105
106
107
108
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
Mike Waychison's avatar
Mike Waychison committed
109
			break;
Linus Torvalds's avatar
Linus Torvalds committed
110
111
112
113
114
115
116
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
Mike Waychison's avatar
Mike Waychison committed
117
	wmb();
Linus Torvalds's avatar
Linus Torvalds committed
118
	mcelog.entry[entry].finished = 1;
Mike Waychison's avatar
Mike Waychison committed
119
	wmb();
Linus Torvalds's avatar
Linus Torvalds committed
120

121
	set_bit(0, &notify_user);
Linus Torvalds's avatar
Linus Torvalds committed
122
123
124
125
126
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
127
	       KERN_EMERG "HARDWARE ERROR\n"
Linus Torvalds's avatar
Linus Torvalds committed
128
129
130
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
131
	if (m->ip) {
132
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
Linus Torvalds's avatar
Linus Torvalds committed
133
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134
		       m->cs, m->ip);
Linus Torvalds's avatar
Linus Torvalds committed
135
		if (m->cs == __KERNEL_CS)
136
			print_symbol("{%s}", m->ip);
Linus Torvalds's avatar
Linus Torvalds committed
137
138
		printk("\n");
	}
139
	printk(KERN_EMERG "TSC %llx ", m->tsc);
Linus Torvalds's avatar
Linus Torvalds committed
140
	if (m->addr)
141
		printk("ADDR %llx ", m->addr);
Linus Torvalds's avatar
Linus Torvalds committed
142
	if (m->misc)
143
		printk("MISC %llx ", m->misc);
Linus Torvalds's avatar
Linus Torvalds committed
144
	printk("\n");
145
	printk(KERN_EMERG "This is not a software problem!\n");
146
147
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
Linus Torvalds's avatar
Linus Torvalds committed
148
149
150
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151
{
Linus Torvalds's avatar
Linus Torvalds committed
152
	int i;
153

Linus Torvalds's avatar
Linus Torvalds committed
154
155
156
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
157

Linus Torvalds's avatar
Linus Torvalds committed
158
159
		if (time_before(tsc, start))
			continue;
160
		print_mce(&mcelog.entry[i]);
Linus Torvalds's avatar
Linus Torvalds committed
161
162
163
164
165
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
166
	panic(msg);
167
}
Linus Torvalds's avatar
Linus Torvalds committed
168

Andi Kleen's avatar
Andi Kleen committed
169
int mce_available(struct cpuinfo_x86 *c)
Linus Torvalds's avatar
Linus Torvalds committed
170
{
171
172
	if (mce_dont_init)
		return 0;
173
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
Linus Torvalds's avatar
Linus Torvalds committed
174
175
}

176
177
178
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179
		m->ip = regs->ip;
180
181
		m->cs = regs->cs;
	} else {
182
		m->ip = 0;
183
184
185
186
187
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
188
		rdmsrl(rip_msr, m->ip);
189
190
191
192
		m->cs = 0;
	}
}

193
/*
194
195
196
197
198
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
199
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200
201
202
203
204
205
206
207
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
208
		if (!bank[i] || !test_bit(i, *b))
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
Andi Kleen's avatar
Andi Kleen committed
242
243
244
245
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
Linus Torvalds's avatar
Linus Torvalds committed
266
267
268
269
270
271
272
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
273
274
275
276
277
278
279
280
281
282
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
283
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
Linus Torvalds's avatar
Linus Torvalds committed
284

285
286
	atomic_inc(&mce_entry);

287
	if (notify_die(DIE_NMI, "machine check", regs, error_code,
288
			   18, SIGKILL) == NOTIFY_STOP)
289
290
		goto out2;
	if (!banks)
291
		goto out2;
Linus Torvalds's avatar
Linus Torvalds committed
292

293
294
	mce_setup(&m);

Linus Torvalds's avatar
Linus Torvalds committed
295
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296
	/* if the restart IP is not valid, we're done for */
Linus Torvalds's avatar
Linus Torvalds committed
297
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
298
		no_way_out = 1;
299

Linus Torvalds's avatar
Linus Torvalds committed
300
301
302
303
	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
304
		__clear_bit(i, toclear);
305
		if (!bank[i])
Linus Torvalds's avatar
Linus Torvalds committed
306
			continue;
307
308

		m.misc = 0;
Linus Torvalds's avatar
Linus Torvalds committed
309
310
311
312
313
314
315
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

316
317
318
319
320
321
322
323
324
325
326
327
328
329
		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

Linus Torvalds's avatar
Linus Torvalds committed
330
		if (m.status & MCI_STATUS_EN) {
331
332
333
334
335
336
337
338
339
340
341
342
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
343
344
345
346
347
348
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
Linus Torvalds's avatar
Linus Torvalds committed
349
350
351
352
353
354
355
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

356
		mce_get_rip(&m, regs);
357
		mce_log(&m);
Linus Torvalds's avatar
Linus Torvalds committed
358
359
360
361
362
363
364
365
366
367
368
369
370
371

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
372
373
374
375
376
377

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 *  has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
Linus Torvalds's avatar
Linus Torvalds committed
378
		mce_panic("Machine check", &panicm, mcestart);
379
380
381
382
383
384
385
386

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
Linus Torvalds's avatar
Linus Torvalds committed
387
388
		int user_space = 0;

389
390
391
392
393
		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
394
			user_space = panicm.ip && (panicm.cs & 3);
395
396
397
398
399

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
400
		 * force_sig() takes an awful lot of locks and has a slight
401
402
403
		 * risk of deadlocking.
		 */
		if (user_space) {
404
			force_sig(SIGBUS, current);
405
406
407
408
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
Linus Torvalds's avatar
Linus Torvalds committed
409
410
	}

411
412
413
	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

414
	/* the last thing we do is clear state */
415
416
417
418
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
Linus Torvalds's avatar
Linus Torvalds committed
419
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
420
421
 out2:
	atomic_dec(&mce_entry);
Linus Torvalds's avatar
Linus Torvalds committed
422
423
}

424
425
426
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The record is attributed to the CPU this function runs on (filled in by
 * mce_setup()) and to the pseudo bank MCE_THERMAL_BANK.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

Linus Torvalds's avatar
Linus Torvalds committed
449
/*
450
451
452
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
Linus Torvalds's avatar
Linus Torvalds committed
453
454
455
 */

static int check_interval = 5 * 60; /* 5 minutes */
456
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457
458
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);
Linus Torvalds's avatar
Linus Torvalds committed
459

460
static void mcheck_timer(unsigned long data)
Linus Torvalds's avatar
Linus Torvalds committed
461
{
462
	struct timer_list *t = &per_cpu(mce_timer, data);
463
	int *n;
464
465
466

	WARN_ON(smp_processor_id() != data);

Linus Torvalds's avatar
Linus Torvalds committed
467
	if (mce_available(&current_cpu_data))
468
469
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
Linus Torvalds's avatar
Linus Torvalds committed
470
471

	/*
472
473
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
Linus Torvalds's avatar
Linus Torvalds committed
474
	 */
475
	n = &__get_cpu_var(next_interval);
476
	if (mce_notify_user()) {
477
		*n = max(*n/2, HZ/100);
478
	} else {
479
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480
481
	}

482
	t->expires = jiffies + *n;
483
	add_timer(t);
484
485
}

486
487
488
489
490
491
492
/*
 * Work item handler: start the user mode helper named in 'trigger'
 * without waiting for it (UMH_NO_WAIT). @work is unused.
 */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

/* Work item scheduled by mce_notify_user() to run mce_do_trigger(). */
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

493
/*
494
495
496
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
497
498
499
 */
int mce_notify_user(void)
{
500
501
502
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

503
504
505
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);
506
507
508
509
510
511
512
513

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);
514

515
		if (__ratelimit(&ratelimit))
516
			printk(KERN_INFO "Machine check events logged\n");
517
518

		return 1;
Linus Torvalds's avatar
Linus Torvalds committed
519
	}
520
521
	return 0;
}
522

523
524
525
526
527
528
529
530
531
/*
 * Idle notifier: see if the idle task needs to notify userspace.
 *
 * Calls mce_notify_user() on IDLE_END when the current thread has
 * TIF_MCE_NOTIFY set. @nfb and @junk are unused. Always returns NOTIFY_OK.
 */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

534
535
536
/* Notifier block that hooks mce_idle_callback() into the idle notifier chain. */
static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};
Linus Torvalds's avatar
Linus Torvalds committed
537
538

static __init int periodic_mcheck_init(void)
539
{
540
541
       idle_notifier_register(&mce_idle_notifier);
       return 0;
542
}
Linus Torvalds's avatar
Linus Torvalds committed
543
544
__initcall(periodic_mcheck_init);

545
/*
Linus Torvalds's avatar
Linus Torvalds committed
546
547
 * Initialize Machine Checks for a CPU.
 */
548
static int mce_cap_init(void)
Linus Torvalds's avatar
Linus Torvalds committed
549
550
{
	u64 cap;
551
	unsigned b;
Linus Torvalds's avatar
Linus Torvalds committed
552
553

	rdmsrl(MSR_IA32_MCG_CAP, cap);
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
Linus Torvalds's avatar
Linus Torvalds committed
570
	}
571

572
573
574
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;
Linus Torvalds's avatar
Linus Torvalds committed
575

576
577
578
579
580
581
582
	return 0;
}

static void mce_init(void *dummy)
{
	u64 cap;
	int i;
583
	mce_banks_t all_banks;
584

585
586
587
	/*
	 * Log the machine checks left over from the previous reset.
	 */
588
	bitmap_fill(all_banks, MAX_NR_BANKS);
Andi Kleen's avatar
Andi Kleen committed
589
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
Linus Torvalds's avatar
Linus Torvalds committed
590
591
592

	set_in_cr4(X86_CR4_MCE);

593
	rdmsrl(MSR_IA32_MCG_CAP, cap);
Linus Torvalds's avatar
Linus Torvalds committed
594
595
596
597
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
598
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
Linus Torvalds's avatar
Linus Torvalds committed
599
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
600
	}
Linus Torvalds's avatar
Linus Torvalds committed
601
602
603
}

/* Add per CPU specific workarounds here */
604
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605
{
Linus Torvalds's avatar
Linus Torvalds committed
606
	/* This should be disabled by the BIOS, but isn't always */
607
	if (c->x86_vendor == X86_VENDOR_AMD) {
608
		if (c->x86 == 15 && banks > 4)
609
610
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
611
			clear_bit(10, (unsigned long *)&bank[4]);
612
613
614
615
		if(c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
Linus Torvalds's avatar
Linus Torvalds committed
616
	}
617

618
}
Linus Torvalds's avatar
Linus Torvalds committed
619

620
static void mce_cpu_features(struct cpuinfo_x86 *c)
Linus Torvalds's avatar
Linus Torvalds committed
621
622
623
624
625
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
626
627
628
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
Linus Torvalds's avatar
Linus Torvalds committed
629
630
631
632
633
	default:
		break;
	}
}

634
635
636
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
637
	int *n = &__get_cpu_var(next_interval);
638

639
640
	*n = check_interval * HZ;
	if (!*n)
641
642
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
643
	t->expires = round_jiffies(jiffies + *n);
644
645
646
	add_timer(t);
}

647
/*
Linus Torvalds's avatar
Linus Torvalds committed
648
 * Called for each booted CPU to set up machine checks.
649
 * Must be called with preempt off.
Linus Torvalds's avatar
Linus Torvalds committed
650
 */
651
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
Linus Torvalds's avatar
Linus Torvalds committed
652
{
653
	if (!mce_available(c))
Linus Torvalds's avatar
Linus Torvalds committed
654
655
		return;

656
657
658
659
660
661
	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

Linus Torvalds's avatar
Linus Torvalds committed
662
663
	mce_init(NULL);
	mce_cpu_features(c);
664
	mce_init_timer();
Linus Torvalds's avatar
Linus Torvalds committed
665
666
667
668
669
670
}

/*
 * Character device to read and clear the MCE log.
 */

Tim Hockin's avatar
Tim Hockin committed
671
672
673
674
675
676
/* mce_state_lock is held around updates of open_count and open_exclu. */
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
Arnd Bergmann's avatar
Arnd Bergmann committed
677
	lock_kernel();
Tim Hockin's avatar
Tim Hockin committed
678
679
680
681
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
Arnd Bergmann's avatar
Arnd Bergmann committed
682
		unlock_kernel();
Tim Hockin's avatar
Tim Hockin committed
683
684
685
686
687
688
689
690
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
Arnd Bergmann's avatar
Arnd Bergmann committed
691
	unlock_kernel();
Tim Hockin's avatar
Tim Hockin committed
692

693
	return nonseekable_open(inode, file);
Tim Hockin's avatar
Tim Hockin committed
694
695
696
697
698
699
700
701
702
703
704
705
706
707
}

/*
 * Close /dev/mcelog: drop one opener and clear the exclusive-open marker,
 * both under mce_state_lock. Always returns 0.
 */
static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);
	open_exclu = 0;
	open_count--;
	spin_unlock(&mce_state_lock);

	return 0;
}

708
709
/*
 * Cross-CPU callback: store the current TSC of the executing CPU.
 *
 * @data: array of unsigned long indexed by CPU number.
 */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
Linus Torvalds's avatar
Linus Torvalds committed
714

715
716
/*
 * Read and clear the machine check log.
 *
 * @filp:  unused.
 * @ubuf:  user buffer that receives the records.
 * @usize: size of @ubuf; must hold at least MCE_LOG_LEN records.
 * @off:   file offset; must be 0.
 *
 * Only full reads are supported. Records below mcelog.next are copied out
 * and cleared, retrying while writers keep advancing mcelog.next. A slot
 * that does not become finished within two jiffies is cleared and skipped.
 * Afterwards finished records above the reset position are collected if
 * their TSC is older than the time stamp just sampled on their CPU.
 *
 * Returns the number of bytes copied, -EINVAL for a partial read request,
 * -ENOMEM if the scratch array cannot be allocated, or -EFAULT if any copy
 * to user space failed.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	/*
	 * Zero-initialised: collect_tscs() only fills the slots of the CPUs
	 * it runs on, and the comparison below must not read random memory
	 * for records that name any other CPU.
	 */
	cpu_tsc = kcalloc(nr_cpu_ids, sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/* Wait briefly for a writer that claimed the slot. */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the log unless writers appended more records. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

788
789
790
791
792
793
794
795
/*
 * poll() support for /dev/mcelog: register on mce_wait and report the
 * device readable whenever mcelog.next is non-zero.
 */
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	unsigned int mask = 0;

	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		mask = POLLIN | POLLRDNORM;
	return mask;
}

796
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
Linus Torvalds's avatar
Linus Torvalds committed
797
798
{
	int __user *p = (int __user *)arg;
799

Linus Torvalds's avatar
Linus Torvalds committed
800
	if (!capable(CAP_SYS_ADMIN))
801
		return -EPERM;
Linus Torvalds's avatar
Linus Torvalds committed
802
	switch (cmd) {
803
	case MCE_GET_RECORD_LEN:
Linus Torvalds's avatar
Linus Torvalds committed
804
805
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
806
		return put_user(MCE_LOG_LEN, p);
Linus Torvalds's avatar
Linus Torvalds committed
807
808
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
809
810

		do {
Linus Torvalds's avatar
Linus Torvalds committed
811
			flags = mcelog.flags;
812
813
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
Linus Torvalds's avatar
Linus Torvalds committed
814
815
	}
	default:
816
817
		return -ENOTTY;
	}
Linus Torvalds's avatar
Linus Torvalds committed
818
819
}

820
static const struct file_operations mce_chrdev_ops = {
Tim Hockin's avatar
Tim Hockin committed
821
822
	.open = mce_open,
	.release = mce_release,
Linus Torvalds's avatar
Linus Torvalds committed
823
	.read = mce_read,
824
	.poll = mce_poll,
825
	.unlocked_ioctl = mce_ioctl,
Linus Torvalds's avatar
Linus Torvalds committed
826
827
828
829
830
831
832
833
};

/* Misc device "mcelog" with minor MISC_MCELOG_MINOR, served by mce_chrdev_ops. */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

834
835
/*
 * Old style boot options parsing. Only for compatibility.
 *
 * Handler for the "nomce" option: disables machine check setup.
 * @str is unused. Returns 1.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

843
/* mce=off disables machine check.
844
   mce=TOLERANCELEVEL (number, see above)
845
846
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
Linus Torvalds's avatar
Linus Torvalds committed
847
848
849
850
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
851
852
	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
		mce_bootlog = str[0] == 'b';
853
854
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
Linus Torvalds's avatar
Linus Torvalds committed
855
	else
856
		printk("mce= argument %s ignored. Please use /sys", str);
857
	return 1;
Linus Torvalds's avatar
Linus Torvalds committed
858
859
860
}

__setup("nomce", mcheck_disable);
861
__setup("mce=", mcheck_enable);
Linus Torvalds's avatar
Linus Torvalds committed
862

863
/*
 * Sysfs support
 */
Linus Torvalds's avatar
Linus Torvalds committed
866

867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
/*
 * Turn machine check reporting off by zeroing the control register of
 * every bank. Used on suspend and shutdown, because machine checks can't
 * really be handled later. Always returns 0.
 */
static int mce_disable(void)
{
	int nr = 0;

	while (nr < banks) {
		wrmsrl(MSR_IA32_MC0_CTL + nr*4, 0);
		nr++;
	}
	return 0;
}

/* sysdev suspend hook: disable all banks. @dev and @state are unused. */
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

/* sysdev shutdown hook: disable all banks. @dev is unused. */
static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

890
891
892
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
Linus Torvalds's avatar
Linus Torvalds committed
893
894
static int mce_resume(struct sys_device *dev)
{
895
	mce_init(NULL);
896
	mce_cpu_features(&current_cpu_data);
Linus Torvalds's avatar
Linus Torvalds committed
897
898
899
	return 0;
}

900
901
902
903
904
905
906
907
/*
 * Per-CPU part of mce_restart(): stop this CPU's polling timer, reprogram
 * the machine check banks if machine checks are available, then arm the
 * timer again from the current check_interval. @data is unused.
 */
static void mce_cpu_restart(void *data)
{
	/* The timer must be stopped before mce_init_timer() sets it up again. */
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

Linus Torvalds's avatar
Linus Torvalds committed
908
/* Reinit MCEs after user configuration changes */
909
910
static void mce_restart(void)
{
911
	on_each_cpu(mce_cpu_restart, NULL, 1);
Linus Torvalds's avatar
Linus Torvalds committed
912
913
914
}

static struct sysdev_class mce_sysclass = {
915
916
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
Linus Torvalds's avatar
Linus Torvalds committed
917
	.resume = mce_resume,
918
	.name = "machinecheck",
Linus Torvalds's avatar
Linus Torvalds committed
919
920
};

921
DEFINE_PER_CPU(struct sys_device, device_mce);
922
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
Linus Torvalds's avatar
Linus Torvalds committed
923
924
925

/*
 * Why are there no generic functions for this?
 *
 * ACCESSOR(name, var, start) defines a 0644 sysdev attribute 'name':
 *  - show_<name>() prints 'var' as hex followed by a newline;
 *  - set_<name>() parses an unsigned long (base auto-detected), stores it
 *    in 'var', executes the statement 'start' and returns the number of
 *    characters consumed, or -EINVAL if nothing could be parsed.
 */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

943
944
945
946
947
948
/*
 * Array of per-bank sysdev attributes, indexed by bank number.
 * show_bank()/set_bank() recover the bank index from an attribute pointer
 * by subtracting this base. Where it is allocated is not visible here.
 */
static struct sysdev_attribute *bank_attrs;

/*
 * sysfs read handler for a per-bank attribute: prints bank[] for the bank
 * that @attr belongs to, as hex followed by a newline. The bank index is
 * the position of @attr within bank_attrs[]. @s is not used.
 */
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	long idx = attr - bank_attrs;

	return sprintf(buf, "%llx\n", bank[idx]);
}

/*
 * sysfs write handler for a per-bank attribute.
 *
 * Parses @buf with simple_strtoull() (base 0), stores the value in bank[]
 * at the index of @attr within bank_attrs[], and calls mce_restart() so
 * every CPU reinitializes. Returns the number of characters consumed, or
 * -EINVAL when nothing could be parsed. @s and @siz are not used.
 */
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *endp;
	u64 ctl = simple_strtoull(buf, &endp, 0);

	if (endp == buf)
		return -EINVAL;

	bank[attr - bank_attrs] = ctl;
	mce_restart();

	return endp - buf;
}
963

964
965
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				char *buf)
966
967
968
969
970
971
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

972
973
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf,size_t siz)
974
975
976
977
978
979
980
981
982
983
984
985
{
	char *p;
	int len;
	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (*p) *p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
986
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
Linus Torvalds's avatar
Linus Torvalds committed
987
ACCESSOR(check_interval,check_interval,mce_restart())
988
static struct sysdev_attribute *mce_attributes[] = {
989
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
990
991
	NULL
};
Linus Torvalds's avatar
Linus Torvalds committed
992

993
/*
 * CPUs whose sysdev and attribute files were created successfully.
 * Set at the end of mce_create_device(); mce_remove_device() does nothing
 * for a CPU that is not in this mask and clears the bit when it is done.
 */
static cpumask_var_t mce_device_initialized;
994

995
996
/* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
Linus Torvalds's avatar
Linus Torvalds committed
997
998
{
	int err;
999
	int i;
1000

1001
	if (!mce_available(&boot_cpu_data))
1002
1003
		return -EIO;

1004
	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005
1006
1007
1008
	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));
1009
1010
1011
1012
1013
1014
1015
1016
1017
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce,cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
1018
1019
1020
1021
1022
1023
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					&bank_attrs[i]);
		if (err)
			goto error2;
	}
1024
	cpumask_set_cpu(cpu, mce_device_initialized);
1025

1026
	return 0;
1027
1028
1029
1030
1031
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
					&bank_attrs[i]);
	}
1032
error:
1033
	while (--i >= 0) {
1034
1035
		sysdev_remove_file(&per_cpu(device_mce,cpu),
				   mce_attributes[i]);
1036
	}
1037
1038
	sysdev_unregister(&per_cpu(device_mce,cpu));

1039
1040
1041
	return err;
}

1042
static __cpuinit void mce_remove_device(unsigned int cpu)
1043
{
1044
1045
	int i;

1046
	if (!cpumask_test_cpu(cpu, mce_device_initialized))
1047
1048
		return;

1049
	for (i = 0; mce_attributes[i]; i++)
1050
		sysdev_remove_file(&per_cpu(device_mce,cpu),
1051
			mce_attributes[i]);
1052
1053
1054
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
			&bank_attrs[i]);
1055
	sysdev_unregister(&per_cpu(device_mce,cpu));
1056
	cpumask_clear_cpu(cpu, mce_device_initialized);
1057
1058
}

1059
/* Make sure there are no machine checks on offlined CPUs. */
1060
static void mce_disable_cpu(void *h)
1061
1062
{
	int i;
Andi Kleen's avatar
Andi Kleen committed
1063
	unsigned long action = *(unsigned long *)h;
1064
1065
1066

	if (!mce_available(&current_cpu_data))
		return;
Andi Kleen's avatar
Andi Kleen committed
1067
1068
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
1069
1070
1071
1072
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

1073
static void mce_reenable_cpu(void *h)
1074
1075
{
	int i;
Andi Kleen's avatar
Andi Kleen committed
1076
	unsigned long action = *(unsigned long *)h;
1077
1078
1079

	if (!mce_available(&current_cpu_data))
		return;
Andi Kleen's avatar
Andi Kleen committed
1080
1081
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
1082
1083
1084
1085
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

1086
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1087
1088
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
1089
1090
{
	unsigned int cpu = (unsigned long)hcpu;
1091
	struct timer_list *t = &per_cpu(mce_timer, cpu);
1092
1093

	switch (action) {
1094
1095
1096
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
1097
1098
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
1099
1100
		break;
	case CPU_DEAD:
1101
	case CPU_DEAD_FROZEN:
1102