tsc.c 35.5 KB
Newer Older
1
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

Alok Kataria's avatar
Alok Kataria committed
3
#include <linux/kernel.h>
Alok Kataria's avatar
Alok Kataria committed
4
5
#include <linux/sched.h>
#include <linux/init.h>
6
#include <linux/export.h>
Alok Kataria's avatar
Alok Kataria committed
7
#include <linux/timer.h>
Alok Kataria's avatar
Alok Kataria committed
8
#include <linux/acpi_pmtmr.h>
9
#include <linux/cpufreq.h>
10
11
12
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
13
#include <linux/timex.h>
14
#include <linux/static_key.h>
Alok Kataria's avatar
Alok Kataria committed
15
16

#include <asm/hpet.h>
17
18
19
20
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
21
#include <asm/hypervisor.h>
22
#include <asm/nmi.h>
23
#include <asm/x86_init.h>
24
#include <asm/geode.h>
25
#include <asm/apic.h>
26
#include <asm/intel-family.h>
Alok Kataria's avatar
Alok Kataria committed
27

28
unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
Alok Kataria's avatar
Alok Kataria committed
29
EXPORT_SYMBOL(cpu_khz);
30
31

unsigned int __read_mostly tsc_khz;
Alok Kataria's avatar
Alok Kataria committed
32
33
34
35
36
EXPORT_SYMBOL(tsc_khz);

/*
 * TSC can be unstable due to cpufreq or due to unsynced TSCs
 */
37
static int __read_mostly tsc_unstable;
Alok Kataria's avatar
Alok Kataria committed
38
39
40

/* native_sched_clock() is called before tsc_init(), so
   we must start with the TSC soft disabled to prevent
   erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */
42
static int __read_mostly tsc_disabled = -1;
Alok Kataria's avatar
Alok Kataria committed
43

44
static DEFINE_STATIC_KEY_FALSE(__use_tsc);
45

46
int tsc_clocksource_reliable;
47

48
49
50
51
52
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;

53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/*
 * Use a ring-buffer like data structure, where a writer advances the head by
 * writing a new data entry and a reader advances the tail when it observes a
 * new entry.
 *
 * Writers are made to wait on readers until there's space to write a new
 * entry.
 *
 * This means that we can always use an {offset, mul} pair to compute a ns
 * value that is 'roughly' in the right direction, even if we're writing a new
 * {offset, mul} pair during the clock read.
 *
 * The down-side is that we can no longer guarantee strict monotonicity anymore
 * (assuming the TSC was that to begin with), because while we compute the
 * intersection point of the two clock slopes and make sure the time is
 * continuous at the point of switching; we can no longer guarantee a reader is
 * strictly before or after the switch point.
 *
 * It does mean a reader no longer needs to disable IRQs in order to avoid
 * CPU-Freq updates messing with his times, and similarly an NMI reader will
 * no longer run the risk of hitting half-written state.
 */

/*
 * Per-CPU cycles-to-nanoseconds conversion state: two data slots plus a
 * head pointer (the slot published by cyc2ns_write_end()) and a tail
 * pointer (the slot recorded by the last outermost cyc2ns_read_end()).
 */
struct cyc2ns {
	struct cyc2ns_data data[2];	/*  0 + 2*24 = 48 */
	struct cyc2ns_data *head;	/* 48 + 8    = 56 */
	struct cyc2ns_data *tail;	/* 56 + 8    = 64 */
}; /* exactly fits one cacheline */

static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

/*
 * Begin a read of this CPU's cycles-to-ns conversion data.
 *
 * Disables preemption, bumps the nesting count of the currently published
 * entry and returns it. Every call must be paired with cyc2ns_read_end()
 * on the returned pointer, which is where preemption is re-enabled.
 */
struct cyc2ns_data *cyc2ns_read_begin(void)
{
	struct cyc2ns_data *head;

	preempt_disable();

	head = this_cpu_read(cyc2ns.head);
	/*
	 * Ensure we observe the entry when we observe the pointer to it.
	 * matches the wmb from cyc2ns_write_end().
	 */
	smp_read_barrier_depends();
	/* Mark the entry as in use by (possibly nested) readers. */
	head->__count++;
	barrier();

	return head;
}

/*
 * Finish a read started with cyc2ns_read_begin().
 *
 * @head: the entry returned by the matching cyc2ns_read_begin().
 *
 * Drops the nesting count on @head, publishes @head as the new tail when
 * the count reaches zero, and re-enables preemption.
 */
void cyc2ns_read_end(struct cyc2ns_data *head)
{
	barrier();
	/*
	 * If we're the outer most nested read; update the tail pointer
	 * when we're done. This notifies possible pending writers
	 * that we've observed the head pointer and that the other
	 * entry is now free.
	 */
	if (!--head->__count) {
		/*
		 * x86-TSO does not reorder writes with older reads;
		 * therefore once this write becomes visible to another
		 * cpu, we must be finished reading the cyc2ns_data.
		 *
		 * matches with cyc2ns_write_begin().
		 */
		this_cpu_write(cyc2ns.tail, head);
	}
	preempt_enable();
}

/*
 * Start an update of the conversion data for @cpu and return the slot
 * that may be written.
 *
 * No write-side lock is taken here; callers are expected to be serialized
 * externally (currently 'provided' by the assumption that cpufreq calls
 * its notifiers sequentially).
 */
static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
{
	struct cyc2ns *state = &per_cpu(cyc2ns, cpu);
	struct cyc2ns_data *slot;

	/* Use whichever of the two slots is not the published head. */
	if (state->head == &state->data[0])
		slot = &state->data[1];
	else
		slot = &state->data[0];

	/* XXX send an IPI to @cpu in order to guarantee a read? */

	/*
	 * Spin while the tail still points at the slot we want to reuse.
	 * Once the tail write from cyc2ns_read_end() is observed to have
	 * moved off it, the cpu is done with that entry and it is safe
	 * to start writing to it.
	 */
	while (state->tail == slot)
		cpu_relax();

	return slot;
}

/*
 * Publish @data, previously obtained from cyc2ns_write_begin(), as the
 * current conversion entry for @cpu.
 */
static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
{
	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);

	/*
	 * Ensure the @data writes are visible before we publish the
	 * entry. Matches the data-dependency in cyc2ns_read_begin().
	 */
	smp_wmb();

	ACCESS_ONCE(c2n->head) = data;
}

/*
 * Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_khz * 10^3))
 *              ns = cycles * (10^6 / cpu_khz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^6 * SC / cpu_khz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift. The larger SC is, the more accurate the conversion, but
 *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
 *  (64-bit result) can be used.
 *
 *  We can use khz divisor instead of mhz to keep a better precision.
 *  (mathieu.desnoyers@polymtl.ca)
 *
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */

188
189
/*
 * Reset one conversion slot: zero the multiplier, shift and offset, and
 * clear the reader nesting count.
 */
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
	data->cyc2ns_mul = 0;
	data->cyc2ns_shift = 0;
	data->cyc2ns_offset = 0;
	data->__count = 0;
}

/*
 * Initialize the conversion state of @cpu: clear both data slots and point
 * head and tail at the first slot.
 */
static void cyc2ns_init(int cpu)
{
	struct cyc2ns *state = &per_cpu(cyc2ns, cpu);
	struct cyc2ns_data *first = &state->data[0];
	struct cyc2ns_data *second = &state->data[1];

	cyc2ns_data_init(first);
	cyc2ns_data_init(second);

	/* Readers and writers both start out on slot 0. */
	state->head = first;
	state->tail = first;
}

207
208
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
209
210
211
212
213
214
215
216
217
218
	struct cyc2ns_data *data, *tail;
	unsigned long long ns;

	/*
	 * See cyc2ns_read_*() for details; replicated in order to avoid
	 * an extra few instructions that came with the abstraction.
	 * Notable, it allows us to only do the __count and tail update
	 * dance when its actually needed.
	 */

219
	preempt_disable_notrace();
220
221
222
223
224
	data = this_cpu_read(cyc2ns.head);
	tail = this_cpu_read(cyc2ns.tail);

	if (likely(data == tail)) {
		ns = data->cyc2ns_offset;
225
		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
226
227
228
229
230
231
	} else {
		data->__count++;

		barrier();

		ns = data->cyc2ns_offset;
232
		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
233
234
235
236
237
238

		barrier();

		if (!--data->__count)
			this_cpu_write(cyc2ns.tail, data);
	}
239
	preempt_enable_notrace();
240

241
242
243
	return ns;
}

244
static void set_cyc2ns_scale(unsigned long khz, int cpu)
245
{
246
247
248
	unsigned long long tsc_now, ns_now;
	struct cyc2ns_data *data;
	unsigned long flags;
249
250
251
252

	local_irq_save(flags);
	sched_clock_idle_sleep_event();

253
	if (!khz)
254
255
256
		goto done;

	data = cyc2ns_write_begin(cpu);
257

258
	tsc_now = rdtsc();
259
260
	ns_now = cycles_2_ns(tsc_now);

261
262
263
264
265
	/*
	 * Compute a new multiplier as per the above comment and ensure our
	 * time function is continuous; see the comment near struct
	 * cyc2ns_data.
	 */
266
	clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
267
268
			       NSEC_PER_MSEC, 0);

269
270
271
272
273
274
275
276
277
278
279
	/*
	 * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
	 * not expected to be greater than 31 due to the original published
	 * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
	 * value) - refer perf_event_mmap_page documentation in perf_event.h.
	 */
	if (data->cyc2ns_shift == 32) {
		data->cyc2ns_shift = 31;
		data->cyc2ns_mul >>= 1;
	}

280
	data->cyc2ns_offset = ns_now -
281
		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
282
283

	cyc2ns_write_end(cpu, data);
284

285
done:
286
287
288
	sched_clock_idle_wakeup_event(0);
	local_irq_restore(flags);
}
Alok Kataria's avatar
Alok Kataria committed
289
290
291
292
293
/*
 * Scheduler clock - returns current time in nanosec units.
 */
u64 native_sched_clock(void)
{
294
295
296
297
298
299
	if (static_branch_likely(&__use_tsc)) {
		u64 tsc_now = rdtsc();

		/* return the value in ns */
		return cycles_2_ns(tsc_now);
	}
Alok Kataria's avatar
Alok Kataria committed
300
301
302
303
304
305
306

	/*
	 * Fall back to jiffies if there's no TSC available:
	 * ( But note that we still use it if the TSC is marked
	 *   unstable. We do this because unlike Time Of Day,
	 *   the scheduler clock tolerates small errors and it's
	 *   very important for it to be as fast as the platform
307
	 *   can achieve it. )
Alok Kataria's avatar
Alok Kataria committed
308
309
	 */

310
311
	/* No locking but a rare wrong value is not a big deal: */
	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
Alok Kataria's avatar
Alok Kataria committed
312
313
}

314
315
316
317
318
319
320
321
/*
 * Translate an already-sampled TSC value @tsc into a sched_clock value
 * (nanoseconds) using the current cycles-to-ns conversion.
 */
u64 native_sched_clock_from_tsc(u64 tsc)
{
	u64 ns = cycles_2_ns(tsc);

	return ns;
}

Alok Kataria's avatar
Alok Kataria committed
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/*
 * We need to define a real function for sched_clock, to override the
 * weak default version.
 *
 * With CONFIG_PARAVIRT, sched_clock() forwards to paravirt_sched_clock();
 * otherwise it is a symbol alias for native_sched_clock().
 */
#ifdef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
{
	return paravirt_sched_clock();
}
#else
unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
#endif

int check_tsc_unstable(void)
{
	return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);

#ifdef CONFIG_X86_TSC
/*
 * "notsc" handler for kernels built with CONFIG_X86_TSC: the TSC cannot be
 * removed entirely, so warn and only set the tsc_disabled flag.
 */
int __init notsc_setup(char *str)
{
	pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
	tsc_disabled = 1;
	return 1;
}
#else
/*
 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
 * in cpu/common.c
 */
int __init notsc_setup(char *str)
{
	setup_clear_cpu_cap(X86_FEATURE_TSC);
	return 1;
}
#endif

__setup("notsc", notsc_setup);
Alok Kataria's avatar
Alok Kataria committed
360

361
362
static int no_sched_irq_time;

363
364
365
366
/*
 * Parse the "tsc=" boot parameter.
 *
 *   "reliable"   (exact match)  sets tsc_clocksource_reliable
 *   "noirqtime"  (prefix match) sets no_sched_irq_time
 *
 * Always returns 1.
 */
static int __init tsc_setup(char *str)
{
	if (!strcmp(str, "reliable"))
		tsc_clocksource_reliable = 1;
	if (!strncmp(str, "noirqtime", 9))
		no_sched_irq_time = 1;
	return 1;
}

__setup("tsc=", tsc_setup);

Alok Kataria's avatar
Alok Kataria committed
374
375
376
377
378
379
#define MAX_RETRIES     5
#define SMI_TRESHOLD    50000

/*
 * Read TSC and the reference counters. Take care of SMI disturbance
 */
380
static u64 tsc_read_refs(u64 *p, int hpet)
Alok Kataria's avatar
Alok Kataria committed
381
382
383
384
385
386
387
{
	u64 t1, t2;
	int i;

	for (i = 0; i < MAX_RETRIES; i++) {
		t1 = get_cycles();
		if (hpet)
388
			*p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
Alok Kataria's avatar
Alok Kataria committed
389
		else
390
			*p = acpi_pm_read_early();
Alok Kataria's avatar
Alok Kataria committed
391
392
393
394
395
396
397
		t2 = get_cycles();
		if ((t2 - t1) < SMI_TRESHOLD)
			return t2;
	}
	return ULLONG_MAX;
}

398
399
/*
 * Calculate the TSC frequency from HPET reference
Alok Kataria's avatar
Alok Kataria committed
400
 */
401
static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
Alok Kataria's avatar
Alok Kataria committed
402
{
403
	u64 tmp;
Alok Kataria's avatar
Alok Kataria committed
404

405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
	if (hpet2 < hpet1)
		hpet2 += 0x100000000ULL;
	hpet2 -= hpet1;
	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
	do_div(tmp, 1000000);
	do_div(deltatsc, tmp);

	return (unsigned long) deltatsc;
}

/*
 * Calculate the TSC frequency from PMTimer reference
 *
 * @deltatsc: TSC delta, pre-scaled by the caller.
 * @pm1:      PM timer value at the start of the window.
 * @pm2:      PM timer value at the end of the window.
 *
 * Returns ULONG_MAX when both timer reads are zero, otherwise @deltatsc
 * divided by the elapsed reference time in nanoseconds.
 */
static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
{
	u64 tmp;

	if (!pm1 && !pm2)
		return ULONG_MAX;

	/* Undo a single wrap of the PM timer. */
	if (pm2 < pm1)
		pm2 += (u64)ACPI_PM_OVRRUN;
	pm2 -= pm1;
	/* ticks * 10^9 / ticks-per-second = elapsed nanoseconds */
	tmp = pm2 * 1000000000LL;
	do_div(tmp, PMTMR_TICKS_PER_SEC);
	/*
	 * tmp can exceed 32 bits for long windows; do_div() would truncate
	 * the divisor, so divide 64-by-64 instead.
	 */
	deltatsc = div64_u64(deltatsc, tmp);

	return (unsigned long) deltatsc;
}

435
#define CAL_MS		10
436
#define CAL_LATCH	(PIT_TICK_RATE / (1000 / CAL_MS))
437
438
439
#define CAL_PIT_LOOPS	1000

#define CAL2_MS		50
440
#define CAL2_LATCH	(PIT_TICK_RATE / (1000 / CAL2_MS))
441
442
#define CAL2_PIT_LOOPS	5000

443

444
445
446
447
448
449
450
/*
 * Try to calibrate the TSC against the Programmable
 * Interrupt Timer and return the frequency of the TSC
 * in kHz.
 *
 * Return ULONG_MAX on failure to calibrate.
 */
451
static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
452
453
454
455
456
457
458
459
460
461
462
463
464
465
{
	u64 tsc, t1, t2, delta;
	unsigned long tscmin, tscmax;
	int pitcnt;

	/* Set the Gate high, disable speaker */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

	/*
	 * Setup CTC channel 2* for mode 0, (interrupt on terminal
	 * count mode), binary count. Set the latch register to 50ms
	 * (LSB then MSB) to begin countdown.
	 */
	outb(0xb0, 0x43);
466
467
	outb(latch & 0xff, 0x42);
	outb(latch >> 8, 0x42);
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487

	tsc = t1 = t2 = get_cycles();

	pitcnt = 0;
	tscmax = 0;
	tscmin = ULONG_MAX;
	while ((inb(0x61) & 0x20) == 0) {
		t2 = get_cycles();
		delta = t2 - tsc;
		tsc = t2;
		if ((unsigned long) delta < tscmin)
			tscmin = (unsigned int) delta;
		if ((unsigned long) delta > tscmax)
			tscmax = (unsigned int) delta;
		pitcnt++;
	}

	/*
	 * Sanity checks:
	 *
488
	 * If we were not able to read the PIT more than loopmin
489
490
491
492
493
	 * times, then we have been hit by a massive SMI
	 *
	 * If the maximum is 10 times larger than the minimum,
	 * then we got hit by an SMI as well.
	 */
494
	if (pitcnt < loopmin || tscmax > 10 * tscmin)
495
496
497
498
		return ULONG_MAX;

	/* Calculate the PIT value */
	delta = t2 - t1;
499
	do_div(delta, ms);
500
501
502
	return delta;
}

Linus Torvalds's avatar
Linus Torvalds committed
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
/*
 * This reads the current MSB of the PIT counter, and
 * checks if we are running on sufficiently fast and
 * non-virtualized hardware.
 *
 * Our expectations are:
 *
 *  - the PIT is running at roughly 1.19MHz
 *
 *  - each IO is going to take about 1us on real hardware,
 *    but we allow it to be much faster (by a factor of 10) or
 *    _slightly_ slower (ie we allow up to a 2us read+counter
 *    update - anything else implies an unacceptably slow CPU
 *    or PIT for the fast calibration to work).
 *
 *  - with 256 PIT ticks to read the value, we have 214us to
 *    see the same MSB (and overhead like doing a single TSC
 *    read per MSB value etc).
 *
 *  - We're doing 2 reads per loop (LSB, MSB), and we expect
 *    them each to take about a microsecond on real hardware.
 *    So we expect a count value of around 100. But we'll be
 *    generous, and accept anything over 50.
 *
 *  - if the PIT is stuck, and we see *many* more reads, we
 *    return early (and the next caller of pit_expect_msb()
 *    then consider it a failure when they don't see the
 *    next expected value).
 *
 * These expectations mean that we know that we have seen the
 * transition from one expected value to another with a fairly
 * high accuracy, and we didn't miss any events. We can thus
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequency.
 */
538
539
540
541
542
543
544
/*
 * Read the 16-bit PIT channel 2 counter (two reads of port 0x42) and
 * report whether its most significant byte equals @val.
 */
static inline int pit_verify_msb(unsigned char val)
{
	unsigned char msb;

	/* The first read is the LSB, which we ignore. */
	(void)inb(0x42);
	msb = inb(0x42);

	return msb == val;
}

545
/*
 * Poll the PIT for as long as its MSB equals @val (at most 50000 times).
 *
 * @val:    the MSB value expected to be seen.
 * @tscp:   receives the TSC value taken after the last matching read.
 * @deltap: receives the distance between the final TSC read taken here
 *          and the TSC value from one matching read earlier; the caller
 *          uses it as an error term.
 *
 * Returns non-zero when the MSB was seen more than 5 times.
 */
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
	int count;
	u64 tsc = 0, prev_tsc = 0;

	for (count = 0; count < 50000; count++) {
		if (!pit_verify_msb(val))
			break;
		prev_tsc = tsc;
		tsc = get_cycles();
	}
	*deltap = get_cycles() - prev_tsc;
	*tscp = tsc;

	/*
	 * We require _some_ success, but the quality control
	 * will be based on the error terms on the TSC values.
	 */
	return count > 5;
}

/*
 * How many MSB values do we want to see? We aim for
 * a maximum error rate of 500ppm (in practice the
 * real error is much smaller), but refuse to spend
 * more than 50ms on it.
 */
572
#define MAX_QUICK_PIT_MS 50
573
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
Alok Kataria's avatar
Alok Kataria committed
574

Linus Torvalds's avatar
Linus Torvalds committed
575
576
static unsigned long quick_pit_calibrate(void)
{
577
578
579
580
	int i;
	u64 tsc, delta;
	unsigned long d1, d2;

Linus Torvalds's avatar
Linus Torvalds committed
581
	/* Set the Gate high, disable speaker */
Alok Kataria's avatar
Alok Kataria committed
582
583
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

Linus Torvalds's avatar
Linus Torvalds committed
584
585
586
587
588
589
590
591
592
	/*
	 * Counter 2, mode 0 (one-shot), binary count
	 *
	 * NOTE! Mode 2 decrements by two (and then the
	 * output is flipped each time, giving the same
	 * final output frequency as a decrement-by-one),
	 * so mode 0 is much better when looking at the
	 * individual counts.
	 */
Alok Kataria's avatar
Alok Kataria committed
593
594
	outb(0xb0, 0x43);

Linus Torvalds's avatar
Linus Torvalds committed
595
596
597
598
	/* Start at 0xffff */
	outb(0xff, 0x42);
	outb(0xff, 0x42);

599
600
601
602
603
604
	/*
	 * The PIT starts counting at the next edge, so we
	 * need to delay for a microsecond. The easiest way
	 * to do that is to just read back the 16-bit counter
	 * once from the PIT.
	 */
605
	pit_verify_msb(0);
606

607
608
609
610
611
	if (pit_expect_msb(0xff, &tsc, &d1)) {
		for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
			if (!pit_expect_msb(0xff-i, &delta, &d2))
				break;

612
613
614
615
616
617
618
619
620
621
			delta -= tsc;

			/*
			 * Extrapolate the error and fail fast if the error will
			 * never be below 500 ppm.
			 */
			if (i == 1 &&
			    d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
				return 0;

622
623
624
			/*
			 * Iterate until the error is less than 500 ppm
			 */
625
626
627
628
629
630
631
632
633
634
635
636
637
			if (d1+d2 >= delta >> 11)
				continue;

			/*
			 * Check the PIT one more time to verify that
			 * all TSC reads were stable wrt the PIT.
			 *
			 * This also guarantees serialization of the
			 * last cycle read ('d2') in pit_expect_msb.
			 */
			if (!pit_verify_msb(0xfe - i))
				break;
			goto success;
Linus Torvalds's avatar
Linus Torvalds committed
638
639
		}
	}
640
	pr_info("Fast TSC calibration failed\n");
Linus Torvalds's avatar
Linus Torvalds committed
641
	return 0;
642
643
644
645
646
647
648
649
650

success:
	/*
	 * Ok, if we get here, then we've seen the
	 * MSB of the PIT decrement 'i' times, and the
	 * error has shrunk to less than 500 ppm.
	 *
	 * As a result, we can depend on there not being
	 * any odd delays anywhere, and the TSC reads are
651
	 * reliable (within the error).
652
653
654
655
656
657
658
	 *
	 * kHz = ticks / time-in-seconds / 1000;
	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
	 */
	delta *= PIT_TICK_RATE;
	do_div(delta, i*256*1000);
659
	pr_info("Fast TSC calibration using PIT\n");
660
	return delta;
Linus Torvalds's avatar
Linus Torvalds committed
661
}
662

Alok Kataria's avatar
Alok Kataria committed
663
/**
664
665
 * native_calibrate_tsc
 * Determine TSC frequency via CPUID, else return 0.
Alok Kataria's avatar
Alok Kataria committed
666
 */
667
unsigned long native_calibrate_tsc(void)
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
{
	unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
	unsigned int crystal_khz;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (boot_cpu_data.cpuid_level < 0x15)
		return 0;

	eax_denominator = ebx_numerator = ecx_hz = edx = 0;

	/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
	cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);

	if (ebx_numerator == 0 || eax_denominator == 0)
		return 0;

	crystal_khz = ecx_hz / 1000;

	if (crystal_khz == 0) {
		switch (boot_cpu_data.x86_model) {
690
691
		case INTEL_FAM6_SKYLAKE_MOBILE:
		case INTEL_FAM6_SKYLAKE_DESKTOP:
692
693
		case INTEL_FAM6_KABYLAKE_MOBILE:
		case INTEL_FAM6_KABYLAKE_DESKTOP:
694
695
			crystal_khz = 24000;	/* 24.0 MHz */
			break;
696
		case INTEL_FAM6_SKYLAKE_X:
697
		case INTEL_FAM6_ATOM_DENVERTON:
698
699
			crystal_khz = 25000;	/* 25.0 MHz */
			break;
700
		case INTEL_FAM6_ATOM_GOLDMONT:
701
702
			crystal_khz = 19200;	/* 19.2 MHz */
			break;
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
		}
	}

	return crystal_khz * ebx_numerator / eax_denominator;
}

/*
 * Derive the CPU frequency in kHz from the base frequency (MHz) reported
 * in EAX of CPUID leaf 0x16. Returns 0 on non-Intel CPUs or when the
 * leaf is not available.
 */
static unsigned long cpu_khz_from_cpuid(void)
{
	unsigned int base_mhz = 0, max_mhz = 0, bus_mhz = 0, reserved = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.cpuid_level < 0x16)
		return 0;

	/* Only the base frequency is used; the other outputs are ignored. */
	cpuid(0x16, &base_mhz, &max_mhz, &bus_mhz, &reserved);

	return base_mhz * 1000;
}

/**
 * native_calibrate_cpu - calibrate the cpu on boot
 */
unsigned long native_calibrate_cpu(void)
Alok Kataria's avatar
Alok Kataria committed
730
{
731
	u64 tsc1, tsc2, delta, ref1, ref2;
732
	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
733
	unsigned long flags, latch, ms, fast_calibrate;
734
	int hpet = is_hpet_enabled(), i, loopmin;
Alok Kataria's avatar
Alok Kataria committed
735

736
737
738
739
	fast_calibrate = cpu_khz_from_cpuid();
	if (fast_calibrate)
		return fast_calibrate;

740
	fast_calibrate = cpu_khz_from_msr();
741
	if (fast_calibrate)
742
743
		return fast_calibrate;

Linus Torvalds's avatar
Linus Torvalds committed
744
745
	local_irq_save(flags);
	fast_calibrate = quick_pit_calibrate();
Alok Kataria's avatar
Alok Kataria committed
746
	local_irq_restore(flags);
Linus Torvalds's avatar
Linus Torvalds committed
747
748
	if (fast_calibrate)
		return fast_calibrate;
Alok Kataria's avatar
Alok Kataria committed
749

750
751
752
753
754
755
756
757
758
759
760
761
	/*
	 * Run 5 calibration loops to get the lowest frequency value
	 * (the best estimate). We use two different calibration modes
	 * here:
	 *
	 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
	 * load a timeout of 50ms. We read the time right after we
	 * started the timer and wait until the PIT count down reaches
	 * zero. In each wait loop iteration we read the TSC and check
	 * the delta to the previous read. We keep track of the min
	 * and max values of that delta. The delta is mostly defined
	 * by the IO time of the PIT access, so we can detect when a
Lucas De Marchi's avatar
Lucas De Marchi committed
762
	 * SMI/SMM disturbance happened between the two reads. If the
763
764
765
766
767
768
769
770
771
772
773
	 * maximum time is significantly larger than the minimum time,
	 * then we discard the result and have another try.
	 *
	 * 2) Reference counter. If available we use the HPET or the
	 * PMTIMER as a reference to check the sanity of that value.
	 * We use separate TSC readouts and check inside of the
	 * reference read for a SMI/SMM disturbance. We dicard
	 * disturbed values here as well. We do that around the PIT
	 * calibration delay loop as we have to wait for a certain
	 * amount of time anyway.
	 */
774
775
776
777
778
779
780

	/* Preset PIT loop values */
	latch = CAL_LATCH;
	ms = CAL_MS;
	loopmin = CAL_PIT_LOOPS;

	for (i = 0; i < 3; i++) {
781
		unsigned long tsc_pit_khz;
782
783
784

		/*
		 * Read the start value and the reference count of
785
786
787
		 * hpet/pmtimer when available. Then do the PIT
		 * calibration, which will take at least 50ms, and
		 * read the end value.
788
		 */
789
		local_irq_save(flags);
790
		tsc1 = tsc_read_refs(&ref1, hpet);
791
		tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
792
		tsc2 = tsc_read_refs(&ref2, hpet);
793
794
		local_irq_restore(flags);

795
796
		/* Pick the lowest PIT TSC calibration so far */
		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
797
798

		/* hpet or pmtimer available ? */
799
		if (ref1 == ref2)
800
801
802
803
804
805
806
			continue;

		/* Check, whether the sampling was disturbed by an SMI */
		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
			continue;

		tsc2 = (tsc2 - tsc1) * 1000000LL;
807
		if (hpet)
808
			tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
809
		else
810
			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
811
812

		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
813
814
815
816
817
818
819
820
821
822
823
824

		/* Check the reference deviation */
		delta = ((u64) tsc_pit_min) * 100;
		do_div(delta, tsc_ref_min);

		/*
		 * If both calibration results are inside a 10% window
		 * then we can be sure, that the calibration
		 * succeeded. We break out of the loop right away. We
		 * use the reference value, as it is more precise.
		 */
		if (delta >= 90 && delta <= 110) {
825
826
			pr_info("PIT calibration matches %s. %d loops\n",
				hpet ? "HPET" : "PMTIMER", i + 1);
827
			return tsc_ref_min;
828
829
		}

830
831
832
833
834
835
836
837
838
839
840
		/*
		 * Check whether PIT failed more than once. This
		 * happens in virtualized environments. We need to
		 * give the virtual PC a slightly longer timeframe for
		 * the HPET/PMTIMER to make the result precise.
		 */
		if (i == 1 && tsc_pit_min == ULONG_MAX) {
			latch = CAL2_LATCH;
			ms = CAL2_MS;
			loopmin = CAL2_PIT_LOOPS;
		}
841
	}
Alok Kataria's avatar
Alok Kataria committed
842
843

	/*
844
	 * Now check the results.
Alok Kataria's avatar
Alok Kataria committed
845
	 */
846
847
	if (tsc_pit_min == ULONG_MAX) {
		/* PIT gave no useful value */
848
		pr_warn("Unable to calibrate against PIT\n");
849
850

		/* We don't have an alternative source, disable TSC */
851
		if (!hpet && !ref1 && !ref2) {
852
			pr_notice("No reference (HPET/PMTIMER) available\n");
853
854
855
856
857
			return 0;
		}

		/* The alternative source failed as well, disable TSC */
		if (tsc_ref_min == ULONG_MAX) {
858
			pr_warn("HPET/PMTIMER calibration failed\n");
859
860
861
862
			return 0;
		}

		/* Use the alternative source */
863
864
		pr_info("using %s reference calibration\n",
			hpet ? "HPET" : "PMTIMER");
865
866
867

		return tsc_ref_min;
	}
Alok Kataria's avatar
Alok Kataria committed
868

869
	/* We don't have an alternative source, use the PIT calibration value */
870
	if (!hpet && !ref1 && !ref2) {
871
		pr_info("Using PIT calibration value\n");
872
		return tsc_pit_min;
Alok Kataria's avatar
Alok Kataria committed
873
874
	}

875
876
	/* The alternative source failed, use the PIT calibration value */
	if (tsc_ref_min == ULONG_MAX) {
877
		pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
878
		return tsc_pit_min;
Alok Kataria's avatar
Alok Kataria committed
879
880
	}

881
882
883
	/*
	 * The calibration values differ too much. In doubt, we use
	 * the PIT value as we know that there are PMTIMERs around
884
	 * running at double speed. At least we let the user know:
885
	 */
886
887
888
	pr_warn("PIT calibration deviates from %s: %lu %lu\n",
		hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
	pr_info("Using PIT calibration value\n");
889
	return tsc_pit_min;
Alok Kataria's avatar
Alok Kataria committed
890
891
892
893
894
895
896
}

int recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
	unsigned long cpu_khz_old = cpu_khz;

897
	if (!boot_cpu_has(X86_FEATURE_TSC))
Alok Kataria's avatar
Alok Kataria committed
898
		return -ENODEV;
899

900
	cpu_khz = x86_platform.calibrate_cpu();
901
	tsc_khz = x86_platform.calibrate_tsc();
902
903
	if (tsc_khz == 0)
		tsc_khz = cpu_khz;
904
905
	else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
		cpu_khz = tsc_khz;
906
907
908
909
	cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
						    cpu_khz_old, cpu_khz);

	return 0;
Alok Kataria's avatar
Alok Kataria committed
910
911
912
913
914
915
916
#else
	return -ENODEV;
#endif
}

EXPORT_SYMBOL(recalibrate_cpu_khz);

917

918
919
static unsigned long long cyc2ns_suspend;

920
void tsc_save_sched_clock_state(void)
921
{
922
	if (!sched_clock_stable())
923
924
925
926
927
928
929
930
931
932
933
934
935
		return;

	cyc2ns_suspend = sched_clock();
}

/*
 * Even on processors with invariant TSC, TSC gets reset in some the
 * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to
 * arbitrary value (still sync'd across cpu's) during resume from such sleep
 * states. To cope up with this, recompute the cyc2ns_offset for each cpu so
 * that sched_clock() continues from the point where it was left off during
 * suspend.
 */
936
void tsc_restore_sched_clock_state(void)
937
938
939
940
941
{
	unsigned long long offset;
	unsigned long flags;
	int cpu;

942
	if (!sched_clock_stable())
943
944
945
946
		return;

	local_irq_save(flags);

947
	/*
948
	 * We're coming out of suspend, there's no concurrency yet; don't
949
950
951
952
953
954
955
	 * bother being nice about the RCU stuff, just write to both
	 * data fields.
	 */

	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);

956
957
	offset = cyc2ns_suspend - sched_clock();

958
959
960
961
	for_each_possible_cpu(cpu) {
		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
	}
962
963
964
965

	local_irq_restore(flags);
}

966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
#ifdef CONFIG_CPU_FREQ

/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
 * changes.
 *
 * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
 * not that important because current Opteron setups do not support
 * scaling on SMP anyroads.
 *
 * Should fix up last_tsc too. Currently gettimeofday in the
 * first tick after the change will be slightly wrong.
 */

static unsigned int  ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long tsc_khz_ref;

static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct cpufreq_freqs *freq = data;
987
	unsigned long *lpj;
988

989
	lpj = &boot_cpu_data.loops_per_jiffy;
990
#ifdef CONFIG_SMP
991
	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
992
993
994
995
996
997
998
999
1000
		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
#endif

	if (!ref_freq) {
		ref_freq = freq->old;
		loops_per_jiffy_ref = *lpj;
		tsc_khz_ref = tsc_khz;
	}
	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
1001
			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
1002
		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
1003
1004
1005
1006
1007

		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
			mark_tsc_unstable("cpufreq changes");

Peter Zijlstra's avatar
Peter Zijlstra committed
1008
1009
		set_cyc2ns_scale(tsc_khz, freq->cpu);
	}
1010
1011
1012
1013
1014
1015
1016
1017

	return 0;
}

/* Notifier block that routes cpufreq notifications to time_cpufreq_notifier(). */
static struct notifier_block time_cpufreq_notifier_block = {
	.notifier_call = time_cpufreq_notifier,
};

1018
/*
 * Register the cpufreq transition notifier. Skipped when the CPU has no
 * TSC at all or advertises a constant TSC. Always returns 0.
 */
static int __init cpufreq_register_tsc_scaling(void)
{
	if (!boot_cpu_has(X86_FEATURE_TSC) ||
	    boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return 0;

	cpufreq_register_notifier(&time_cpufreq_notifier_block,
				CPUFREQ_TRANSITION_NOTIFIER);
	return 0;
}

core_initcall(cpufreq_register_tsc_scaling);
1030
1031

#endif /* CONFIG_CPU_FREQ */
1032

1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
#define ART_CPUID_LEAF (0x15)
#define ART_MIN_DENOMINATOR (1)


/*
 * If ART is present, read the numerator:denominator pair used to convert
 * ART values to TSC values, plus the TSC_ADJUST offset, and force-enable
 * X86_FEATURE_ART when all preconditions hold.
 */
static void detect_art(void)
{
	unsigned int spare_a, spare_b;

	if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
		return;

	/* Only the first two outputs are kept; the other two are discarded. */
	cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
	      &art_to_tsc_numerator, &spare_a, &spare_b);

	/* Don't enable ART in a VM, non-stop TSC required */
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return;
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		return;
	if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
		return;

	/* A failing MSR read leaves ART disabled. */
	if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
		return;

	/* Make this sticky over multiple CPU init calls */
	setup_force_cpu_cap(X86_FEATURE_ART);
}


1064
1065
1066
1067
1068
/* clocksource code */

static struct clocksource clocksource_tsc;

/*
1069
 * We used to compare the TSC to the cycle_last value in the clocksource
1070
1071
1072
1073
1074
1075
1076
1077
1078
 * structure to avoid a nasty time-warp. This can be observed in a
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
 * is slighty behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
 * timer.
1079
1080
1081
1082
 *
 * This sanity check is now done in the core timekeeping code.
 * checking the result of read_tsc() - cycle_last for being negative.
 * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
1083
 */
1084
static cycle_t read_tsc(struct clocksource *cs)
1085
{
1086
	return (cycle_t)rdtsc_ordered();
1087
1088
}

1089
1090
1091
/*
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
1092
1093
1094
1095
1096
1097
1098
static struct clocksource clocksource_tsc = {
	.name                   = "tsc",
	.rating                 = 300,
	.read                   = read_tsc,
	.mask                   = CLOCKSOURCE_MASK(64),
	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_MUST_VERIFY,
1099
	.archdata               = { .vclock_mode = VCLOCK_TSC },
1100
1101
1102
1103
1104
1105
};

/*
 * Flag the TSC as unstable (only the first call has any effect).
 *
 * @reason: text inserted into the log message
 *
 * Clears the stable sched clock state, disables sched clock irqtime
 * and demotes the TSC clocksource.
 */
void mark_tsc_unstable(char *reason)
{
	if (tsc_unstable)
		return;

	tsc_unstable = 1;
	clear_sched_clock_stable();
	disable_sched_clock_irqtime();
	pr_info("Marking TSC unstable due to %s\n", reason);

	/* Change only the rating, when not registered */
	if (!clocksource_tsc.mult) {
		clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
		clocksource_tsc.rating = 0;
		return;
	}

	clocksource_mark_unstable(&clocksource_tsc);
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

1121
1122
/*
 * Set tsc_clocksource_reliable when the platform is known to provide a
 * trustworthy TSC: a Geode LX whose RTSC keeps counting during suspend,
 * or any CPU advertising X86_FEATURE_TSC_RELIABLE.
 */
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
	if (is_geode_lx()) {
		/* RTSC counts during suspend */
#define RTSC_SUSP 0x100
		unsigned long res_low = 0, res_high = 0;

		/*
		 * Geode_LX - the OLPC CPU has a very reliable TSC.
		 *
		 * Only trust the RTSC_SUSP bit when the MSR read actually
		 * succeeded (non-zero return means failure, same convention
		 * detect_art() relies on for rdmsrl_safe()).
		 */
		if (!rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high) &&
		    (res_low & RTSC_SUSP))
			tsc_clocksource_reliable = 1;
	}
#endif
	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
		tsc_clocksource_reliable = 1;
}
1138
1139
1140
1141
1142

/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 */
1143
int unsynchronized_tsc(void)
1144
{
1145
	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
1146
1147
		return 1;

1148
#ifdef CONFIG_SMP
1149
1150
1151
1152
1153
1154
	if (apic_is_clustered_box())
		return 1;
#endif

	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return 0;
1155
1156
1157

	if (tsc_clocksource_reliable)
		return 0;
1158
1159
1160
1161
1162
1163
1164
	/*
	 * Intel systems are normally all synchronized.
	 * Exceptions must mark TSC as unstable:
	 */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
		/* assume multi socket systems are not synchronized: */
		if (num_possible_cpus() > 1)
1165
			return 1;
1166
1167
	}

1168
	return 0;
1169
1170
}

1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
/*
 * Convert ART to TSC given numerator/denominator found in detect_art()
 *
 * @art: ART value to convert
 *
 * Computes (art / den) * num + ((art % den) * num) / den + offset and
 * returns it together with art_related_clocksource.
 */
struct system_counterval_t convert_art_to_tsc(cycle_t art)
{
	u64 remainder, whole_part, frac_part;

	/* do_div() leaves the quotient in 'art' and returns the remainder. */
	remainder = do_div(art, art_to_tsc_denominator);

	whole_part = art * art_to_tsc_numerator;
	frac_part = remainder * art_to_tsc_numerator;
	do_div(frac_part, art_to_tsc_denominator);

	whole_part += frac_part + art_to_tsc_offset;

	return (struct system_counterval_t) {
		.cs	= art_related_clocksource,
		.cycles	= whole_part,
	};
}
EXPORT_SYMBOL(convert_art_to_tsc);
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201

static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
 * tsc_refine_calibration_work - Further refine tsc freq calibration
 * @work - ignored.
 *
 * This functions uses delayed work over a period of a
 * second to further refine the TSC freq value. Since this is
 * timer based, instead of loop based, we don't block the boot
 * process while this longer calibration is done.
 *
Lucas De Marchi's avatar
Lucas De Marchi committed
1202
 * If there are any calibration anomalies (too many SMIs, etc),
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
 * or the refined calibration is off by 1% of the fast early
 * calibration, we throw out the new calibration and use the
 * early calibration.
 */
static void tsc_refine_calibration_work(struct work_struct *work)
{
	static u64 tsc_start = -1, ref_start;
	static int hpet;
	u64 tsc_stop, ref_stop, delta;
	unsigned long freq;

	/* Don't bother refining TSC on unstable systems */
	if (check_tsc_unstable())
		goto out;

	/*
	 * Since the work is started early in boot, we may be
	 * delayed the first time we expire. So set the workqueue
	 * again once we know timers are working.
	 */
	if (tsc_start == -1) {
		/*
		 * Only set hpet once, to avoid mixing hardware
		 * if the hpet becomes enabled later.
		 */
		hpet = is_hpet_enabled();
		schedule_delayed_work(&tsc_irqwork, HZ);
		tsc_start = tsc_read_refs(&ref_start, hpet);
		return;
	}

	tsc_stop = tsc_read_refs(&ref_stop, hpet);

	/* hpet or pmtimer available ? */
1237
	if (ref_start == ref_stop)
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
		goto out;

	/* Check, whether the sampling was disturbed by an SMI */
	if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
		goto out;

	delta = tsc_stop - tsc_start;
	delta *= 1000000LL;
	if (hpet)
		freq = calc_hpet_ref(delta, ref_start, ref_stop);
	else
		freq = calc_pmtimer_ref(delta, ref_start, ref_stop);

	/* Make sure we're within 1% */
	if (abs(tsc_khz - freq) > tsc_khz/100)
		goto out;

	tsc_khz = freq;
1256
1257
1258
	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
		(unsigned long)tsc_khz / 1000,
		(unsigned long)tsc_khz % 1000);
1259

1260
1261
1262
	/* Inform the TSC deadline clockevent devices about the recalibration */
	lapic_update_tsc_freq();

1263
out:
1264
1265
	if (boot_cpu_has(X86_FEATURE_ART))
		art_related_clocksource = &clocksource_tsc;
1266
1267
1268
1269
1270
	clocksource_register_khz(&clocksource_tsc, tsc_khz);
}


/*
 * Adjust the TSC clocksource flags/rating to what is known about the
 * hardware, then either register it right away (reliable TSC) or kick
 * off the delayed refinement work, which registers it later.
 *
 * Always returns 0.
 */
static int __init init_tsc_clocksource(void)
{
	/* No TSC, TSC disabled, or no calibrated frequency: nothing to do. */
	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)
		return 0;

	if (tsc_clocksource_reliable)
		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;

	/* lower the rating if we already know it's unstable: */
	if (check_tsc_unstable()) {
		clocksource_tsc.rating = 0;
		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
	}

	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

	/* Not flagged reliable: let the delayed work refine and register. */
	if (!boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
		schedule_delayed_work(&tsc_irqwork, 0);
		return 0;
	}

	/*
	 * Trust the results of the earlier calibration on systems
	 * exporting a reliable TSC.
	 */
	if (boot_cpu_has(X86_FEATURE_ART))
		art_related_clocksource = &clocksource_tsc;
	clocksource_register_khz(&clocksource_tsc, tsc_khz);
	return 0;
}
/*
 * We use device_initcall here, to ensure we run after the hpet
 * is fully initialized, which may occur at fs_initcall time.
 */
device_initcall(init_tsc_clocksource);
1305
1306
1307
1308
1309
1310

/*
 * Boot-time TSC setup: calibrate cpu_khz/tsc_khz, initialise the cyc2ns
 * scale for every possible cpu, enable TSC use by the sched clock and
 * the delay loop, then run the stability/reliability/ART checks.
 */
void __init tsc_init(void)
{
	u64 loops;
	int i;

	if (!boot_cpu_has(X86_FEATURE_TSC)) {
		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
		return;
	}

	cpu_khz = x86_platform.calibrate_cpu();
	tsc_khz = x86_platform.calibrate_tsc();

	/*
	 * Trust non-zero tsc_khz as authoritative,
	 * and use it to sanity check cpu_khz,
	 * which will be off if system timer is off.
	 */
	if (!tsc_khz)
		tsc_khz = cpu_khz;
	else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
		cpu_khz = tsc_khz;

	/* Both calibrations came back empty. */
	if (tsc_khz == 0) {
		mark_tsc_unstable("could not calculate TSC khz");
		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
		return;
	}

	pr_info("Detected %lu.%03lu MHz processor\n",
		(unsigned long)cpu_khz / 1000,
		(unsigned long)cpu_khz % 1000);

	/*
	 * Secondary CPUs do not run through tsc_init(), so set up
	 * all the scale factors for all CPUs, assuming the same
	 * speed as the bootup CPU. (cpufreq notifiers will fix this
	 * up if their speed diverges)
	 */
	for_each_possible_cpu(i) {
		cyc2ns_init(i);
		set_cyc2ns_scale(tsc_khz, i);
	}

	if (tsc_disabled > 0)
		return;

	/* now allow native_sched_clock() to use rdtsc */
	tsc_disabled = 0;
	static_branch_enable(&__use_tsc);

	if (!no_sched_irq_time)
		enable_sched_clock_irqtime();

	/* lpj_fine = tsc_khz * 1000 / HZ, computed in 64 bits. */
	loops = (u64)tsc_khz * 1000;
	do_div(loops, HZ);
	lpj_fine = loops;

	use_tsc_delay();

	if (unsynchronized_tsc())
		mark_tsc_unstable("TSCs unsynchronized");

	check_system_tsc_reliable();

	detect_art();
}

1375
1376
1377
1378
1379
1380
1381
#ifdef CONFIG_SMP
/*
 * If we have a constant TSC and are using the TSC for the delay loop,
 * we can skip clock calibration if another cpu in the same socket has already
 * been calibrated. This assumes that CONSTANT_TSC applies to all
 * cpus in the socket - this should be a safe assumption.
 */
Paul Gortmaker's avatar