tsc.c 22.1 KB
Newer Older
Alok Kataria's avatar
Alok Kataria committed
1
#include <linux/kernel.h>
Alok Kataria's avatar
Alok Kataria committed
2
3
4
5
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/timer.h>
Alok Kataria's avatar
Alok Kataria committed
6
#include <linux/acpi_pmtmr.h>
7
#include <linux/cpufreq.h>
8
9
10
11
#include <linux/dmi.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
Alok Kataria's avatar
Alok Kataria committed
12
13

#include <asm/hpet.h>
14
15
16
17
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
18
#include <asm/hypervisor.h>
Alok Kataria's avatar
Alok Kataria committed
19
20
21
22
23
24
25
26
27

unsigned int cpu_khz;           /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
unsigned int tsc_khz;
EXPORT_SYMBOL(tsc_khz);

/*
 * TSC can be unstable due to cpufreq or due to unsynced TSCs
 */
28
static int tsc_unstable;
Alok Kataria's avatar
Alok Kataria committed
29
30
31
32

/* native_sched_clock() is called before tsc_init(), so
   we must start with the TSC soft disabled to prevent
   erroneous rdtsc usage on !cpu_has_tsc processors */
33
static int tsc_disabled = -1;
Alok Kataria's avatar
Alok Kataria committed
34

35
static int tsc_clocksource_reliable;
Alok Kataria's avatar
Alok Kataria committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/*
 * Scheduler clock - returns current time in nanosec units.
 *
 * While the TSC is soft-disabled (tsc_disabled is non-zero) the value is
 * derived from jiffies_64 instead of the TSC.
 */
u64 native_sched_clock(void)
{
	u64 cycles;

	/*
	 * Fall back to jiffies if there's no TSC available:
	 * ( But note that we still use it if the TSC is marked
	 *   unstable. We do this because unlike Time Of Day,
	 *   the scheduler clock tolerates small errors and it's
	 *   very important for it to be as fast as the platform
	 *   can achieve it. )
	 */
	if (unlikely(tsc_disabled)) {
		/* No locking but a rare wrong value is not a big deal: */
		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
	}

	/* read the Time Stamp Counter: */
	rdtscll(cycles);

	/* return the value in ns */
	return __cycles_2_ns(cycles);
}

/*
 * We need to define a real function for sched_clock, to override the
 * weak default version.
 *
 * With CONFIG_PARAVIRT the call is forwarded to paravirt_sched_clock().
 * Otherwise sched_clock is declared as an alias of native_sched_clock(),
 * i.e. both names refer to the same function.
 */
#ifdef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
{
	return paravirt_sched_clock();
}
#else
unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
#endif

/*
 * Report whether the TSC has been marked unstable.
 *
 * Returns the current value of the file-local tsc_unstable flag
 * (non-zero once the TSC has been marked unstable).
 */
int check_tsc_unstable(void)
{
	return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);

/*
 * Handler for the "notsc" boot parameter.
 *
 * CONFIG_X86_TSC kernels cannot drop the TSC completely, so they only
 * warn and soft-disable it through tsc_disabled.  Other kernels clear
 * the TSC cpu flag instead (takes effect in cpu/common.c).
 *
 * Always returns 1.
 */
int __init notsc_setup(char *str)
{
#ifdef CONFIG_X86_TSC
	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
			"cannot disable TSC completely.\n");
	tsc_disabled = 1;
#else
	setup_clear_cpu_cap(X86_FEATURE_TSC);
#endif
	return 1;
}

__setup("notsc", notsc_setup);
Alok Kataria's avatar
Alok Kataria committed
102

103
104
105
106
107
108
109
110
111
/*
 * Handler for the "tsc=" boot parameter.
 *
 * The only value acted upon is "reliable", which sets
 * tsc_clocksource_reliable.  Any other value is ignored.
 *
 * Always returns 1.
 */
static int __init tsc_setup(char *str)
{
	if (strcmp(str, "reliable") == 0)
		tsc_clocksource_reliable = 1;

	return 1;
}

__setup("tsc=", tsc_setup);

Alok Kataria's avatar
Alok Kataria committed
112
113
114
115
116
117
#define MAX_RETRIES     5
#define SMI_TRESHOLD    50000

/*
 * Read TSC and the reference counters. Take care of SMI disturbance
 */
118
static u64 tsc_read_refs(u64 *p, int hpet)
Alok Kataria's avatar
Alok Kataria committed
119
120
121
122
123
124
125
{
	u64 t1, t2;
	int i;

	for (i = 0; i < MAX_RETRIES; i++) {
		t1 = get_cycles();
		if (hpet)
126
			*p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
Alok Kataria's avatar
Alok Kataria committed
127
		else
128
			*p = acpi_pm_read_early();
Alok Kataria's avatar
Alok Kataria committed
129
130
131
132
133
134
135
		t2 = get_cycles();
		if ((t2 - t1) < SMI_TRESHOLD)
			return t2;
	}
	return ULLONG_MAX;
}

136
137
/*
 * Calculate the TSC frequency from HPET reference
Alok Kataria's avatar
Alok Kataria committed
138
 */
139
static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
Alok Kataria's avatar
Alok Kataria committed
140
{
141
	u64 tmp;
Alok Kataria's avatar
Alok Kataria committed
142

143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
	if (hpet2 < hpet1)
		hpet2 += 0x100000000ULL;
	hpet2 -= hpet1;
	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
	do_div(tmp, 1000000);
	do_div(deltatsc, tmp);

	return (unsigned long) deltatsc;
}

/*
 * Calculate the TSC frequency from PMTimer reference
 *
 * @deltatsc: elapsed TSC cycles; the caller in this file passes the
 *            cycle delta already multiplied by 10^6
 * @pm1:      PM timer sample at the start of the interval
 * @pm2:      PM timer sample at the end of the interval
 *
 * Returns @deltatsc divided by the elapsed reference time, or ULONG_MAX
 * when the PM timer is not usable: either both samples are zero or the
 * timer did not advance between them (which would otherwise end in a
 * division by zero).
 */
static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
{
	u64 tmp;

	if (!pm1 && !pm2)
		return ULONG_MAX;

	/* Compensate for a single counter overrun */
	if (pm2 < pm1)
		pm2 += (u64)ACPI_PM_OVRRUN;
	pm2 -= pm1;

	/* Elapsed ticks -> nanoseconds */
	tmp = pm2 * 1000000000LL;
	do_div(tmp, PMTMR_TICKS_PER_SEC);

	/* A stuck PM timer must not cause a divide error */
	if (!tmp)
		return ULONG_MAX;

	do_div(deltatsc, tmp);

	return (unsigned long) deltatsc;
}

173
#define CAL_MS		10
174
#define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS))
175
176
177
178
179
180
#define CAL_PIT_LOOPS	1000

#define CAL2_MS		50
#define CAL2_LATCH	(CLOCK_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS	5000

181

182
183
184
185
186
187
188
/*
 * Try to calibrate the TSC against the Programmable
 * Interrupt Timer and return the frequency of the TSC
 * in kHz.
 *
 * Return ULONG_MAX on failure to calibrate.
 */
189
static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
190
191
192
193
194
195
196
197
198
199
200
201
202
203
{
	u64 tsc, t1, t2, delta;
	unsigned long tscmin, tscmax;
	int pitcnt;

	/* Set the Gate high, disable speaker */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

	/*
	 * Setup CTC channel 2* for mode 0, (interrupt on terminal
	 * count mode), binary count. Set the latch register to 50ms
	 * (LSB then MSB) to begin countdown.
	 */
	outb(0xb0, 0x43);
204
205
	outb(latch & 0xff, 0x42);
	outb(latch >> 8, 0x42);
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

	tsc = t1 = t2 = get_cycles();

	pitcnt = 0;
	tscmax = 0;
	tscmin = ULONG_MAX;
	while ((inb(0x61) & 0x20) == 0) {
		t2 = get_cycles();
		delta = t2 - tsc;
		tsc = t2;
		if ((unsigned long) delta < tscmin)
			tscmin = (unsigned int) delta;
		if ((unsigned long) delta > tscmax)
			tscmax = (unsigned int) delta;
		pitcnt++;
	}

	/*
	 * Sanity checks:
	 *
226
	 * If we were not able to read the PIT more than loopmin
227
228
229
230
231
	 * times, then we have been hit by a massive SMI
	 *
	 * If the maximum is 10 times larger than the minimum,
	 * then we got hit by an SMI as well.
	 */
232
	if (pitcnt < loopmin || tscmax > 10 * tscmin)
233
234
235
236
		return ULONG_MAX;

	/* Calculate the PIT value */
	delta = t2 - t1;
237
	do_div(delta, ms);
238
239
240
	return delta;
}

Linus Torvalds's avatar
Linus Torvalds committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/*
 * This reads the current MSB of the PIT counter, and
 * checks if we are running on sufficiently fast and
 * non-virtualized hardware.
 *
 * Our expectations are:
 *
 *  - the PIT is running at roughly 1.19MHz
 *
 *  - each IO is going to take about 1us on real hardware,
 *    but we allow it to be much faster (by a factor of 10) or
 *    _slightly_ slower (ie we allow up to a 2us read+counter
 *    update - anything else implies an unacceptably slow CPU
 *    or PIT for the fast calibration to work).
 *
 *  - with 256 PIT ticks to read the value, we have 214us to
 *    see the same MSB (and overhead like doing a single TSC
 *    read per MSB value etc).
 *
 *  - We're doing 2 reads per loop (LSB, MSB), and we expect
 *    them each to take about a microsecond on real hardware.
 *    So we expect a count value of around 100. But we'll be
 *    generous, and accept anything over 50.
 *
 *  - if the PIT is stuck, and we see *many* more reads, we
 *    return early (and the next caller of pit_expect_msb()
 *    then considers it a failure when they don't see the
 *    next expected value).
 *
 * These expectations mean that we know that we have seen the
 * transition from one expected value to another with a fairly
 * high accuracy, and we didn't miss any events. We can thus
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequency.
 *
 * Returns non-zero when @val was read back as the MSB more than
 * 50 times before the loop ended, zero otherwise.
 */
static inline int pit_expect_msb(unsigned char val)
{
	int count;

	for (count = 0; count < 50000; count++) {
		/* Ignore LSB */
		inb(0x42);
		if (inb(0x42) != val)
			break;
	}
	return count > 50;
}

/*
 * How many MSB values do we want to see? We aim for a
 * 15ms calibration, which assuming a 2us counter read
 * error should give us roughly 150 ppm precision for
 * the calibration.
 */
#define QUICK_PIT_MS 15
#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
Alok Kataria's avatar
Alok Kataria committed
297

Linus Torvalds's avatar
Linus Torvalds committed
298
299
300
static unsigned long quick_pit_calibrate(void)
{
	/* Set the Gate high, disable speaker */
Alok Kataria's avatar
Alok Kataria committed
301
302
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

Linus Torvalds's avatar
Linus Torvalds committed
303
304
305
306
307
308
309
310
311
	/*
	 * Counter 2, mode 0 (one-shot), binary count
	 *
	 * NOTE! Mode 2 decrements by two (and then the
	 * output is flipped each time, giving the same
	 * final output frequency as a decrement-by-one),
	 * so mode 0 is much better when looking at the
	 * individual counts.
	 */
Alok Kataria's avatar
Alok Kataria committed
312
313
	outb(0xb0, 0x43);

Linus Torvalds's avatar
Linus Torvalds committed
314
315
316
317
	/* Start at 0xffff */
	outb(0xff, 0x42);
	outb(0xff, 0x42);

318
319
320
321
322
323
324
325
326
	/*
	 * The PIT starts counting at the next edge, so we
	 * need to delay for a microsecond. The easiest way
	 * to do that is to just read back the 16-bit counter
	 * once from the PIT.
	 */
	inb(0x42);
	inb(0x42);

Linus Torvalds's avatar
Linus Torvalds committed
327
328
329
330
331
332
333
334
335
336
337
338
	if (pit_expect_msb(0xff)) {
		int i;
		u64 t1, t2, delta;
		unsigned char expect = 0xfe;

		t1 = get_cycles();
		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
			if (!pit_expect_msb(expect))
				goto failed;
		}
		t2 = get_cycles();

339
340
341
		/*
		 * Make sure we can rely on the second TSC timestamp:
		 */
Ingo Molnar's avatar
Ingo Molnar committed
342
		if (!pit_expect_msb(expect))
343
344
			goto failed;

Linus Torvalds's avatar
Linus Torvalds committed
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
		/*
		 * Ok, if we get here, then we've seen the
		 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
		 * times, and each MSB had many hits, so we never
		 * had any sudden jumps.
		 *
		 * As a result, we can depend on there not being
		 * any odd delays anywhere, and the TSC reads are
		 * reliable.
		 *
		 * kHz = ticks / time-in-seconds / 1000;
		 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
		 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
		 */
		delta = (t2 - t1)*PIT_TICK_RATE;
		do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
		printk("Fast TSC calibration using PIT\n");
		return delta;
	}
failed:
	return 0;
}
367

Alok Kataria's avatar
Alok Kataria committed
368
/**
369
 * native_calibrate_tsc - calibrate the tsc on boot
Alok Kataria's avatar
Alok Kataria committed
370
 */
371
unsigned long native_calibrate_tsc(void)
Alok Kataria's avatar
Alok Kataria committed
372
{
373
	u64 tsc1, tsc2, delta, ref1, ref2;
374
	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
375
	unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
376
	int hpet = is_hpet_enabled(), i, loopmin;
Alok Kataria's avatar
Alok Kataria committed
377

378
379
380
381
382
383
	tsc_khz = get_hypervisor_tsc_freq();
	if (tsc_khz) {
		printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
		return tsc_khz;
	}

Linus Torvalds's avatar
Linus Torvalds committed
384
385
	local_irq_save(flags);
	fast_calibrate = quick_pit_calibrate();
Alok Kataria's avatar
Alok Kataria committed
386
	local_irq_restore(flags);
Linus Torvalds's avatar
Linus Torvalds committed
387
388
	if (fast_calibrate)
		return fast_calibrate;
Alok Kataria's avatar
Alok Kataria committed
389

390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
	/*
	 * Run 5 calibration loops to get the lowest frequency value
	 * (the best estimate). We use two different calibration modes
	 * here:
	 *
	 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
	 * load a timeout of 50ms. We read the time right after we
	 * started the timer and wait until the PIT count down reaches
	 * zero. In each wait loop iteration we read the TSC and check
	 * the delta to the previous read. We keep track of the min
	 * and max values of that delta. The delta is mostly defined
	 * by the IO time of the PIT access, so we can detect when a
	 * SMI/SMM disturbance happend between the two reads. If the
	 * maximum time is significantly larger than the minimum time,
	 * then we discard the result and have another try.
	 *
	 * 2) Reference counter. If available we use the HPET or the
	 * PMTIMER as a reference to check the sanity of that value.
	 * We use separate TSC readouts and check inside of the
	 * reference read for a SMI/SMM disturbance. We dicard
	 * disturbed values here as well. We do that around the PIT
	 * calibration delay loop as we have to wait for a certain
	 * amount of time anyway.
	 */
414
415
416
417
418
419
420

	/* Preset PIT loop values */
	latch = CAL_LATCH;
	ms = CAL_MS;
	loopmin = CAL_PIT_LOOPS;

	for (i = 0; i < 3; i++) {
421
		unsigned long tsc_pit_khz;
422
423
424

		/*
		 * Read the start value and the reference count of
425
426
427
		 * hpet/pmtimer when available. Then do the PIT
		 * calibration, which will take at least 50ms, and
		 * read the end value.
428
		 */
429
		local_irq_save(flags);
430
		tsc1 = tsc_read_refs(&ref1, hpet);
431
		tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
432
		tsc2 = tsc_read_refs(&ref2, hpet);
433
434
		local_irq_restore(flags);

435
436
		/* Pick the lowest PIT TSC calibration so far */
		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
437
438

		/* hpet or pmtimer available ? */
439
		if (!hpet && !ref1 && !ref2)
440
441
442
443
444
445
446
			continue;

		/* Check, whether the sampling was disturbed by an SMI */
		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
			continue;

		tsc2 = (tsc2 - tsc1) * 1000000LL;
447
		if (hpet)
448
			tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
449
		else
450
			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
451
452

		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468

		/* Check the reference deviation */
		delta = ((u64) tsc_pit_min) * 100;
		do_div(delta, tsc_ref_min);

		/*
		 * If both calibration results are inside a 10% window
		 * then we can be sure, that the calibration
		 * succeeded. We break out of the loop right away. We
		 * use the reference value, as it is more precise.
		 */
		if (delta >= 90 && delta <= 110) {
			printk(KERN_INFO
			       "TSC: PIT calibration matches %s. %d loops\n",
			       hpet ? "HPET" : "PMTIMER", i + 1);
			return tsc_ref_min;
469
470
		}

471
472
473
474
475
476
477
478
479
480
481
		/*
		 * Check whether PIT failed more than once. This
		 * happens in virtualized environments. We need to
		 * give the virtual PC a slightly longer timeframe for
		 * the HPET/PMTIMER to make the result precise.
		 */
		if (i == 1 && tsc_pit_min == ULONG_MAX) {
			latch = CAL2_LATCH;
			ms = CAL2_MS;
			loopmin = CAL2_PIT_LOOPS;
		}
482
	}
Alok Kataria's avatar
Alok Kataria committed
483
484

	/*
485
	 * Now check the results.
Alok Kataria's avatar
Alok Kataria committed
486
	 */
487
488
	if (tsc_pit_min == ULONG_MAX) {
		/* PIT gave no useful value */
489
		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
490
491

		/* We don't have an alternative source, disable TSC */
492
		if (!hpet && !ref1 && !ref2) {
493
494
495
496
497
498
499
			printk("TSC: No reference (HPET/PMTIMER) available\n");
			return 0;
		}

		/* The alternative source failed as well, disable TSC */
		if (tsc_ref_min == ULONG_MAX) {
			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
500
			       "failed.\n");
501
502
503
504
505
506
507
508
509
			return 0;
		}

		/* Use the alternative source */
		printk(KERN_INFO "TSC: using %s reference calibration\n",
		       hpet ? "HPET" : "PMTIMER");

		return tsc_ref_min;
	}
Alok Kataria's avatar
Alok Kataria committed
510

511
	/* We don't have an alternative source, use the PIT calibration value */
512
	if (!hpet && !ref1 && !ref2) {
513
514
		printk(KERN_INFO "TSC: Using PIT calibration value\n");
		return tsc_pit_min;
Alok Kataria's avatar
Alok Kataria committed
515
516
	}

517
518
	/* The alternative source failed, use the PIT calibration value */
	if (tsc_ref_min == ULONG_MAX) {
519
520
		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
		       "Using PIT calibration\n");
521
		return tsc_pit_min;
Alok Kataria's avatar
Alok Kataria committed
522
523
	}

524
525
526
	/*
	 * The calibration values differ too much. In doubt, we use
	 * the PIT value as we know that there are PMTIMERs around
527
	 * running at double speed. At least we let the user know:
528
	 */
529
530
	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
531
532
	printk(KERN_INFO "TSC: Using PIT calibration value\n");
	return tsc_pit_min;
Alok Kataria's avatar
Alok Kataria committed
533
534
535
536
537
538
539
540
541
542
}

#ifdef CONFIG_X86_32
/*
 * Only called from the Powernow K7 cpu freq driver
 *
 * Re-runs the TSC calibration, updates tsc_khz and cpu_khz and rescales
 * loops_per_jiffy of CPU 0 accordingly.
 *
 * Returns 0 on success, -ENODEV on SMP kernels or when the CPU has no
 * TSC.
 */
int recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
	unsigned long cpu_khz_old = cpu_khz;

	if (!cpu_has_tsc)
		return -ENODEV;

	tsc_khz = calibrate_tsc();
	cpu_khz = tsc_khz;
	cpu_data(0).loops_per_jiffy =
		cpufreq_scale(cpu_data(0).loops_per_jiffy,
				cpu_khz_old, cpu_khz);
	return 0;
#else
	return -ENODEV;
#endif
}

EXPORT_SYMBOL(recalibrate_cpu_khz);

#endif /* CONFIG_X86_32 */
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583

/* Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_khz * 10^3))
 *              ns = cycles * (10^6 / cpu_khz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^6 * SC / cpu_khz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift.
 *
 *  We can use khz divisor instead of mhz to keep a better precision, since
 *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
 *  (mathieu.desnoyers@polymtl.ca)
 *
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */

DEFINE_PER_CPU(unsigned long, cyc2ns);

584
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
{
	unsigned long long tsc_now, ns_now;
	unsigned long flags, *scale;

	local_irq_save(flags);
	sched_clock_idle_sleep_event();

	scale = &per_cpu(cyc2ns, cpu);

	rdtscll(tsc_now);
	ns_now = __cycles_2_ns(tsc_now);

	if (cpu_khz)
		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;

	sched_clock_idle_wakeup_event(0);
	local_irq_restore(flags);
}

#ifdef CONFIG_CPU_FREQ

/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
 * changes.
 *
 * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
 * not that important because current Opteron setups do not support
 * scaling on SMP anyroads.
 *
 * Should fix up last_tsc too. Currently gettimeofday in the
 * first tick after the change will be slightly wrong.
 */

static unsigned int  ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long tsc_khz_ref;

/*
 * cpufreq transition notifier: rescale loops_per_jiffy, tsc_khz and the
 * cyc2ns factor when the frequency of a CPU without a constant TSC
 * changes.
 *
 * @nb:   notifier block (unused)
 * @val:  transition phase (CPUFREQ_PRECHANGE, CPUFREQ_POSTCHANGE or
 *        CPUFREQ_RESUMECHANGE)
 * @data: the struct cpufreq_freqs describing the transition
 *
 * The first notification records the reference frequency together with
 * the loops_per_jiffy and tsc_khz values that belong to it; all later
 * scaling is done relative to those.
 *
 * Always returns 0.
 */
static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct cpufreq_freqs *freq = data;
	/*
	 * dummy absorbs the loops_per_jiffy update for CPUFREQ_CONST_LOOPS
	 * drivers. It is read through *lpj below, so it must start out
	 * with a defined value.
	 */
	unsigned long *lpj, dummy = 0;

	if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
		return 0;

	lpj = &dummy;
	if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
#ifdef CONFIG_SMP
		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
#else
		lpj = &boot_cpu_data.loops_per_jiffy;
#endif
	}

	if (!ref_freq) {
		ref_freq = freq->old;
		loops_per_jiffy_ref = *lpj;
		tsc_khz_ref = tsc_khz;
	}
	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
			(val == CPUFREQ_RESUMECHANGE)) {
		*lpj = 	cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
			mark_tsc_unstable("cpufreq changes");
	}

	set_cyc2ns_scale(tsc_khz, freq->cpu);

	return 0;
}

/*
 * Notifier block that routes cpufreq notifications to
 * time_cpufreq_notifier(); registered by cpufreq_tsc().
 */
static struct notifier_block time_cpufreq_notifier_block = {
	.notifier_call  = time_cpufreq_notifier
};

/*
 * Register the cpufreq transition notifier, unless the CPU has no TSC
 * or the boot CPU has a constant TSC.
 *
 * Always returns 0.
 */
static int __init cpufreq_tsc(void)
{
	if (!cpu_has_tsc || boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return 0;

	cpufreq_register_notifier(&time_cpufreq_notifier_block,
				CPUFREQ_TRANSITION_NOTIFIER);
	return 0;
}

core_initcall(cpufreq_tsc);

#endif /* CONFIG_CPU_FREQ */
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700

/* clocksource code */

static struct clocksource clocksource_tsc;

/*
 * We compare the TSC to the cycle_last value in the clocksource
 * structure to avoid a nasty time-warp. This can be observed in a
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
 * is slightly behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
 * timer.
 *
 * Returns the current TSC value, but never less than
 * clocksource_tsc.cycle_last.
 */
static cycle_t read_tsc(void)
{
	cycle_t now = (cycle_t)get_cycles();

	if (now < clocksource_tsc.cycle_last)
		return clocksource_tsc.cycle_last;

	return now;
}

701
#ifdef CONFIG_X86_64
702
703
704
705
706
707
708
/*
 * vsyscall counterpart of read_tsc(): returns the current TSC value,
 * but never less than the cycle_last stored in __vsyscall_gtod_data.
 */
static cycle_t __vsyscall_fn vread_tsc(void)
{
	cycle_t now = (cycle_t)vget_cycles();

	if (now < __vsyscall_gtod_data.clock.cycle_last)
		return __vsyscall_gtod_data.clock.cycle_last;

	return now;
}
709
#endif
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759

/*
 * The TSC clocksource. .mult is not set here; it is filled in by
 * init_tsc_clocksource() from tsc_khz right before registration.
 */
static struct clocksource clocksource_tsc = {
	.name                   = "tsc",
	/* set to 0 once the TSC is known to be unstable */
	.rating                 = 300,
	.read                   = read_tsc,
	.mask                   = CLOCKSOURCE_MASK(64),
	.shift                  = 22,
	/* MUST_VERIFY is cleared again when tsc_clocksource_reliable is set */
	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
	.vread                  = vread_tsc,
#endif
};

/*
 * Mark the TSC unstable and drop the rating of its clocksource to 0.
 *
 * @reason: short text that is printed together with the notice
 *
 * Only the first call has any effect; later calls return immediately.
 */
void mark_tsc_unstable(char *reason)
{
	if (tsc_unstable)
		return;

	tsc_unstable = 1;
	printk("Marking TSC unstable due to %s\n", reason);

	/*
	 * Change only the rating, when not registered. A zero mult means
	 * init_tsc_clocksource() has not set the clocksource up yet.
	 */
	if (!clocksource_tsc.mult)
		clocksource_tsc.rating = 0;
	else
		clocksource_change_rating(&clocksource_tsc, 0);
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

/*
 * DMI match callback: flag the TSC as unstable on a system listed in
 * bad_tsc_dmi_table and log which entry matched.
 *
 * Always returns 0.
 */
static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
{
	tsc_unstable = 1;
	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
			d->ident);

	return 0;
}

/*
 * List of systems that have known TSC problems.
 *
 * Every match invokes dmi_mark_tsc_unstable(). The table is terminated
 * by an empty entry and is scanned from tsc_init().
 */
static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
	{
		.callback = dmi_mark_tsc_unstable,
		.ident = "IBM Thinkpad 380XD",
		.matches = {
			DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
			DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
		},
	},
	{}
};

760
761
/*
 * Set tsc_clocksource_reliable when the hardware tells us that the TSC
 * can be trusted: the Geode LX RTSC_SUSP bit (CONFIG_MGEODE_LX only) or
 * the X86_FEATURE_TSC_RELIABLE cpu flag.
 */
static void __init check_system_tsc_reliable(void)
{
#ifdef CONFIG_MGEODE_LX
	/* RTSC counts during suspend */
#define RTSC_SUSP 0x100
	unsigned long res_low, res_high;

	/*
	 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC.
	 *
	 * rdmsr_safe() returns non-zero when the MSR read faulted; the
	 * values it hands back can not be trusted then, so only look at
	 * res_low after a successful read.
	 */
	if (!rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high) &&
	    (res_low & RTSC_SUSP))
		tsc_clocksource_reliable = 1;
#endif
	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
		tsc_clocksource_reliable = 1;
}
775
776
777
778
779
780
781
782
783
784

/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 */
__cpuinit int unsynchronized_tsc(void)
{
	if (!cpu_has_tsc || tsc_unstable)
		return 1;

785
#ifdef CONFIG_X86_SMP
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
	if (apic_is_clustered_box())
		return 1;
#endif

	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return 0;
	/*
	 * Intel systems are normally all synchronized.
	 * Exceptions must mark TSC as unstable:
	 */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
		/* assume multi socket systems are not synchronized: */
		if (num_possible_cpus() > 1)
			tsc_unstable = 1;
	}

	return tsc_unstable;
}

/*
 * Finish the setup of clocksource_tsc and register it.
 *
 * mult is derived from tsc_khz, the verification requirement is dropped
 * for TSCs flagged as reliable, and a TSC that is already known to be
 * unstable is registered with rating 0 and without the continuous flag.
 */
static void __init init_tsc_clocksource(void)
{
	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
			clocksource_tsc.shift);

	if (tsc_clocksource_reliable)
		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;

	/* lower the rating if we already know its unstable: */
	if (check_tsc_unstable()) {
		clocksource_tsc.rating = 0;
		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
	}

	clocksource_register(&clocksource_tsc);
}

/*
 * Boot time TSC setup: calibrate the TSC, publish tsc_khz/cpu_khz, set
 * up the cyc2ns scale factors of all possible CPUs, enable the TSC
 * based sched_clock and delay loop, and register the TSC clocksource.
 *
 * Does nothing on CPUs without a TSC. When the calibration fails the
 * TSC is marked unstable and the rest of the setup is skipped.
 */
void __init tsc_init(void)
{
	u64 lpj;
	int cpu;

	if (!cpu_has_tsc)
		return;

	tsc_khz = calibrate_tsc();
	cpu_khz = tsc_khz;

	if (!tsc_khz) {
		mark_tsc_unstable("could not calculate TSC khz");
		return;
	}

#ifdef CONFIG_X86_64
	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
		cpu_khz = calibrate_cpu();
#endif

	printk("Detected %lu.%03lu MHz processor.\n",
			(unsigned long)cpu_khz / 1000,
			(unsigned long)cpu_khz % 1000);

	/*
	 * Secondary CPUs do not run through tsc_init(), so set up
	 * all the scale factors for all CPUs, assuming the same
	 * speed as the bootup CPU. (cpufreq notifiers will fix this
	 * up if their speed diverges)
	 */
	for_each_possible_cpu(cpu)
		set_cyc2ns_scale(cpu_khz, cpu);

	/* "notsc" was given on the command line: keep the TSC disabled */
	if (tsc_disabled > 0)
		return;

	/* now allow native_sched_clock() to use rdtsc */
	tsc_disabled = 0;

	/* TSC cycles per jiffy */
	lpj = ((u64)tsc_khz * 1000);
	do_div(lpj, HZ);
	lpj_fine = lpj;

	use_tsc_delay();
	/* Check and install the TSC clocksource */
	dmi_check_system(bad_tsc_dmi_table);

	if (unsynchronized_tsc())
		mark_tsc_unstable("TSCs unsynchronized");

	check_system_tsc_reliable();
	init_tsc_clocksource();
}