/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	u64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed*/
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

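/*
 * Probe the PMU before use: warn if the BIOS has already claimed counters,
 * and verify that an unused counter MSR reads back what was written so that
 * emulators which silently ignore the MSRs are caught early.
 */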
static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new= ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
			      reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

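/*
 * Decode a PERF_TYPE_HW_CACHE config (cache type, op, result) into the
 * hardware event encoding from hw_cache_event_ids.
 */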
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

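/*
 * Reserve the counter/eventsel MSRs and the DS buffers on first use; later
 * callers just take another reference on pmc_refcount.
 */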
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create event of a certain type (that no conflicting events
 * are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	/*
	 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
	 * LBR and BTS are still mutually exclusive.
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return 0;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
	atomic_dec(&active_events);
}

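/*
 * Translate the generic attr into a hardware event config: set the default
 * sample period, handle raw/cache/generic event types and the BTS special
 * case for sampling branch instructions with a period of 1.
 */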
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

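/*
 * Validate precise_ip/PEBS and branch-stack settings and build the base
 * event-select bits (interrupt, user/OS, raw mask) for the event.
 */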
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;

			if (x86_pmu.pebs_prec_dist)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;

		/* There's no sense in having PEBS for non sampling events: */
		if (!is_sampling_event(event))
			return -EINVAL;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify  branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

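/* Clear the enable bit of every active counter on this CPU. */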
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

/*
 * There may be PMI landing after enabled=0. The PMI hitting could be before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

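/* Re-enable every active counter on this CPU. */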
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
715
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
751
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used)) {
			if (sched->state.nr_gp++ >= sched->max_gp)
				return false;

			goto done;
		}
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		cpuc->event_constraint[i] = NULL;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		cpuc->event_constraint[i] = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There is no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	} else {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

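/*
 * Program the hw_perf_event with the counter index and MSR bases that the
 * scheduler assigned (BTS, fixed or general-purpose counter).
 */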
static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
	}
}

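/*
 * Check whether the event can keep the counter it was assigned last time
 * on this CPU, so reprogramming can be skipped.
 */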
static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

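/*
 * Counterpart of x86_pmu_disable(): reprogram any events that were moved
 * to new counters while the PMU was disabled, then enable all counters.
 */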
static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		/*
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);