1/*
2 * Performance events x86 architecture code
3 *
4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 *  Copyright (C) 2009 Jaswinder Singh Rajput
7 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
11 *
12 *  For licencing details see kernel-base/COPYING
13 */
14
15#include <linux/perf_event.h>
16#include <linux/capability.h>
17#include <linux/notifier.h>
18#include <linux/hardirq.h>
19#include <linux/kprobes.h>
20#include <linux/module.h>
21#include <linux/kdebug.h>
22#include <linux/sched.h>
23#include <linux/uaccess.h>
24#include <linux/slab.h>
25#include <linux/cpu.h>
26#include <linux/bitops.h>
27#include <linux/device.h>
28
29#include <asm/apic.h>
30#include <asm/stacktrace.h>
31#include <asm/nmi.h>
32#include <asm/smp.h>
33#include <asm/alternative.h>
34#include <asm/mmu_context.h>
35#include <asm/tlbflush.h>
36#include <asm/timer.h>
37#include <asm/desc.h>
38#include <asm/ldt.h>
39
40#include "perf_event.h"
41
42struct x86_pmu x86_pmu __read_mostly;
43
44DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
45	.enabled = 1,
46};
47
48struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
49
50u64 __read_mostly hw_cache_event_ids
51				[PERF_COUNT_HW_CACHE_MAX]
52				[PERF_COUNT_HW_CACHE_OP_MAX]
53				[PERF_COUNT_HW_CACHE_RESULT_MAX];
54u64 __read_mostly hw_cache_extra_regs
55				[PERF_COUNT_HW_CACHE_MAX]
56				[PERF_COUNT_HW_CACHE_OP_MAX]
57				[PERF_COUNT_HW_CACHE_RESULT_MAX];
58
/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the current (new) raw count read from the counter.
 */
64u64 x86_perf_event_update(struct perf_event *event)
65{
66	struct hw_perf_event *hwc = &event->hw;
67	int shift = 64 - x86_pmu.cntval_bits;
68	u64 prev_raw_count, new_raw_count;
69	int idx = hwc->idx;
70	s64 delta;
71
72	if (idx == INTEL_PMC_IDX_FIXED_BTS)
73		return 0;
74
75	/*
76	 * Careful: an NMI might modify the previous event value.
77	 *
78	 * Our tactic to handle this is to first atomically read and
79	 * exchange a new raw count - then add that new-prev delta
80	 * count to the generic event atomically:
81	 */
82again:
83	prev_raw_count = local64_read(&hwc->prev_count);
84	rdpmcl(hwc->event_base_rdpmc, new_raw_count);
85
86	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
87					new_raw_count) != prev_raw_count)
88		goto again;
89
90	/*
91	 * Now we have the new raw value and have updated the prev
92	 * timestamp already. We can now calculate the elapsed delta
93	 * (event-)time and add that to the generic event.
94	 *
95	 * Careful, not all hw sign-extends above the physical width
96	 * of the count.
97	 */
98	delta = (new_raw_count << shift) - (prev_raw_count << shift);
99	delta >>= shift;
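	/*
	 * Worked example (assuming 48-bit counters, so shift == 16): if the
	 * counter wrapped from 0xffffffffffff to 0x5, then
	 * (0x5 << 16) - (0xffffffffffff << 16) == 0x60000 as a 64-bit value,
	 * and the arithmetic shift right by 16 yields the small positive
	 * delta of 6 instead of a huge bogus value.
	 */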
100
101	local64_add(delta, &event->count);
102	local64_sub(delta, &hwc->period_left);
103
104	return new_raw_count;
105}
106
107/*
108 * Find and validate any extra registers to set up.
109 */
110static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
111{
112	struct hw_perf_event_extra *reg;
113	struct extra_reg *er;
114
115	reg = &event->hw.extra_reg;
116
117	if (!x86_pmu.extra_regs)
118		return 0;
119
120	for (er = x86_pmu.extra_regs; er->msr; er++) {
121		if (er->event != (config & er->config_mask))
122			continue;
123		if (event->attr.config1 & ~er->valid_mask)
124			return -EINVAL;
		/* Check if the extra MSRs can be safely accessed */
126		if (!er->extra_msr_access)
127			return -ENXIO;
128
129		reg->idx = er->idx;
130		reg->config = event->attr.config1;
131		reg->reg = er->msr;
132		break;
133	}
134	return 0;
135}
136
137static atomic_t active_events;
138static DEFINE_MUTEX(pmc_reserve_mutex);
139
140#ifdef CONFIG_X86_LOCAL_APIC
141
142static bool reserve_pmc_hardware(void)
143{
144	int i;
145
146	for (i = 0; i < x86_pmu.num_counters; i++) {
147		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
148			goto perfctr_fail;
149	}
150
151	for (i = 0; i < x86_pmu.num_counters; i++) {
152		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
153			goto eventsel_fail;
154	}
155
156	return true;
157
158eventsel_fail:
159	for (i--; i >= 0; i--)
160		release_evntsel_nmi(x86_pmu_config_addr(i));
161
162	i = x86_pmu.num_counters;
163
164perfctr_fail:
165	for (i--; i >= 0; i--)
166		release_perfctr_nmi(x86_pmu_event_addr(i));
167
168	return false;
169}
170
171static void release_pmc_hardware(void)
172{
173	int i;
174
175	for (i = 0; i < x86_pmu.num_counters; i++) {
176		release_perfctr_nmi(x86_pmu_event_addr(i));
177		release_evntsel_nmi(x86_pmu_config_addr(i));
178	}
179}
180
181#else
182
183static bool reserve_pmc_hardware(void) { return true; }
184static void release_pmc_hardware(void) {}
185
186#endif
187
188static bool check_hw_exists(void)
189{
	u64 val, val_fail, val_new = ~0;
191	int i, reg, reg_fail, ret = 0;
192	int bios_fail = 0;
193	int reg_safe = -1;
194
	/*
	 * Check to see if the BIOS enabled any of the counters; if so,
	 * complain and bail.
	 */
199	for (i = 0; i < x86_pmu.num_counters; i++) {
200		reg = x86_pmu_config_addr(i);
201		ret = rdmsrl_safe(reg, &val);
202		if (ret)
203			goto msr_fail;
204		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
205			bios_fail = 1;
206			val_fail = val;
207			reg_fail = reg;
208		} else {
209			reg_safe = i;
210		}
211	}
212
213	if (x86_pmu.num_counters_fixed) {
214		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
215		ret = rdmsrl_safe(reg, &val);
216		if (ret)
217			goto msr_fail;
218		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
219			if (val & (0x03 << i*4)) {
220				bios_fail = 1;
221				val_fail = val;
222				reg_fail = reg;
223			}
224		}
225	}
226
227	/*
228	 * If all the counters are enabled, the below test will always
229	 * fail.  The tools will also become useless in this scenario.
230	 * Just fail and disable the hardware counters.
231	 */
232
233	if (reg_safe == -1) {
234		reg = reg_safe;
235		goto msr_fail;
236	}
237
238	/*
239	 * Read the current value, change it and read it back to see if it
240	 * matches, this is needed to detect certain hardware emulators
241	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
242	 */
243	reg = x86_pmu_event_addr(reg_safe);
244	if (rdmsrl_safe(reg, &val))
245		goto msr_fail;
246	val ^= 0xffffUL;
247	ret = wrmsrl_safe(reg, val);
248	ret |= rdmsrl_safe(reg, &val_new);
249	if (ret || val != val_new)
250		goto msr_fail;
251
252	/*
253	 * We still allow the PMU driver to operate:
254	 */
255	if (bios_fail) {
256		printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
257		printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
258	}
259
260	return true;
261
262msr_fail:
263	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
264	printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
265		boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
266		reg, val_new);
267
268	return false;
269}
270
271static void hw_perf_event_destroy(struct perf_event *event)
272{
273	x86_release_hardware();
274}
275
276void hw_perf_lbr_event_destroy(struct perf_event *event)
277{
278	hw_perf_event_destroy(event);
279
280	/* undo the lbr/bts event accounting */
281	x86_del_exclusive(x86_lbr_exclusive_lbr);
282}
283
284static inline int x86_pmu_initialized(void)
285{
286	return x86_pmu.handle_irq != NULL;
287}
288
289static inline int
290set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
291{
292	struct perf_event_attr *attr = &event->attr;
293	unsigned int cache_type, cache_op, cache_result;
294	u64 config, val;
295
296	config = attr->config;
297
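	/*
	 * Generic cache events are encoded in attr->config as
	 * (cache_id) | (op_id << 8) | (result_id << 16); the three fields
	 * are unpacked and validated below (see the PERF_TYPE_HW_CACHE
	 * description in include/uapi/linux/perf_event.h).
	 */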
298	cache_type = (config >>  0) & 0xff;
299	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
300		return -EINVAL;
301
302	cache_op = (config >>  8) & 0xff;
303	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
304		return -EINVAL;
305
306	cache_result = (config >> 16) & 0xff;
307	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
308		return -EINVAL;
309
310	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
311
312	if (val == 0)
313		return -ENOENT;
314
315	if (val == -1)
316		return -EINVAL;
317
318	hwc->config |= val;
319	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
320	return x86_pmu_extra_regs(val, event);
321}
322
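/*
 * Reference-counted reservation of the PMC hardware: the first caller
 * (for which atomic_inc_not_zero() fails because the count is zero)
 * takes the slow path under pmc_reserve_mutex and actually reserves the
 * counter and DS/PEBS resources; later callers just bump active_events.
 */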
323int x86_reserve_hardware(void)
324{
325	int err = 0;
326
327	if (!atomic_inc_not_zero(&active_events)) {
328		mutex_lock(&pmc_reserve_mutex);
329		if (atomic_read(&active_events) == 0) {
330			if (!reserve_pmc_hardware())
331				err = -EBUSY;
332			else
333				reserve_ds_buffers();
334		}
335		if (!err)
336			atomic_inc(&active_events);
337		mutex_unlock(&pmc_reserve_mutex);
338	}
339
340	return err;
341}
342
343void x86_release_hardware(void)
344{
345	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
346		release_pmc_hardware();
347		release_ds_buffers();
348		mutex_unlock(&pmc_reserve_mutex);
349	}
350}
351
/*
 * Check if we can create an event of a certain type (i.e. that no
 * conflicting events are present).
 */
356int x86_add_exclusive(unsigned int what)
357{
358	int ret = -EBUSY, i;
359
360	if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
361		return 0;
362
363	mutex_lock(&pmc_reserve_mutex);
364	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
365		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
366			goto out;
367	}
368
369	atomic_inc(&x86_pmu.lbr_exclusive[what]);
370	ret = 0;
371
372out:
373	mutex_unlock(&pmc_reserve_mutex);
374	return ret;
375}
376
377void x86_del_exclusive(unsigned int what)
378{
379	atomic_dec(&x86_pmu.lbr_exclusive[what]);
380}
381
382int x86_setup_perfctr(struct perf_event *event)
383{
384	struct perf_event_attr *attr = &event->attr;
385	struct hw_perf_event *hwc = &event->hw;
386	u64 config;
387
388	if (!is_sampling_event(event)) {
389		hwc->sample_period = x86_pmu.max_period;
390		hwc->last_period = hwc->sample_period;
391		local64_set(&hwc->period_left, hwc->sample_period);
392	}
393
394	if (attr->type == PERF_TYPE_RAW)
395		return x86_pmu_extra_regs(event->attr.config, event);
396
397	if (attr->type == PERF_TYPE_HW_CACHE)
398		return set_ext_hw_attr(hwc, event);
399
400	if (attr->config >= x86_pmu.max_events)
401		return -EINVAL;
402
403	/*
404	 * The generic map:
405	 */
406	config = x86_pmu.event_map(attr->config);
407
408	if (config == 0)
409		return -ENOENT;
410
411	if (config == -1LL)
412		return -EINVAL;
413
414	/*
415	 * Branch tracing:
416	 */
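	/*
	 * A non-freq branch-instructions event with a sample_period of 1
	 * is the historical way of requesting BTS, which records every
	 * taken branch rather than sampling them.
	 */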
417	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
418	    !attr->freq && hwc->sample_period == 1) {
419		/* BTS is not supported by this architecture. */
420		if (!x86_pmu.bts_active)
421			return -EOPNOTSUPP;
422
423		/* BTS is currently only allowed for user-mode. */
424		if (!attr->exclude_kernel)
425			return -EOPNOTSUPP;
426
427		/* disallow bts if conflicting events are present */
428		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
429			return -EBUSY;
430
431		event->destroy = hw_perf_lbr_event_destroy;
432	}
433
434	hwc->config |= config;
435
436	return 0;
437}
438
/*
 * Check that branch_sample_type is compatible with the
 * settings needed for precise_ip > 1, which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement.
 */
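/*
 * For example: an event with exclude_kernel set and exclude_user clear
 * is only compatible with a branch_sample_type that requests ANY taken
 * branches at the user privilege level and not the kernel one, i.e.
 * PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_USER.
 */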
445static inline int precise_br_compat(struct perf_event *event)
446{
447	u64 m = event->attr.branch_sample_type;
448	u64 b = 0;
449
450	/* must capture all branches */
451	if (!(m & PERF_SAMPLE_BRANCH_ANY))
452		return 0;
453
454	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
455
456	if (!event->attr.exclude_user)
457		b |= PERF_SAMPLE_BRANCH_USER;
458
459	if (!event->attr.exclude_kernel)
460		b |= PERF_SAMPLE_BRANCH_KERNEL;
461
462	/*
463	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
464	 */
465
466	return m == b;
467}
468
469int x86_pmu_hw_config(struct perf_event *event)
470{
471	if (event->attr.precise_ip) {
472		int precise = 0;
473
474		/* Support for constant skid */
475		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
476			precise++;
477
478			/* Support for IP fixup */
479			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
480				precise++;
481		}
482
483		if (event->attr.precise_ip > precise)
484			return -EOPNOTSUPP;
485	}
486	/*
487	 * check that PEBS LBR correction does not conflict with
488	 * whatever the user is asking with attr->branch_sample_type
489	 */
490	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
491		u64 *br_type = &event->attr.branch_sample_type;
492
493		if (has_branch_stack(event)) {
494			if (!precise_br_compat(event))
495				return -EOPNOTSUPP;
496
497			/* branch_sample_type is compatible */
498
499		} else {
			/*
			 * The user did not specify branch_sample_type.
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
507			*br_type = PERF_SAMPLE_BRANCH_ANY;
508
509			if (!event->attr.exclude_user)
510				*br_type |= PERF_SAMPLE_BRANCH_USER;
511
512			if (!event->attr.exclude_kernel)
513				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
514		}
515	}
516
517	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
518		event->attach_state |= PERF_ATTACH_TASK_DATA;
519
520	/*
521	 * Generate PMC IRQs:
522	 * (keep 'enabled' bit clear for now)
523	 */
524	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
525
526	/*
527	 * Count user and OS events unless requested not to
528	 */
529	if (!event->attr.exclude_user)
530		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
531	if (!event->attr.exclude_kernel)
532		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
533
534	if (event->attr.type == PERF_TYPE_RAW)
535		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
536
537	if (event->attr.sample_period && x86_pmu.limit_period) {
538		if (x86_pmu.limit_period(event, event->attr.sample_period) >
539				event->attr.sample_period)
540			return -EINVAL;
541	}
542
543	return x86_setup_perfctr(event);
544}
545
546/*
547 * Setup the hardware configuration for a given attr_type
548 */
549static int __x86_pmu_event_init(struct perf_event *event)
550{
551	int err;
552
553	if (!x86_pmu_initialized())
554		return -ENODEV;
555
556	err = x86_reserve_hardware();
557	if (err)
558		return err;
559
560	event->destroy = hw_perf_event_destroy;
561
562	event->hw.idx = -1;
563	event->hw.last_cpu = -1;
564	event->hw.last_tag = ~0ULL;
565
566	/* mark unused */
567	event->hw.extra_reg.idx = EXTRA_REG_NONE;
568	event->hw.branch_reg.idx = EXTRA_REG_NONE;
569
570	return x86_pmu.hw_config(event);
571}
572
573void x86_pmu_disable_all(void)
574{
575	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
576	int idx;
577
578	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
579		u64 val;
580
581		if (!test_bit(idx, cpuc->active_mask))
582			continue;
583		rdmsrl(x86_pmu_config_addr(idx), val);
584		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
585			continue;
586		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
587		wrmsrl(x86_pmu_config_addr(idx), val);
588	}
589}
590
591static void x86_pmu_disable(struct pmu *pmu)
592{
593	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
594
595	if (!x86_pmu_initialized())
596		return;
597
598	if (!cpuc->enabled)
599		return;
600
601	cpuc->n_added = 0;
602	cpuc->enabled = 0;
603	barrier();
604
605	x86_pmu.disable_all();
606}
607
608void x86_pmu_enable_all(int added)
609{
610	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
611	int idx;
612
613	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
614		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
615
616		if (!test_bit(idx, cpuc->active_mask))
617			continue;
618
619		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
620	}
621}
622
623static struct pmu pmu;
624
625static inline int is_x86_event(struct perf_event *event)
626{
627	return event->pmu == &pmu;
628}
629
/*
 * Event scheduler state:
 *
 * Assign events by iterating over all events and counters, beginning
 * with the events of least constraint weight. Keep the current iterator
 * state in struct sched_state.
 */
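/*
 * Roughly, the scheduling below is a greedy first-fit with limited
 * backtracking: events are visited in order of increasing constraint
 * weight (events that fit on fewer counters are placed first), each
 * event grabs the first free counter its constraint allows, and a
 * snapshot of the scheduler state is saved/restored around "overlap"
 * constraints so a failed placement can be retried on another counter.
 */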
637struct sched_state {
638	int	weight;
639	int	event;		/* event index */
640	int	counter;	/* counter index */
641	int	unassigned;	/* number of events to be assigned left */
642	int	nr_gp;		/* number of GP counters used */
643	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
644};
645
646/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
647#define	SCHED_STATES_MAX	2
648
649struct perf_sched {
650	int			max_weight;
651	int			max_events;
652	int			max_gp;
653	int			saved_states;
654	struct event_constraint	**constraints;
655	struct sched_state	state;
656	struct sched_state	saved[SCHED_STATES_MAX];
657};
658
/*
 * Initialize the iterator that runs through all events and counters.
 */
662static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
663			    int num, int wmin, int wmax, int gpmax)
664{
665	int idx;
666
667	memset(sched, 0, sizeof(*sched));
668	sched->max_events	= num;
669	sched->max_weight	= wmax;
670	sched->max_gp		= gpmax;
671	sched->constraints	= constraints;
672
673	for (idx = 0; idx < num; idx++) {
674		if (constraints[idx]->weight == wmin)
675			break;
676	}
677
678	sched->state.event	= idx;		/* start with min weight */
679	sched->state.weight	= wmin;
680	sched->state.unassigned	= num;
681}
682
683static void perf_sched_save_state(struct perf_sched *sched)
684{
685	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
686		return;
687
688	sched->saved[sched->saved_states] = sched->state;
689	sched->saved_states++;
690}
691
692static bool perf_sched_restore_state(struct perf_sched *sched)
693{
694	if (!sched->saved_states)
695		return false;
696
697	sched->saved_states--;
698	sched->state = sched->saved[sched->saved_states];
699
700	/* continue with next counter: */
701	clear_bit(sched->state.counter++, sched->state.used);
702
703	return true;
704}
705
706/*
707 * Select a counter for the current event to schedule. Return true on
708 * success.
709 */
710static bool __perf_sched_find_counter(struct perf_sched *sched)
711{
712	struct event_constraint *c;
713	int idx;
714
715	if (!sched->state.unassigned)
716		return false;
717
718	if (sched->state.event >= sched->max_events)
719		return false;
720
721	c = sched->constraints[sched->state.event];
	/* Prefer fixed-purpose counters */
723	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
724		idx = INTEL_PMC_IDX_FIXED;
725		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
726			if (!__test_and_set_bit(idx, sched->state.used))
727				goto done;
728		}
729	}
730
731	/* Grab the first unused counter starting with idx */
732	idx = sched->state.counter;
733	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
734		if (!__test_and_set_bit(idx, sched->state.used)) {
735			if (sched->state.nr_gp++ >= sched->max_gp)
736				return false;
737
738			goto done;
739		}
740	}
741
742	return false;
743
744done:
745	sched->state.counter = idx;
746
747	if (c->overlap)
748		perf_sched_save_state(sched);
749
750	return true;
751}
752
753static bool perf_sched_find_counter(struct perf_sched *sched)
754{
755	while (!__perf_sched_find_counter(sched)) {
756		if (!perf_sched_restore_state(sched))
757			return false;
758	}
759
760	return true;
761}
762
763/*
764 * Go through all unassigned events and find the next one to schedule.
765 * Take events with the least weight first. Return true on success.
766 */
767static bool perf_sched_next_event(struct perf_sched *sched)
768{
769	struct event_constraint *c;
770
771	if (!sched->state.unassigned || !--sched->state.unassigned)
772		return false;
773
774	do {
775		/* next event */
776		sched->state.event++;
777		if (sched->state.event >= sched->max_events) {
778			/* next weight */
779			sched->state.event = 0;
780			sched->state.weight++;
781			if (sched->state.weight > sched->max_weight)
782				return false;
783		}
784		c = sched->constraints[sched->state.event];
785	} while (c->weight != sched->state.weight);
786
787	sched->state.counter = 0;	/* start with first counter */
788
789	return true;
790}
791
792/*
793 * Assign a counter for each event.
794 */
795int perf_assign_events(struct event_constraint **constraints, int n,
796			int wmin, int wmax, int gpmax, int *assign)
797{
798	struct perf_sched sched;
799
800	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
801
802	do {
803		if (!perf_sched_find_counter(&sched))
804			break;	/* failed */
805		if (assign)
806			assign[sched.state.event] = sched.state.counter;
807	} while (perf_sched_next_event(&sched));
808
809	return sched.state.unassigned;
810}
811EXPORT_SYMBOL_GPL(perf_assign_events);
812
813int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
814{
815	struct event_constraint *c;
816	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
817	struct perf_event *e;
818	int i, wmin, wmax, unsched = 0;
819	struct hw_perf_event *hwc;
820
821	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
822
823	if (x86_pmu.start_scheduling)
824		x86_pmu.start_scheduling(cpuc);
825
826	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
827		cpuc->event_constraint[i] = NULL;
828		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
829		cpuc->event_constraint[i] = c;
830
831		wmin = min(wmin, c->weight);
832		wmax = max(wmax, c->weight);
833	}
834
	/*
	 * Fastpath: try to reuse the previous counter assignments.
	 */
838	for (i = 0; i < n; i++) {
839		hwc = &cpuc->event_list[i]->hw;
840		c = cpuc->event_constraint[i];
841
842		/* never assigned */
843		if (hwc->idx == -1)
844			break;
845
846		/* constraint still honored */
847		if (!test_bit(hwc->idx, c->idxmsk))
848			break;
849
850		/* not already used */
851		if (test_bit(hwc->idx, used_mask))
852			break;
853
854		__set_bit(hwc->idx, used_mask);
855		if (assign)
856			assign[i] = hwc->idx;
857	}
858
859	/* slow path */
860	if (i != n) {
861		int gpmax = x86_pmu.num_counters;
862
		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of the sibling thread
		 * by ensuring at most half the counters can be in exclusive
		 * mode. There are no designated counters for the limit; any
		 * N/2 counters can be used. This helps with events that have
		 * specific counter constraints.
		 */
873		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
874		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
875			gpmax /= 2;
876
877		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
878					     wmax, gpmax, assign);
879	}
880
881	/*
882	 * In case of success (unsched = 0), mark events as committed,
883	 * so we do not put_constraint() in case new events are added
884	 * and fail to be scheduled
885	 *
886	 * We invoke the lower level commit callback to lock the resource
887	 *
888	 * We do not need to do all of this in case we are called to
889	 * validate an event group (assign == NULL)
890	 */
891	if (!unsched && assign) {
892		for (i = 0; i < n; i++) {
893			e = cpuc->event_list[i];
894			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
895			if (x86_pmu.commit_scheduling)
896				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
897		}
898	}
899
900	if (!assign || unsched) {
901
902		for (i = 0; i < n; i++) {
903			e = cpuc->event_list[i];
			/*
			 * Do not put_constraint() on committed events,
			 * because they are good to go.
			 */
908			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
909				continue;
910
911			/*
912			 * release events that failed scheduling
913			 */
914			if (x86_pmu.put_event_constraints)
915				x86_pmu.put_event_constraints(cpuc, e);
916		}
917	}
918
919	if (x86_pmu.stop_scheduling)
920		x86_pmu.stop_scheduling(cpuc);
921
922	return unsched ? -EINVAL : 0;
923}
924
/*
 * dogrp: true if we must collect sibling events (group)
 * returns the total number of events, or a negative error code
 */
929static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
930{
931	struct perf_event *event;
932	int n, max_count;
933
934	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
935
936	/* current number of events already accepted */
937	n = cpuc->n_events;
938
939	if (is_x86_event(leader)) {
940		if (n >= max_count)
941			return -EINVAL;
942		cpuc->event_list[n] = leader;
943		n++;
944	}
945	if (!dogrp)
946		return n;
947
948	list_for_each_entry(event, &leader->sibling_list, group_entry) {
949		if (!is_x86_event(event) ||
950		    event->state <= PERF_EVENT_STATE_OFF)
951			continue;
952
953		if (n >= max_count)
954			return -EINVAL;
955
956		cpuc->event_list[n] = event;
957		n++;
958	}
959	return n;
960}
961
962static inline void x86_assign_hw_event(struct perf_event *event,
963				struct cpu_hw_events *cpuc, int i)
964{
965	struct hw_perf_event *hwc = &event->hw;
966
967	hwc->idx = cpuc->assign[i];
968	hwc->last_cpu = smp_processor_id();
969	hwc->last_tag = ++cpuc->tags[i];
970
971	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
972		hwc->config_base = 0;
973		hwc->event_base	= 0;
974	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
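		/*
		 * Fixed-function counters are addressed via RDPMC by
		 * setting bit 30 of the counter index (per the Intel SDM),
		 * hence the 1<<30 in event_base_rdpmc below.
		 */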
975		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
976		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
977		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
978	} else {
979		hwc->config_base = x86_pmu_config_addr(hwc->idx);
980		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
981		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
982	}
983}
984
985static inline int match_prev_assignment(struct hw_perf_event *hwc,
986					struct cpu_hw_events *cpuc,
987					int i)
988{
989	return hwc->idx == cpuc->assign[i] &&
990		hwc->last_cpu == smp_processor_id() &&
991		hwc->last_tag == cpuc->tags[i];
992}
993
994static void x86_pmu_start(struct perf_event *event, int flags);
995
996static void x86_pmu_enable(struct pmu *pmu)
997{
998	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
999	struct perf_event *event;
1000	struct hw_perf_event *hwc;
1001	int i, added = cpuc->n_added;
1002
1003	if (!x86_pmu_initialized())
1004		return;
1005
1006	if (cpuc->enabled)
1007		return;
1008
1009	if (cpuc->n_added) {
1010		int n_running = cpuc->n_events - cpuc->n_added;
1011		/*
1012		 * apply assignment obtained either from
1013		 * hw_perf_group_sched_in() or x86_pmu_enable()
1014		 *
1015		 * step1: save events moving to new counters
1016		 */
1017		for (i = 0; i < n_running; i++) {
1018			event = cpuc->event_list[i];
1019			hwc = &event->hw;
1020
1021			/*
1022			 * we can avoid reprogramming counter if:
1023			 * - assigned same counter as last time
1024			 * - running on same CPU as last time
1025			 * - no other event has used the counter since
1026			 */
1027			if (hwc->idx == -1 ||
1028			    match_prev_assignment(hwc, cpuc, i))
1029				continue;
1030
1031			/*
1032			 * Ensure we don't accidentally enable a stopped
1033			 * counter simply because we rescheduled.
1034			 */
1035			if (hwc->state & PERF_HES_STOPPED)
1036				hwc->state |= PERF_HES_ARCH;
1037
1038			x86_pmu_stop(event, PERF_EF_UPDATE);
1039		}
1040
1041		/*
1042		 * step2: reprogram moved events into new counters
1043		 */
1044		for (i = 0; i < cpuc->n_events; i++) {
1045			event = cpuc->event_list[i];
1046			hwc = &event->hw;
1047
1048			if (!match_prev_assignment(hwc, cpuc, i))
1049				x86_assign_hw_event(event, cpuc, i);
1050			else if (i < n_running)
1051				continue;
1052
1053			if (hwc->state & PERF_HES_ARCH)
1054				continue;
1055
1056			x86_pmu_start(event, PERF_EF_RELOAD);
1057		}
1058		cpuc->n_added = 0;
1059		perf_events_lapic_init();
1060	}
1061
1062	cpuc->enabled = 1;
1063	barrier();
1064
1065	x86_pmu.enable_all(added);
1066}
1067
1068static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1069
1070/*
1071 * Set the next IRQ period, based on the hwc->period_left value.
1072 * To be called with the event disabled in hw:
1073 */
1074int x86_perf_event_set_period(struct perf_event *event)
1075{
1076	struct hw_perf_event *hwc = &event->hw;
1077	s64 left = local64_read(&hwc->period_left);
1078	s64 period = hwc->sample_period;
1079	int ret = 0, idx = hwc->idx;
1080
1081	if (idx == INTEL_PMC_IDX_FIXED_BTS)
1082		return 0;
1083
1084	/*
1085	 * If we are way outside a reasonable range then just skip forward:
1086	 */
1087	if (unlikely(left <= -period)) {
1088		left = period;
1089		local64_set(&hwc->period_left, left);
1090		hwc->last_period = period;
1091		ret = 1;
1092	}
1093
1094	if (unlikely(left <= 0)) {
1095		left += period;
1096		local64_set(&hwc->period_left, left);
1097		hwc->last_period = period;
1098		ret = 1;
1099	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
1103	if (unlikely(left < 2))
1104		left = 2;
1105
1106	if (left > x86_pmu.max_period)
1107		left = x86_pmu.max_period;
1108
1109	if (x86_pmu.limit_period)
1110		left = x86_pmu.limit_period(event, left);
1111
1112	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1113
	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
1118	local64_set(&hwc->prev_count, (u64)-left);
1119
1120	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
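
	/*
	 * E.g. (assuming 48-bit counters): for left == 100000 the value
	 * written is (u64)-100000 & 0xffffffffffff == 0xfffffffe7960, so
	 * the counter overflows (and raises the PMI) after exactly 100000
	 * increments.
	 */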
1121
	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly.
	 */
1127	if (x86_pmu.perfctr_second_write) {
1128		wrmsrl(hwc->event_base,
1129			(u64)(-left) & x86_pmu.cntval_mask);
1130	}
1131
1132	perf_event_update_userpage(event);
1133
1134	return ret;
1135}
1136
1137void x86_pmu_enable_event(struct perf_event *event)
1138{
1139	if (__this_cpu_read(cpu_hw_events.enabled))
1140		__x86_pmu_enable_event(&event->hw,
1141				       ARCH_PERFMON_EVENTSEL_ENABLE);
1142}
1143
/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
1150static int x86_pmu_add(struct perf_event *event, int flags)
1151{
1152	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1153	struct hw_perf_event *hwc;
1154	int assign[X86_PMC_IDX_MAX];
1155	int n, n0, ret;
1156
1157	hwc = &event->hw;
1158
1159	n0 = cpuc->n_events;
1160	ret = n = collect_events(cpuc, event, false);
1161	if (ret < 0)
1162		goto out;
1163
1164	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1165	if (!(flags & PERF_EF_START))
1166		hwc->state |= PERF_HES_ARCH;
1167
	/*
	 * If a group event scheduling transaction was started,
	 * skip the schedulability test here; it will be performed
	 * at commit time (->commit_txn) as a whole.
	 */
1173	if (cpuc->group_flag & PERF_EVENT_TXN)
1174		goto done_collect;
1175
1176	ret = x86_pmu.schedule_events(cpuc, n, assign);
1177	if (ret)
1178		goto out;
	/*
	 * Copy the new assignment now that we know it is possible;
	 * it will be used by hw_perf_enable().
	 */
1183	memcpy(cpuc->assign, assign, n*sizeof(int));
1184
1185done_collect:
1186	/*
1187	 * Commit the collect_events() state. See x86_pmu_del() and
1188	 * x86_pmu_*_txn().
1189	 */
1190	cpuc->n_events = n;
1191	cpuc->n_added += n - n0;
1192	cpuc->n_txn += n - n0;
1193
1194	ret = 0;
1195out:
1196	return ret;
1197}
1198
1199static void x86_pmu_start(struct perf_event *event, int flags)
1200{
1201	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1202	int idx = event->hw.idx;
1203
1204	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1205		return;
1206
1207	if (WARN_ON_ONCE(idx == -1))
1208		return;
1209
1210	if (flags & PERF_EF_RELOAD) {
1211		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1212		x86_perf_event_set_period(event);
1213	}
1214
1215	event->hw.state = 0;
1216
1217	cpuc->events[idx] = event;
1218	__set_bit(idx, cpuc->active_mask);
1219	__set_bit(idx, cpuc->running);
1220	x86_pmu.enable(event);
1221	perf_event_update_userpage(event);
1222}
1223
1224void perf_event_print_debug(void)
1225{
1226	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1227	u64 pebs, debugctl;
1228	struct cpu_hw_events *cpuc;
1229	unsigned long flags;
1230	int cpu, idx;
1231
1232	if (!x86_pmu.num_counters)
1233		return;
1234
1235	local_irq_save(flags);
1236
1237	cpu = smp_processor_id();
1238	cpuc = &per_cpu(cpu_hw_events, cpu);
1239
1240	if (x86_pmu.version >= 2) {
1241		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1242		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1243		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1244		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1245
1246		pr_info("\n");
1247		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1248		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1249		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1250		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1251		if (x86_pmu.pebs_constraints) {
1252			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1253			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1254		}
1255		if (x86_pmu.lbr_nr) {
1256			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1257			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
1258		}
1259	}
1260	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1261
1262	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1263		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1264		rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1265
1266		prev_left = per_cpu(pmc_prev_left[idx], cpu);
1267
1268		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1269			cpu, idx, pmc_ctrl);
1270		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1271			cpu, idx, pmc_count);
1272		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1273			cpu, idx, prev_left);
1274	}
1275	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1276		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1277
1278		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1279			cpu, idx, pmc_count);
1280	}
1281	local_irq_restore(flags);
1282}
1283
1284void x86_pmu_stop(struct perf_event *event, int flags)
1285{
1286	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1287	struct hw_perf_event *hwc = &event->hw;
1288
1289	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1290		x86_pmu.disable(event);
1291		cpuc->events[hwc->idx] = NULL;
1292		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1293		hwc->state |= PERF_HES_STOPPED;
1294	}
1295
1296	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
1301		x86_perf_event_update(event);
1302		hwc->state |= PERF_HES_UPTODATE;
1303	}
1304}
1305
1306static void x86_pmu_del(struct perf_event *event, int flags)
1307{
1308	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1309	int i;
1310
1311	/*
1312	 * event is descheduled
1313	 */
1314	event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
1315
1316	/*
1317	 * If we're called during a txn, we don't need to do anything.
1318	 * The events never got scheduled and ->cancel_txn will truncate
1319	 * the event_list.
1320	 *
1321	 * XXX assumes any ->del() called during a TXN will only be on
1322	 * an event added during that same TXN.
1323	 */
1324	if (cpuc->group_flag & PERF_EVENT_TXN)
1325		return;
1326
1327	/*
1328	 * Not a TXN, therefore cleanup properly.
1329	 */
1330	x86_pmu_stop(event, PERF_EF_UPDATE);
1331
1332	for (i = 0; i < cpuc->n_events; i++) {
1333		if (event == cpuc->event_list[i])
1334			break;
1335	}
1336
1337	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1338		return;
1339
	/* If we have a newly added event, make sure to decrease n_added. */
1341	if (i >= cpuc->n_events - cpuc->n_added)
1342		--cpuc->n_added;
1343
1344	if (x86_pmu.put_event_constraints)
1345		x86_pmu.put_event_constraints(cpuc, event);
1346
1347	/* Delete the array entry. */
1348	while (++i < cpuc->n_events) {
1349		cpuc->event_list[i-1] = cpuc->event_list[i];
1350		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
1351	}
1352	--cpuc->n_events;
1353
1354	perf_event_update_userpage(event);
1355}
1356
1357int x86_pmu_handle_irq(struct pt_regs *regs)
1358{
1359	struct perf_sample_data data;
1360	struct cpu_hw_events *cpuc;
1361	struct perf_event *event;
1362	int idx, handled = 0;
1363	u64 val;
1364
1365	cpuc = this_cpu_ptr(&cpu_hw_events);
1366
1367	/*
1368	 * Some chipsets need to unmask the LVTPC in a particular spot
1369	 * inside the nmi handler.  As a result, the unmasking was pushed
1370	 * into all the nmi handlers.
1371	 *
1372	 * This generic handler doesn't seem to have any issues where the
1373	 * unmasking occurs so it was left at the top.
1374	 */
1375	apic_write(APIC_LVTPC, APIC_DM_NMI);
1376
1377	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1378		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter, some CPUs
			 * might still deliver spurious interrupts that
			 * were already in flight. Catch them:
			 */
1384			if (__test_and_clear_bit(idx, cpuc->running))
1385				handled++;
1386			continue;
1387		}
1388
1389		event = cpuc->events[idx];
1390
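		/*
		 * Counters are programmed to start at -left (see
		 * x86_perf_event_set_period()), so the top bit stays set
		 * until the counter crosses zero; if it is still set the
		 * counter has not overflowed and can be skipped here.
		 */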
1391		val = x86_perf_event_update(event);
1392		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1393			continue;
1394
1395		/*
1396		 * event overflow
1397		 */
1398		handled++;
1399		perf_sample_data_init(&data, 0, event->hw.last_period);
1400
1401		if (!x86_perf_event_set_period(event))
1402			continue;
1403
1404		if (perf_event_overflow(event, &data, regs))
1405			x86_pmu_stop(event, 0);
1406	}
1407
1408	if (handled)
1409		inc_irq_stat(apic_perf_irqs);
1410
1411	return handled;
1412}
1413
1414void perf_events_lapic_init(void)
1415{
1416	if (!x86_pmu.apic || !x86_pmu_initialized())
1417		return;
1418
1419	/*
1420	 * Always use NMI for PMU
1421	 */
1422	apic_write(APIC_LVTPC, APIC_DM_NMI);
1423}
1424
1425static int
1426perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1427{
1428	u64 start_clock;
1429	u64 finish_clock;
1430	int ret;
1431
1432	if (!atomic_read(&active_events))
1433		return NMI_DONE;
1434
1435	start_clock = sched_clock();
1436	ret = x86_pmu.handle_irq(regs);
1437	finish_clock = sched_clock();
1438
1439	perf_sample_event_took(finish_clock - start_clock);
1440
1441	return ret;
1442}
1443NOKPROBE_SYMBOL(perf_event_nmi_handler);
1444
1445struct event_constraint emptyconstraint;
1446struct event_constraint unconstrained;
1447
1448static int
1449x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1450{
1451	unsigned int cpu = (long)hcpu;
1452	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1453	int i, ret = NOTIFY_OK;
1454
1455	switch (action & ~CPU_TASKS_FROZEN) {
1456	case CPU_UP_PREPARE:
1457		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1458			cpuc->kfree_on_online[i] = NULL;
1459		if (x86_pmu.cpu_prepare)
1460			ret = x86_pmu.cpu_prepare(cpu);
1461		break;
1462
1463	case CPU_STARTING:
1464		if (x86_pmu.cpu_starting)
1465			x86_pmu.cpu_starting(cpu);
1466		break;
1467
1468	case CPU_ONLINE:
1469		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1470			kfree(cpuc->kfree_on_online[i]);
1471			cpuc->kfree_on_online[i] = NULL;
1472		}
1473		break;
1474
1475	case CPU_DYING:
1476		if (x86_pmu.cpu_dying)
1477			x86_pmu.cpu_dying(cpu);
1478		break;
1479
1480	case CPU_UP_CANCELED:
1481	case CPU_DEAD:
1482		if (x86_pmu.cpu_dead)
1483			x86_pmu.cpu_dead(cpu);
1484		break;
1485
1486	default:
1487		break;
1488	}
1489
1490	return ret;
1491}
1492
1493static void __init pmu_check_apic(void)
1494{
1495	if (cpu_has_apic)
1496		return;
1497
1498	x86_pmu.apic = 0;
1499	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1500	pr_info("no hardware sampling interrupt available.\n");
1501
1502	/*
1503	 * If we have a PMU initialized but no APIC
1504	 * interrupts, we cannot sample hardware
1505	 * events (user-space has to fall back and
1506	 * sample via a hrtimer based software event):
1507	 */
1508	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1509
1510}
1511
1512static struct attribute_group x86_pmu_format_group = {
1513	.name = "format",
1514	.attrs = NULL,
1515};
1516
/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * from the events_attr attributes.
 */
1521static void __init filter_events(struct attribute **attrs)
1522{
1523	struct device_attribute *d;
1524	struct perf_pmu_events_attr *pmu_attr;
1525	int i, j;
1526
1527	for (i = 0; attrs[i]; i++) {
1528		d = (struct device_attribute *)attrs[i];
1529		pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
1530		/* str trumps id */
1531		if (pmu_attr->event_str)
1532			continue;
1533		if (x86_pmu.event_map(i))
1534			continue;
1535
1536		for (j = i; attrs[j]; j++)
1537			attrs[j] = attrs[j + 1];
1538
1539		/* Check the shifted attr. */
1540		i--;
1541	}
1542}
1543
1544/* Merge two pointer arrays */
1545static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
1546{
1547	struct attribute **new;
1548	int j, i;
1549
1550	for (j = 0; a[j]; j++)
1551		;
1552	for (i = 0; b[i]; i++)
1553		j++;
1554	j++;
1555
1556	new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
1557	if (!new)
1558		return NULL;
1559
1560	j = 0;
1561	for (i = 0; a[i]; i++)
1562		new[j++] = a[i];
1563	for (i = 0; b[i]; i++)
1564		new[j++] = b[i];
1565	new[j] = NULL;
1566
1567	return new;
1568}
1569
1570ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
1571			  char *page)
1572{
1573	struct perf_pmu_events_attr *pmu_attr = \
1574		container_of(attr, struct perf_pmu_events_attr, attr);
1575	u64 config = x86_pmu.event_map(pmu_attr->id);
1576
1577	/* string trumps id */
1578	if (pmu_attr->event_str)
1579		return sprintf(page, "%s", pmu_attr->event_str);
1580
1581	return x86_pmu.events_sysfs_show(page, config);
1582}
1583
1584EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
1585EVENT_ATTR(instructions,		INSTRUCTIONS		);
1586EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
1587EVENT_ATTR(cache-misses, 		CACHE_MISSES		);
1588EVENT_ATTR(branch-instructions,		BRANCH_INSTRUCTIONS	);
1589EVENT_ATTR(branch-misses,		BRANCH_MISSES		);
1590EVENT_ATTR(bus-cycles,			BUS_CYCLES		);
1591EVENT_ATTR(stalled-cycles-frontend,	STALLED_CYCLES_FRONTEND	);
1592EVENT_ATTR(stalled-cycles-backend,	STALLED_CYCLES_BACKEND	);
1593EVENT_ATTR(ref-cycles,			REF_CPU_CYCLES		);
1594
1595static struct attribute *empty_attrs;
1596
1597static struct attribute *events_attr[] = {
1598	EVENT_PTR(CPU_CYCLES),
1599	EVENT_PTR(INSTRUCTIONS),
1600	EVENT_PTR(CACHE_REFERENCES),
1601	EVENT_PTR(CACHE_MISSES),
1602	EVENT_PTR(BRANCH_INSTRUCTIONS),
1603	EVENT_PTR(BRANCH_MISSES),
1604	EVENT_PTR(BUS_CYCLES),
1605	EVENT_PTR(STALLED_CYCLES_FRONTEND),
1606	EVENT_PTR(STALLED_CYCLES_BACKEND),
1607	EVENT_PTR(REF_CPU_CYCLES),
1608	NULL,
1609};
1610
1611static struct attribute_group x86_pmu_events_group = {
1612	.name = "events",
1613	.attrs = events_attr,
1614};
1615
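/*
 * Format a raw event encoding as the comma-separated key=value syntax
 * used by the perf tool, e.g. (illustrative) "event=0xc4,umask=0x01".
 */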
1616ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1617{
1618	u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1619	u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1620	bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1621	bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1622	bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
1623	bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
1624	ssize_t ret;
1625
	/*
	 * We have a whole page to spend and just a little data
	 * to write, so we can safely use sprintf.
	 */
1630	ret = sprintf(page, "event=0x%02llx", event);
1631
1632	if (umask)
1633		ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1634
1635	if (edge)
1636		ret += sprintf(page + ret, ",edge");
1637
1638	if (pc)
1639		ret += sprintf(page + ret, ",pc");
1640
1641	if (any)
1642		ret += sprintf(page + ret, ",any");
1643
1644	if (inv)
1645		ret += sprintf(page + ret, ",inv");
1646
1647	if (cmask)
1648		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1649
1650	ret += sprintf(page + ret, "\n");
1651
1652	return ret;
1653}
1654
1655static int __init init_hw_perf_events(void)
1656{
1657	struct x86_pmu_quirk *quirk;
1658	int err;
1659
1660	pr_info("Performance Events: ");
1661
1662	switch (boot_cpu_data.x86_vendor) {
1663	case X86_VENDOR_INTEL:
1664		err = intel_pmu_init();
1665		break;
1666	case X86_VENDOR_AMD:
1667		err = amd_pmu_init();
1668		break;
1669	default:
1670		err = -ENOTSUPP;
1671	}
1672	if (err != 0) {
1673		pr_cont("no PMU driver, software events only.\n");
1674		return 0;
1675	}
1676
1677	pmu_check_apic();
1678
1679	/* sanity check that the hardware exists or is emulated */
1680	if (!check_hw_exists())
1681		return 0;
1682
1683	pr_cont("%s PMU driver.\n", x86_pmu.name);
1684
1685	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1686
1687	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1688		quirk->func();
1689
1690	if (!x86_pmu.intel_ctrl)
1691		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1692
1693	perf_events_lapic_init();
1694	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
1695
1696	unconstrained = (struct event_constraint)
1697		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1698				   0, x86_pmu.num_counters, 0, 0);
1699
1700	x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1701
1702	if (x86_pmu.event_attrs)
1703		x86_pmu_events_group.attrs = x86_pmu.event_attrs;
1704
1705	if (!x86_pmu.events_sysfs_show)
1706		x86_pmu_events_group.attrs = &empty_attrs;
1707	else
1708		filter_events(x86_pmu_events_group.attrs);
1709
1710	if (x86_pmu.cpu_events) {
1711		struct attribute **tmp;
1712
1713		tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
1714		if (!WARN_ON(!tmp))
1715			x86_pmu_events_group.attrs = tmp;
1716	}
1717
1718	pr_info("... version:                %d\n",     x86_pmu.version);
1719	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
1720	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
1721	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
1722	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1723	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1724	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1725
1726	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1727	perf_cpu_notifier(x86_pmu_notifier);
1728
1729	return 0;
1730}
1731early_initcall(init_hw_perf_events);
1732
1733static inline void x86_pmu_read(struct perf_event *event)
1734{
1735	x86_perf_event_update(event);
1736}
1737
/*
 * Start a group event scheduling transaction.
 * Set the flag to make pmu::enable() not perform the
 * schedulability test; it will be performed at commit time.
 */
1743static void x86_pmu_start_txn(struct pmu *pmu)
1744{
1745	perf_pmu_disable(pmu);
1746	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1747	__this_cpu_write(cpu_hw_events.n_txn, 0);
1748}
1749
/*
 * Stop (cancel) a group event scheduling transaction.
 * Clear the flag and pmu::enable() will perform the
 * schedulability test again.
 */
1755static void x86_pmu_cancel_txn(struct pmu *pmu)
1756{
1757	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1758	/*
1759	 * Truncate collected array by the number of events added in this
1760	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
1761	 */
1762	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1763	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1764	perf_pmu_enable(pmu);
1765}
1766
/*
 * Commit a group event scheduling transaction.
 * Perform the group schedulability test as a whole.
 * Return 0 on success.
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
 */
1774static int x86_pmu_commit_txn(struct pmu *pmu)
1775{
1776	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1777	int assign[X86_PMC_IDX_MAX];
1778	int n, ret;
1779
1780	n = cpuc->n_events;
1781
1782	if (!x86_pmu_initialized())
1783		return -EAGAIN;
1784
1785	ret = x86_pmu.schedule_events(cpuc, n, assign);
1786	if (ret)
1787		return ret;
1788
	/*
	 * Copy the new assignment now that we know it is possible;
	 * it will be used by hw_perf_enable().
	 */
1793	memcpy(cpuc->assign, assign, n*sizeof(int));
1794
1795	cpuc->group_flag &= ~PERF_EVENT_TXN;
1796	perf_pmu_enable(pmu);
1797	return 0;
}

/*
 * A fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using an extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
1807static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1808{
1809	kfree(cpuc->shared_regs);
1810	kfree(cpuc);
1811}
1812
1813static struct cpu_hw_events *allocate_fake_cpuc(void)
1814{
1815	struct cpu_hw_events *cpuc;
1816	int cpu = raw_smp_processor_id();
1817
1818	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1819	if (!cpuc)
1820		return ERR_PTR(-ENOMEM);
1821
	/* only needed if we have extra_regs */
1823	if (x86_pmu.extra_regs) {
1824		cpuc->shared_regs = allocate_shared_regs(cpu);
1825		if (!cpuc->shared_regs)
1826			goto error;
1827	}
1828	cpuc->is_fake = 1;
1829	return cpuc;
1830error:
1831	free_fake_cpuc(cpuc);
1832	return ERR_PTR(-ENOMEM);
1833}
1834
1835/*
1836 * validate that we can schedule this event
1837 */
1838static int validate_event(struct perf_event *event)
1839{
1840	struct cpu_hw_events *fake_cpuc;
1841	struct event_constraint *c;
1842	int ret = 0;
1843
1844	fake_cpuc = allocate_fake_cpuc();
1845	if (IS_ERR(fake_cpuc))
1846		return PTR_ERR(fake_cpuc);
1847
1848	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
1849
1850	if (!c || !c->weight)
1851		ret = -EINVAL;
1852
1853	if (x86_pmu.put_event_constraints)
1854		x86_pmu.put_event_constraints(fake_cpuc, event);
1855
1856	free_fake_cpuc(fake_cpuc);
1857
1858	return ret;
1859}
1860
/*
 * validate a single event group
 *
 * validation includes:
 *	- checking that the events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
1872static int validate_group(struct perf_event *event)
1873{
1874	struct perf_event *leader = event->group_leader;
1875	struct cpu_hw_events *fake_cpuc;
1876	int ret = -EINVAL, n;
1877
1878	fake_cpuc = allocate_fake_cpuc();
1879	if (IS_ERR(fake_cpuc))
1880		return PTR_ERR(fake_cpuc);
	/*
	 * The event is not yet connected with its
	 * siblings, therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling.
	 */
1887	n = collect_events(fake_cpuc, leader, true);
1888	if (n < 0)
1889		goto out;
1890
1891	fake_cpuc->n_events = n;
1892	n = collect_events(fake_cpuc, event, false);
1893	if (n < 0)
1894		goto out;
1895
1896	fake_cpuc->n_events = n;
1897
1898	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1899
1900out:
1901	free_fake_cpuc(fake_cpuc);
1902	return ret;
1903}
1904
1905static int x86_pmu_event_init(struct perf_event *event)
1906{
1907	struct pmu *tmp;
1908	int err;
1909
1910	switch (event->attr.type) {
1911	case PERF_TYPE_RAW:
1912	case PERF_TYPE_HARDWARE:
1913	case PERF_TYPE_HW_CACHE:
1914		break;
1915
1916	default:
1917		return -ENOENT;
1918	}
1919
1920	err = __x86_pmu_event_init(event);
1921	if (!err) {
1922		/*
1923		 * we temporarily connect event to its pmu
1924		 * such that validate_group() can classify
1925		 * it as an x86 event using is_x86_event()
1926		 */
1927		tmp = event->pmu;
1928		event->pmu = &pmu;
1929
1930		if (event->group_leader != event)
1931			err = validate_group(event);
1932		else
1933			err = validate_event(event);
1934
1935		event->pmu = tmp;
1936	}
1937	if (err) {
1938		if (event->destroy)
1939			event->destroy(event);
1940	}
1941
1942	if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
1943		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
1944
1945	return err;
1946}
1947
1948static void refresh_pce(void *ignored)
1949{
1950	if (current->mm)
1951		load_mm_cr4(current->mm);
1952}
1953
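/*
 * The mapped/unmapped hooks keep a per-mm count of user mappings of
 * RDPMC-capable events and toggle CR4.PCE accordingly (via refresh_pce()
 * above) on every CPU currently running that mm, so that the RDPMC
 * instruction only works from user space while such a mapping exists.
 */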
1954static void x86_pmu_event_mapped(struct perf_event *event)
1955{
1956	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
1957		return;
1958
1959	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
1960		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
1961}
1962
1963static void x86_pmu_event_unmapped(struct perf_event *event)
1964{
1965	if (!current->mm)
1966		return;
1967
1968	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
1969		return;
1970
1971	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
1972		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
1973}
1974
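/*
 * Index reported to user space for self-monitoring with RDPMC: the
 * mmapped perf_event_mmap_page carries this value, and user space is
 * expected to execute rdpmc(index - 1) when it is non-zero (fixed
 * counters come back with bit 30 already set).
 */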
1975static int x86_pmu_event_idx(struct perf_event *event)
1976{
1977	int idx = event->hw.idx;
1978
1979	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
1980		return 0;
1981
1982	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
1983		idx -= INTEL_PMC_IDX_FIXED;
1984		idx |= 1 << 30;
1985	}
1986
1987	return idx + 1;
1988}
1989
1990static ssize_t get_attr_rdpmc(struct device *cdev,
1991			      struct device_attribute *attr,
1992			      char *buf)
1993{
1994	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
1995}
1996
1997static ssize_t set_attr_rdpmc(struct device *cdev,
1998			      struct device_attribute *attr,
1999			      const char *buf, size_t count)
2000{
2001	unsigned long val;
2002	ssize_t ret;
2003
2004	ret = kstrtoul(buf, 0, &val);
2005	if (ret)
2006		return ret;
2007
2008	if (val > 2)
2009		return -EINVAL;
2010
2011	if (x86_pmu.attr_rdpmc_broken)
2012		return -ENOTSUPP;
2013
2014	if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
2015		/*
2016		 * Changing into or out of always available, aka
2017		 * perf-event-bypassing mode.  This path is extremely slow,
2018		 * but only root can trigger it, so it's okay.
2019		 */
2020		if (val == 2)
2021			static_key_slow_inc(&rdpmc_always_available);
2022		else
2023			static_key_slow_dec(&rdpmc_always_available);
2024		on_each_cpu(refresh_pce, NULL, 1);
2025	}
2026
2027	x86_pmu.attr_rdpmc = val;
2028
2029	return count;
2030}
2031
2032static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
2033
2034static struct attribute *x86_pmu_attrs[] = {
2035	&dev_attr_rdpmc.attr,
2036	NULL,
2037};
2038
2039static struct attribute_group x86_pmu_attr_group = {
2040	.attrs = x86_pmu_attrs,
2041};
2042
2043static const struct attribute_group *x86_pmu_attr_groups[] = {
2044	&x86_pmu_attr_group,
2045	&x86_pmu_format_group,
2046	&x86_pmu_events_group,
2047	NULL,
2048};
2049
2050static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
2051{
2052	if (x86_pmu.sched_task)
2053		x86_pmu.sched_task(ctx, sched_in);
2054}
2055
2056void perf_check_microcode(void)
2057{
2058	if (x86_pmu.check_microcode)
2059		x86_pmu.check_microcode();
2060}
2061EXPORT_SYMBOL_GPL(perf_check_microcode);
2062
2063static struct pmu pmu = {
2064	.pmu_enable		= x86_pmu_enable,
2065	.pmu_disable		= x86_pmu_disable,
2066
2067	.attr_groups		= x86_pmu_attr_groups,
2068
2069	.event_init		= x86_pmu_event_init,
2070
2071	.event_mapped		= x86_pmu_event_mapped,
2072	.event_unmapped		= x86_pmu_event_unmapped,
2073
2074	.add			= x86_pmu_add,
2075	.del			= x86_pmu_del,
2076	.start			= x86_pmu_start,
2077	.stop			= x86_pmu_stop,
2078	.read			= x86_pmu_read,
2079
2080	.start_txn		= x86_pmu_start_txn,
2081	.cancel_txn		= x86_pmu_cancel_txn,
2082	.commit_txn		= x86_pmu_commit_txn,
2083
2084	.event_idx		= x86_pmu_event_idx,
2085	.sched_task		= x86_pmu_sched_task,
2086	.task_ctx_size          = sizeof(struct x86_perf_task_context),
2087};
2088
2089void arch_perf_update_userpage(struct perf_event *event,
2090			       struct perf_event_mmap_page *userpg, u64 now)
2091{
2092	struct cyc2ns_data *data;
2093
2094	userpg->cap_user_time = 0;
2095	userpg->cap_user_time_zero = 0;
2096	userpg->cap_user_rdpmc =
2097		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
2098	userpg->pmc_width = x86_pmu.cntval_bits;
2099
2100	if (!sched_clock_stable())
2101		return;
2102
2103	data = cyc2ns_read_begin();
2104
2105	/*
2106	 * Internal timekeeping for enabled/running/stopped times
2107	 * is always in the local_clock domain.
2108	 */
2109	userpg->cap_user_time = 1;
2110	userpg->time_mult = data->cyc2ns_mul;
2111	userpg->time_shift = data->cyc2ns_shift;
2112	userpg->time_offset = data->cyc2ns_offset - now;
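
	/*
	 * With these fields user space can convert a raw TSC value into
	 * sched_clock() time, roughly:
	 *   time = time_offset + (tsc * time_mult) >> time_shift
	 * (see the cap_user_time documentation in
	 * include/uapi/linux/perf_event.h for the exact recipe).
	 */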
2113
2114	/*
2115	 * cap_user_time_zero doesn't make sense when we're using a different
2116	 * time base for the records.
2117	 */
2118	if (event->clock == &local_clock) {
2119		userpg->cap_user_time_zero = 1;
2120		userpg->time_zero = data->cyc2ns_offset;
2121	}
2122
2123	cyc2ns_read_end(data);
2124}
2125
2126/*
2127 * callchain support
2128 */
2129
2130static int backtrace_stack(void *data, char *name)
2131{
2132	return 0;
2133}
2134
2135static void backtrace_address(void *data, unsigned long addr, int reliable)
2136{
2137	struct perf_callchain_entry *entry = data;
2138
2139	perf_callchain_store(entry, addr);
2140}
2141
2142static const struct stacktrace_ops backtrace_ops = {
2143	.stack			= backtrace_stack,
2144	.address		= backtrace_address,
2145	.walk_stack		= print_context_stack_bp,
2146};
2147
2148void
2149perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
2150{
2151	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2152		/* TODO: We don't support guest os callchain now */
2153		return;
2154	}
2155
2156	perf_callchain_store(entry, regs->ip);
2157
2158	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
2159}
2160
2161static inline int
2162valid_user_frame(const void __user *fp, unsigned long size)
2163{
2164	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
2165}
2166
2167static unsigned long get_segment_base(unsigned int segment)
2168{
2169	struct desc_struct *desc;
2170	int idx = segment >> 3;
2171
2172	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2173		struct ldt_struct *ldt;
2174
2175		if (idx > LDT_ENTRIES)
2176			return 0;
2177
2178		/* IRQs are off, so this synchronizes with smp_store_release */
2179		ldt = lockless_dereference(current->active_mm->context.ldt);
2180		if (!ldt || idx > ldt->size)
2181			return 0;
2182
2183		desc = &ldt->entries[idx];
2184	} else {
2185		if (idx > GDT_ENTRIES)
2186			return 0;
2187
2188		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
2189	}
2190
2191	return get_desc_base(desc);
2192}
2193
2194#ifdef CONFIG_COMPAT
2195
2196#include <asm/compat.h>
2197
2198static inline int
2199perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2200{
2201	/* 32-bit process in 64-bit kernel. */
2202	unsigned long ss_base, cs_base;
2203	struct stack_frame_ia32 frame;
2204	const void __user *fp;
2205
2206	if (!test_thread_flag(TIF_IA32))
2207		return 0;
2208
2209	cs_base = get_segment_base(regs->cs);
2210	ss_base = get_segment_base(regs->ss);
2211
2212	fp = compat_ptr(ss_base + regs->bp);
2213	while (entry->nr < PERF_MAX_STACK_DEPTH) {
2214		unsigned long bytes;
2215		frame.next_frame     = 0;
2216		frame.return_address = 0;
2217
2218		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
2219		if (bytes != 0)
2220			break;
2221
2222		if (!valid_user_frame(fp, sizeof(frame)))
2223			break;
2224
2225		perf_callchain_store(entry, cs_base + frame.return_address);
2226		fp = compat_ptr(ss_base + frame.next_frame);
2227	}
2228	return 1;
2229}
2230#else
2231static inline int
2232perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2233{
	return 0;
2235}
2236#endif
2237
2238void
2239perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
2240{
2241	struct stack_frame frame;
2242	const void __user *fp;
2243
2244	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2245		/* TODO: We don't support guest os callchain now */
2246		return;
2247	}
2248
	/*
	 * We don't know what to do with VM86 stacks; ignore them for now.
	 */
2252	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
2253		return;
2254
2255	fp = (void __user *)regs->bp;
2256
2257	perf_callchain_store(entry, regs->ip);
2258
2259	if (!current->mm)
2260		return;
2261
2262	if (perf_callchain_user32(regs, entry))
2263		return;
2264
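	/*
	 * Walk the classic frame-pointer chain: each stack_frame holds the
	 * saved frame pointer (next_frame) and the return address. This
	 * only works for user code built with frame pointers; each frame
	 * is fetched with copy_from_user_nmi() since we may be in NMI
	 * context.
	 */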
2265	while (entry->nr < PERF_MAX_STACK_DEPTH) {
2266		unsigned long bytes;
2267		frame.next_frame	     = NULL;
2268		frame.return_address = 0;
2269
2270		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
2271		if (bytes != 0)
2272			break;
2273
2274		if (!valid_user_frame(fp, sizeof(frame)))
2275			break;
2276
2277		perf_callchain_store(entry, frame.return_address);
2278		fp = frame.next_frame;
2279	}
2280}
2281
2282/*
2283 * Deal with code segment offsets for the various execution modes:
2284 *
2285 *   VM86 - the good olde 16 bit days, where the linear address is
2286 *          20 bits and we use regs->ip + 0x10 * regs->cs.
2287 *
2288 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
2289 *          to figure out what the 32bit base address is.
2290 *
2291 *    X32 - has TIF_X32 set, but is running in x86_64
2292 *
2293 * X86_64 - CS,DS,SS,ES are all zero based.
2294 */
2295static unsigned long code_segment_base(struct pt_regs *regs)
2296{
2297	/*
2298	 * For IA32 we look at the GDT/LDT segment base to convert the
2299	 * effective IP to a linear address.
2300	 */
2301
2302#ifdef CONFIG_X86_32
2303	/*
2304	 * If we are in VM86 mode, add the segment offset to convert to a
2305	 * linear address.
2306	 */
2307	if (regs->flags & X86_VM_MASK)
2308		return 0x10 * regs->cs;
2309
2310	if (user_mode(regs) && regs->cs != __USER_CS)
2311		return get_segment_base(regs->cs);
2312#else
2313	if (user_mode(regs) && !user_64bit_mode(regs) &&
2314	    regs->cs != __USER32_CS)
2315		return get_segment_base(regs->cs);
2316#endif
2317	return 0;
2318}
2319
2320unsigned long perf_instruction_pointer(struct pt_regs *regs)
2321{
2322	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
2323		return perf_guest_cbs->get_guest_ip();
2324
2325	return regs->ip + code_segment_base(regs);
2326}
2327
2328unsigned long perf_misc_flags(struct pt_regs *regs)
2329{
2330	int misc = 0;
2331
2332	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2333		if (perf_guest_cbs->is_user_mode())
2334			misc |= PERF_RECORD_MISC_GUEST_USER;
2335		else
2336			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
2337	} else {
2338		if (user_mode(regs))
2339			misc |= PERF_RECORD_MISC_USER;
2340		else
2341			misc |= PERF_RECORD_MISC_KERNEL;
2342	}
2343
2344	if (regs->flags & PERF_EFLAGS_EXACT)
2345		misc |= PERF_RECORD_MISC_EXACT_IP;
2346
2347	return misc;
2348}
2349
2350void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
2351{
2352	cap->version		= x86_pmu.version;
2353	cap->num_counters_gp	= x86_pmu.num_counters;
2354	cap->num_counters_fixed	= x86_pmu.num_counters_fixed;
2355	cap->bit_width_gp	= x86_pmu.cntval_bits;
2356	cap->bit_width_fixed	= x86_pmu.cntval_bits;
2357	cap->events_mask	= (unsigned int)x86_pmu.events_maskl;
2358	cap->events_mask_len	= x86_pmu.events_mask_len;
2359}
2360EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
2361