/*
 * Performance events x86 architecture code
 *
 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2009 Jaswinder Singh Rajput
 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 * Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 * For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
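
/*
 * Example of the shift trick above: with 48-bit counters, shift is 16,
 * so the subtraction is done with both raw reads shifted into the top
 * 48 bits and the arithmetic right shift sign-extends the result back
 * down.  That way the delta is correct even on hardware that does not
 * sign-extend the counter value above its physical width.
 */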

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new = ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail. The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
		printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
	}

	return true;

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
	printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
		boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
		reg, val_new);

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >> 0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >> 8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create an event of a certain type (that no conflicting events
 * are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int ret = -EBUSY, i;

	if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
		return 0;

	mutex_lock(&pmc_reserve_mutex);
	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
			goto out;
	}

	atomic_inc(&x86_pmu.lbr_exclusive[what]);
	ret = 0;

out:
	mutex_unlock(&pmc_reserve_mutex);
	return ret;
}

void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&x86_pmu.lbr_exclusive[what]);
}

int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int weight;
	int event;		/* event index */
	int counter;		/* counter index */
	int unassigned;		/* number of events to be assigned left */
	int nr_gp;		/* number of GP counters used */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define SCHED_STATES_MAX 2

struct perf_sched {
	int max_weight;
	int max_events;
	int max_gp;
	int saved_states;
	struct event_constraint **constraints;
	struct sched_state state;
	struct sched_state saved[SCHED_STATES_MAX];
};

/*
 * Initialize the iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events = num;
	sched->max_weight = wmax;
	sched->max_gp = gpmax;
	sched->constraints = constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event = idx;	/* start with min weight */
	sched->state.weight = wmin;
	sched->state.unassigned = num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used)) {
			if (sched->state.nr_gp++ >= sched->max_gp)
				return false;

			goto done;
		}
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}
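
/*
 * Pick a counter for the current event, backtracking if necessary: when
 * the straight search above fails, roll back to the state saved for the
 * last overlapping constraint and retry from the next counter.
 */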
static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		cpuc->event_constraint[i] = NULL;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		cpuc->event_constraint[i] = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of the sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There are no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	}

	if (!assign || unsched) {

		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}

/*
 * dogrp: true if must collect sibling events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base = 0;
	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
	       hwc->last_cpu == smp_processor_id() &&
	       hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);
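
/*
 * Re-enable the PMU after x86_pmu_disable().  If events were added in
 * between, reschedule them first: stop events that moved to a different
 * counter, then reprogram and (re)start everything on its newly
 * assigned counter before flipping the global enable bit.
 */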
static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		/*
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	if (x86_pmu.limit_period)
		left = x86_pmu.limit_period(event, left);

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	/*
	 * Commit the collect_events() state. See x86_pmu_del() and
	 * x86_pmu_*_txn().
	 */
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs, debugctl;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status: %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
		if (x86_pmu.pebs_constraints) {
			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
			pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
		}
		if (x86_pmu.lbr_nr) {
			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
			pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
		}
	}
	pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int i;

	/*
	 * event is descheduled
	 */
	event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 *
	 * XXX assumes any ->del() called during a TXN will only be on
	 * an event added during that same TXN.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	/*
	 * Not a TXN, therefore cleanup properly.
	 */
	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i])
			break;
	}

	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
		return;

	/* If we have a newly added event; make sure to decrease n_added. */
	if (i >= cpuc->n_events - cpuc->n_added)
		--cpuc->n_added;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(cpuc, event);

	/* Delete the array entry. */
	while (++i < cpuc->n_events) {
		cpuc->event_list[i-1] = cpuc->event_list[i];
		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
	}
	--cpuc->n_events;

	perf_event_update_userpage(event);
}
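
/*
 * Generic counter overflow handler, called from the PMI (NMI) path:
 * fold the counter values back into the events, re-arm the sampling
 * period and hand overflows to the generic perf layer.  Returns the
 * number of overflows handled so the NMI can be claimed.
 */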
int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	cpuc = this_cpu_ptr(&cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler. As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter, some CPUs
			 * might still deliver spurious interrupts that
			 * were already in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		perf_sample_data_init(&data, 0, event->hw.last_period);

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 start_clock;
	u64 finish_clock;
	int ret;

	if (!atomic_read(&active_events))
		return NMI_DONE;

	start_clock = sched_clock();
	ret = x86_pmu.handle_irq(regs);
	finish_clock = sched_clock();

	perf_sample_event_took(finish_clock - start_clock);

	return ret;
}
NOKPROBE_SYMBOL(perf_event_nmi_handler);

struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

static int
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i, ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
			cpuc->kfree_on_online[i] = NULL;
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_ONLINE:
		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
			kfree(cpuc->kfree_on_online[i]);
			cpuc->kfree_on_online[i] = NULL;
		}
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");

	/*
	 * If we have a PMU initialized but no APIC
	 * interrupts, we cannot sample hardware
	 * events (user-space has to fall back and
	 * sample via a hrtimer based software event):
	 */
	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;

}

static struct attribute_group x86_pmu_format_group = {
	.name = "format",
	.attrs = NULL,
};

/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * out of events_attr attributes.
 */
static void __init filter_events(struct attribute **attrs)
{
	struct device_attribute *d;
	struct perf_pmu_events_attr *pmu_attr;
	int i, j;

	for (i = 0; attrs[i]; i++) {
		d = (struct device_attribute *)attrs[i];
		pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
		/* str trumps id */
		if (pmu_attr->event_str)
			continue;
		if (x86_pmu.event_map(i))
			continue;

		for (j = i; attrs[j]; j++)
			attrs[j] = attrs[j + 1];

		/* Check the shifted attr. */
		i--;
	}
}

/* Merge two pointer arrays */
static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
{
	struct attribute **new;
	int j, i;

	for (j = 0; a[j]; j++)
		;
	for (i = 0; b[i]; i++)
		j++;
	j++;

	new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
	if (!new)
		return NULL;

	j = 0;
	for (i = 0; a[i]; i++)
		new[j++] = a[i];
	for (i = 0; b[i]; i++)
		new[j++] = b[i];
	new[j] = NULL;

	return new;
}

ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
			  char *page)
{
	struct perf_pmu_events_attr *pmu_attr = \
		container_of(attr, struct perf_pmu_events_attr, attr);
	u64 config = x86_pmu.event_map(pmu_attr->id);

	/* string trumps id */
	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return x86_pmu.events_sysfs_show(page, config);
}

EVENT_ATTR(cpu-cycles, CPU_CYCLES);
EVENT_ATTR(instructions, INSTRUCTIONS);
EVENT_ATTR(cache-references, CACHE_REFERENCES);
EVENT_ATTR(cache-misses, CACHE_MISSES);
EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS);
EVENT_ATTR(branch-misses, BRANCH_MISSES);
EVENT_ATTR(bus-cycles, BUS_CYCLES);
EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND);
EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND);
EVENT_ATTR(ref-cycles, REF_CPU_CYCLES);

static struct attribute *empty_attrs;

static struct attribute *events_attr[] = {
	EVENT_PTR(CPU_CYCLES),
	EVENT_PTR(INSTRUCTIONS),
	EVENT_PTR(CACHE_REFERENCES),
	EVENT_PTR(CACHE_MISSES),
	EVENT_PTR(BRANCH_INSTRUCTIONS),
	EVENT_PTR(BRANCH_MISSES),
	EVENT_PTR(BUS_CYCLES),
	EVENT_PTR(STALLED_CYCLES_FRONTEND),
	EVENT_PTR(STALLED_CYCLES_BACKEND),
	EVENT_PTR(REF_CPU_CYCLES),
	NULL,
};

static struct attribute_group x86_pmu_events_group = {
	.name = "events",
	.attrs = events_attr,
};

ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
{
	u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
	u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
	bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
	bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
	bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
	bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
	ssize_t ret;

	/*
	 * We have a whole page to spend and just a little data
	 * to write, so we can safely use sprintf.
	 */
	ret = sprintf(page, "event=0x%02llx", event);

	if (umask)
		ret += sprintf(page + ret, ",umask=0x%02llx", umask);

	if (edge)
		ret += sprintf(page + ret, ",edge");

	if (pc)
		ret += sprintf(page + ret, ",pc");

	if (any)
		ret += sprintf(page + ret, ",any");

	if (inv)
		ret += sprintf(page + ret, ",inv");

	if (cmask)
		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);

	ret += sprintf(page + ret, "\n");

	return ret;
}

static int __init init_hw_perf_events(void)
{
	struct x86_pmu_quirk *quirk;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		err = -ENOTSUPP;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */

	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
		quirk->func();

	if (!x86_pmu.intel_ctrl)
		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	perf_events_lapic_init();
	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters, 0, 0);

	x86_pmu_format_group.attrs = x86_pmu.format_attrs;

	if (x86_pmu.event_attrs)
		x86_pmu_events_group.attrs = x86_pmu.event_attrs;

	if (!x86_pmu.events_sysfs_show)
		x86_pmu_events_group.attrs = &empty_attrs;
	else
		filter_events(x86_pmu_events_group.attrs);

	if (x86_pmu.cpu_events) {
		struct attribute **tmp;

		tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
		if (!WARN_ON(!tmp))
			x86_pmu_events_group.attrs = tmp;
	}

	pr_info("... version: %d\n", x86_pmu.version);
	pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
	pr_info("... generic registers: %d\n", x86_pmu.num_counters);
	pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period: %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
	pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(x86_pmu_notifier);

	return 0;
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
	/*
	 * Truncate collected array by the number of events added in this
	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	kfree(cpuc->shared_regs);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);

	/* only needed, if we have extra_regs */
	if (x86_pmu.extra_regs) {
		cpuc->shared_regs = allocate_shared_regs(cpu);
		if (!cpuc->shared_regs)
			goto error;
	}
	cpuc->is_fake = 1;
	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;

	return err;
}

static void refresh_pce(void *ignored)
{
	if (current->mm)
		load_mm_cr4(current->mm);
}

static void x86_pmu_event_mapped(struct perf_event *event)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
}

static void x86_pmu_event_unmapped(struct perf_event *event)
{
	if (!current->mm)
		return;

	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
}

static int x86_pmu_event_idx(struct perf_event *event)
{
	int idx = event->hw.idx;

	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return 0;

	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
		idx -= INTEL_PMC_IDX_FIXED;
		idx |= 1 << 30;
	}

	return idx + 1;
}

static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (val > 2)
		return -EINVAL;

	if (x86_pmu.attr_rdpmc_broken)
		return -ENOTSUPP;

	if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
		/*
		 * Changing into or out of always available, aka
		 * perf-event-bypassing mode. This path is extremely slow,
		 * but only root can trigger it, so it's okay.
		 */
		if (val == 2)
			static_key_slow_inc(&rdpmc_always_available);
		else
			static_key_slow_dec(&rdpmc_always_available);
		on_each_cpu(refresh_pce, NULL, 1);
	}

	x86_pmu.attr_rdpmc = val;

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);

static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group = {
	.attrs = x86_pmu_attrs,
};

static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	&x86_pmu_events_group,
	NULL,
};

static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	if (x86_pmu.sched_task)
		x86_pmu.sched_task(ctx, sched_in);
}

void perf_check_microcode(void)
{
	if (x86_pmu.check_microcode)
		x86_pmu.check_microcode();
}
EXPORT_SYMBOL_GPL(perf_check_microcode);

static struct pmu pmu = {
	.pmu_enable = x86_pmu_enable,
	.pmu_disable = x86_pmu_disable,

	.attr_groups = x86_pmu_attr_groups,

	.event_init = x86_pmu_event_init,

	.event_mapped = x86_pmu_event_mapped,
	.event_unmapped = x86_pmu_event_unmapped,

	.add = x86_pmu_add,
	.del = x86_pmu_del,
	.start = x86_pmu_start,
	.stop = x86_pmu_stop,
	.read = x86_pmu_read,

	.start_txn = x86_pmu_start_txn,
	.cancel_txn = x86_pmu_cancel_txn,
	.commit_txn = x86_pmu_commit_txn,

	.event_idx = x86_pmu_event_idx,
	.sched_task = x86_pmu_sched_task,
	.task_ctx_size = sizeof(struct x86_perf_task_context),
};

void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
	struct cyc2ns_data *data;

	userpg->cap_user_time = 0;
	userpg->cap_user_time_zero = 0;
	userpg->cap_user_rdpmc =
		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!sched_clock_stable())
		return;

	data = cyc2ns_read_begin();

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data->cyc2ns_mul;
	userpg->time_shift = data->cyc2ns_shift;
	userpg->time_offset = data->cyc2ns_offset - now;

	/*
	 * cap_user_time_zero doesn't make sense when we're using a different
	 * time base for the records.
	 */
	if (event->clock == &local_clock) {
		userpg->cap_user_time_zero = 1;
		userpg->time_zero = data->cyc2ns_offset;
	}

	cyc2ns_read_end(data);
}

/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.stack = backtrace_stack,
	.address = backtrace_address,
	.walk_stack = print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}

static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
}

static unsigned long get_segment_base(unsigned int segment)
{
	struct desc_struct *desc;
	int idx = segment >> 3;

	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		struct ldt_struct *ldt;

		if (idx > LDT_ENTRIES)
			return 0;

		/* IRQs are off, so this synchronizes with smp_store_release */
		ldt = lockless_dereference(current->active_mm->context.ldt);
		if (!ldt || idx > ldt->size)
			return 0;

		desc = &ldt->entries[idx];
	} else {
		if (idx > GDT_ENTRIES)
			return 0;

		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
	}

	return get_desc_base(desc);
}

#ifdef CONFIG_COMPAT

#include <asm/compat.h>

static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	unsigned long ss_base, cs_base;
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	cs_base = get_segment_base(regs->cs);
	ss_base = get_segment_base(regs->ss);

	fp = compat_ptr(ss_base + regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != 0)
			break;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		perf_callchain_store(entry, cs_base + frame.return_address);
		fp = compat_ptr(ss_base + frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	/*
	 * We don't know what to do with VM86 stacks... ignore them for now.
	 */
	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
		return;

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!current->mm)
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != 0)
			break;

		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

/*
 * Deal with code segment offsets for the various execution modes:
 *
 *   VM86 - the good olde 16 bit days, where the linear address is
 *          20 bits and we use regs->ip + 0x10 * regs->cs.
 *
 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
 *          to figure out what the 32bit base address is.
 *
 *    X32 - has TIF_X32 set, but is running in x86_64
 *
 * X86_64 - CS,DS,SS,ES are all zero based.
 */
static unsigned long code_segment_base(struct pt_regs *regs)
{
	/*
	 * For IA32 we look at the GDT/LDT segment base to convert the
	 * effective IP to a linear address.
	 */

#ifdef CONFIG_X86_32
	/*
	 * If we are in VM86 mode, add the segment offset to convert to a
	 * linear address.
	 */
	if (regs->flags & X86_VM_MASK)
		return 0x10 * regs->cs;

	if (user_mode(regs) && regs->cs != __USER_CS)
		return get_segment_base(regs->cs);
#else
	if (user_mode(regs) && !user_64bit_mode(regs) &&
	    regs->cs != __USER32_CS)
		return get_segment_base(regs->cs);
#endif
	return 0;
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		return perf_guest_cbs->get_guest_ip();

	return regs->ip + code_segment_base(regs);
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}

void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version = x86_pmu.version;
	cap->num_counters_gp = x86_pmu.num_counters;
	cap->num_counters_fixed = x86_pmu.num_counters_fixed;
	cap->bit_width_gp = x86_pmu.cntval_bits;
	cap->bit_width_fixed = x86_pmu.cntval_bits;
	cap->events_mask = (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len = x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);