/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption,
 * however here we only expose the free running energy consumption
 * counters (pp0, pkg, ram, pp1).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  pp1 counter: consumption of the built-in gpu domain (clients only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must scale the raw count to Joules, e.g. with
 * ldexp(raw_count, -32), and then divide by the duration of the
 * measurement to obtain Watts.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

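/*
 * Illustrative user-space usage (not part of this driver): a minimal
 * sketch of how a tool could read one of these counters and apply the
 * 32.32 fixed-point conversion described above. The sysfs path follows
 * from the PMU being registered as "power" below, and config 0x2 is the
 * energy-pkg perf code from the list above. Error handling is omitted,
 * reading the counter normally requires root (perf_event_paranoid), and
 * the program must be linked with -lm for ldexp().
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <math.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/perf_event.h>
 *
 *	int main(void)
 *	{
 *		struct perf_event_attr attr = { 0 };
 *		uint64_t before, after;
 *		FILE *f;
 *		int type, fd;
 *
 *		f = fopen("/sys/bus/event_source/devices/power/type", "r");
 *		fscanf(f, "%d", &type);
 *		fclose(f);
 *
 *		attr.type   = type;
 *		attr.size   = sizeof(attr);
 *		attr.config = 0x2;
 *
 *		(system-wide counting: pid = -1, one cpu of the package)
 *		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
 *		read(fd, &before, sizeof(before));
 *		sleep(1);
 *		read(fd, &after, sizeof(after));
 *
 *		(32.32 fixed point -> Joules; Joules per second = Watts)
 *		printf("%.3f Joules over ~1s\n",
 *		       ldexp((double)(after - before), -32));
 *		return 0;
 *	}
 */
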
/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */

#define NR_RAPL_DOMAINS 0x4
static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
};

/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell and newer clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL

#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)

#define RAPL_EVENT_DESC(_name, _config)				\
{								\
	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
	.config	= _config,					\
}

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */

#define RAPL_EVENT_ATTR_STR(_name, v, str)				\
static struct perf_pmu_events_attr event_attr_##v = {			\
	.attr		= __ATTR(_name, 0444, rapl_sysfs_show, NULL),	\
	.id		= 0,						\
	.event_str	= str,						\
};

struct rapl_pmu {
	spinlock_t	 lock;
	int		 n_active;	 /* number of active events */
	struct list_head active_list;
	struct pmu	 *pmu;		 /* pointer to rapl_pmu_class */
	ktime_t		 timer_interval; /* in ktime_t unit */
	struct hrtimer	 hrtimer;
};

static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);

static struct x86_pmu_quirk *rapl_quirks;

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}

#define rapl_add_quirk(func_)						\
do {									\
	static struct x86_pmu_quirk __quirk __initdata = {		\
		.func = func_,						\
	};								\
	__quirk.next = rapl_quirks;					\
	rapl_quirks = &__quirk;						\
} while (0)

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/2^32 to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}

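/*
 * Worked example of the scaling above (illustrative numbers, assuming the
 * SandyBridge energy unit of 2^-16 Joules, i.e. rapl_hw_unit[] = 16):
 * rapl_scale() shifts a raw delta left by 32 - 16 = 16 bits, so one raw
 * count becomes 2^16 in 2^-32 Joule units. A raw delta of 100 counts is
 * therefore reported as 100 << 16 = 6553600, and user space recovers the
 * energy as ldexp(6553600, -32) = 100 * 2^-16 J, roughly 1.53 mJ. This is
 * why the sysfs scale attribute below is the same 2^-32 constant for every
 * domain, regardless of the per-domain hardware unit.
 */
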
static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	__hrtimer_start_range_ns(&pmu->hrtimer,
			pmu->timer_interval, 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_cancel(&pmu->hrtimer);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry) {
		rapl_event_update(event);
	}

	spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			rapl_stop_hrtimer(pmu);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PP1:
		bit = RAPL_IDX_PP1_NRG_STAT;
		msr = MSR_PP1_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}
	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

static ssize_t rapl_sysfs_show(struct device *dev,
			       struct device_attribute *attr,
			       char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	if (pmu_attr->event_str)
		return sprintf(page, "%s", pmu_attr->event_str);

	return 0;
}

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  , rapl_gpu, "event=0x04");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  , rapl_gpu_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute *rapl_events_hsw_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = NULL, /* patched at runtime */
};

DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};

static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}
	/*
	 * clear cpu from cpumask
	 * if was set in cpumask and still some cpu on package,
	 * then move to new cpu
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);

	/* cancel overflow polling timer for CPU */
	rapl_stop_hrtimer(pmu);
}

static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}

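/*
 * Illustrative note (not part of this driver): because the counters are
 * per-package, rapl_cpu_init()/rapl_cpu_exit() above keep exactly one
 * online CPU per physical package in rapl_cpu_mask, and that set is
 * exported through the "cpumask" sysfs attribute. A hypothetical tool
 * would read it and open one event per listed CPU rather than per
 * logical CPU, roughly:
 *
 *	FILE *f = fopen("/sys/bus/event_source/devices/power/cpumask", "r");
 *	char buf[256];
 *
 *	if (f && fgets(buf, sizeof(buf), f))
 *		printf("open one event on each of: %s", buf);
 *	if (f)
 *		fclose(f);
 *
 * The perf tool applies the same cpumask handling when given an event
 * such as power/energy-pkg/.
 */
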
static __init void rapl_hsw_server_quirk(void)
{
	/*
	 * DRAM domain on HSW server has fixed energy unit which can be
	 * different from the unit reported by the power unit MSR.
	 * See "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families,
	 * V2 of 2. Datasheet, September 2014, Reference Number: 330784-001"
	 */
	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
}

static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);
	u64 ms;

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;
	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	pmu->pmu = &rapl_pmu_class;

	/*
	 * use reference of 200W for scaling the timeout
	 * to avoid missing counter overflows.
	 * 200W = 200 Joules/sec
	 * divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	if (rapl_hw_unit[0] < 32)
		ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
	else
		ms = 2;

	pmu->timer_interval = ms_to_ktime(ms);

	rapl_hrtimer_init(pmu);

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;
	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}

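/*
 * Worked example of the interval computation above (illustrative, assuming
 * the common energy unit of 2^-16 Joules, i.e. rapl_hw_unit[0] = 16): the
 * 32-bit counter wraps after 2^32 * 2^-16 J = 65536 J, which at the 200 W
 * reference takes 65536 / 200, roughly 328 seconds. Halving that for margin
 * gives about 164 s, and indeed
 *	ms = (1000 / (2 * 100)) * (1ULL << (32 - 16 - 1))
 *	   = 5 * 32768 = 163840 ms.
 * With the finest possible unit of 2^-32 J the counter wraps after only
 * 1 J, i.e. 5 ms at 200 W, hence the ~2 ms fallback.
 */
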
static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}

static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;

	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}

static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static int rapl_check_hw_unit(void)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	return 0;
}

static const struct x86_cpu_id rapl_cpu_match[] = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};

static int __init rapl_pmu_init(void)
{
	struct rapl_pmu *pmu;
	int cpu, ret;
	struct x86_pmu_quirk *quirk;
	int i;

	/*
	 * check for Intel processor family 6
	 */
	if (!x86_match_cpu(rapl_cpu_match))
		return 0;

	/* check supported CPU */
	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 63: /* Haswell-Server */
		rapl_add_quirk(rapl_hsw_server_quirk);
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	case 60: /* Haswell */
	case 69: /* Haswell-ULT */
	case 61: /* Broadwell */
		rapl_cntr_mask = RAPL_IDX_HSW;
		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;

	default:
		/* unsupported */
		return 0;
	}
	ret = rapl_check_hw_unit();
	if (ret)
		return ret;

	/* run cpu model quirks */
	for (quirk = rapl_quirks; quirk; quirk = quirk->next)
		quirk->func();
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		ret = rapl_cpu_prepare(cpu);
		if (ret)
			goto out;
		rapl_cpu_init(cpu);
	}

	__perf_cpu_notifier(rapl_cpu_notifier);

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
		cpu_notifier_register_done();
		return -1;
	}

	pmu = __this_cpu_read(rapl_pmu);

	pr_info("RAPL PMU detected,"
		" API unit is 2^-32 Joules,"
		" %d fixed counters,"
		" %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask),
		ktime_to_ms(pmu->timer_interval));
	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
out:
	cpu_notifier_register_done();

	return 0;
}
device_initcall(rapl_pmu_init);