/*
 * Intel Cache Quality-of-Service Monitoring (CQM) support.
 *
 * Based very, very heavily on work by Peter Zijlstra.
 */

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

#define MSR_IA32_PQR_ASSOC	0x0c8f
#define MSR_IA32_QM_CTR		0x0c8e
#define MSR_IA32_QM_EVTSEL	0x0c8d

static unsigned int cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */

struct intel_cqm_state {
        raw_spinlock_t  lock;
        int             rmid;
        int             cnt;
};

static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);

/*
 * Protects cache_groups, cqm_rmid_free_lru and cqm_rmid_limbo_lru.
 * Also protects event->hw.cqm_rmid
 *
 * Hold either for stability, both for modification of ->hw.cqm_rmid.
 */
static DEFINE_MUTEX(cache_mutex);
static DEFINE_RAW_SPINLOCK(cache_lock);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one CPU per socket.
 */
static cpumask_t cqm_cpumask;

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)

#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID

/*
 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
 *
 * This rmid is always free and is guaranteed to have an associated
 * near-zero occupancy value, i.e. no cachelines are tagged with this
 * RMID, once __intel_cqm_rmid_rotate() returns.
 */
static unsigned int intel_cqm_rotation_rmid;

#define INVALID_RMID		(-1)

/*
 * Is @rmid valid for programming the hardware?
 *
 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
 * means that we should never come across an rmid with that value.
 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
 * assigned" and is used as part of the rotation code.
 */
static inline bool __rmid_valid(unsigned int rmid)
{
        if (!rmid || rmid == INVALID_RMID)
                return false;

        return true;
}

static u64 __rmid_read(unsigned int rmid)
{
        u64 val;

        /*
         * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
         * it just says that to increase confusion.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
        rdmsrl(MSR_IA32_QM_CTR, val);

        /*
         * Aside from the ERROR and UNAVAIL bits, assume this thing returns
         * the number of cachelines tagged with @rmid.
         */
        return val;
}
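
/*
 * Note on units: __rmid_read() returns a count of tagged cachelines, not
 * bytes. As a purely illustrative example (64 is only a typical
 * x86_cache_occ_scale, not something this file assumes), a raw reading of
 * 16384 with cqm_l3_scale == 64 corresponds to 16384 * 64 = 1MiB of LLC
 * occupancy. The driver reports the raw count and exports cqm_l3_scale via
 * the event's "scale" attribute so userspace can do the conversion.
 */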

enum rmid_recycle_state {
        RMID_YOUNG = 0,
        RMID_AVAILABLE,
        RMID_DIRTY,
};

struct cqm_rmid_entry {
        unsigned int rmid;
        enum rmid_recycle_state state;
        struct list_head list;
        unsigned long queue_time;
};

/*
 * cqm_rmid_free_lru - A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries off the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 *
 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
 *
 * This list contains RMIDs that no one is currently using but that
 * may have a non-zero occupancy value associated with them. The
 * rotation worker moves RMIDs from the limbo list to the free list once
 * the occupancy value drops below __intel_cqm_threshold.
 *
 * Both lists are protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_free_lru);
static LIST_HEAD(cqm_rmid_limbo_lru);

/*
 * We use a simple array of pointers so that we can look up a struct
 * cqm_rmid_entry in O(1). This spares the callers of __get_rmid() and
 * __put_rmid() from having to deal with struct cqm_rmid_entry - they
 * just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;

static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
{
        struct cqm_rmid_entry *entry;

        entry = cqm_rmid_ptrs[rmid];
        WARN_ON(entry->rmid != rmid);

        return entry;
}

/*
 * Returns < 0 on failure.
 *
 * We expect to be called with cache_mutex held.
 */
static int __get_rmid(void)
{
        struct cqm_rmid_entry *entry;

        lockdep_assert_held(&cache_mutex);

        if (list_empty(&cqm_rmid_free_lru))
                return INVALID_RMID;

        entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
        list_del(&entry->list);

        return entry->rmid;
}

static void __put_rmid(unsigned int rmid)
{
        struct cqm_rmid_entry *entry;

        lockdep_assert_held(&cache_mutex);

        WARN_ON(!__rmid_valid(rmid));
        entry = __rmid_entry(rmid);

        entry->queue_time = jiffies;
        entry->state = RMID_YOUNG;

        list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}

static int intel_cqm_setup_rmid_cache(void)
{
        struct cqm_rmid_entry *entry;
        unsigned int nr_rmids;
        int r = 0;

        nr_rmids = cqm_max_rmid + 1;
        cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
                                nr_rmids, GFP_KERNEL);
        if (!cqm_rmid_ptrs)
                return -ENOMEM;

        for (; r <= cqm_max_rmid; r++) {
                struct cqm_rmid_entry *entry;

                entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry)
                        goto fail;

                INIT_LIST_HEAD(&entry->list);
                entry->rmid = r;
                cqm_rmid_ptrs[r] = entry;

                list_add_tail(&entry->list, &cqm_rmid_free_lru);
        }

        /*
         * RMID 0 is special and is always allocated. It's used for all
         * tasks that are not monitored.
         */
        entry = __rmid_entry(0);
        list_del(&entry->list);

        mutex_lock(&cache_mutex);
        intel_cqm_rotation_rmid = __get_rmid();
        mutex_unlock(&cache_mutex);

        return 0;
fail:
        while (r--)
                kfree(cqm_rmid_ptrs[r]);

        kfree(cqm_rmid_ptrs);
        return -ENOMEM;
}

/*
 * Determine if @a and @b measure the same set of tasks.
 *
 * If @a and @b measure the same set of tasks then we want to share a
 * single RMID.
 */
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
        /* Per-cpu and task events don't mix */
        if ((a->attach_state & PERF_ATTACH_TASK) !=
            (b->attach_state & PERF_ATTACH_TASK))
                return false;

#ifdef CONFIG_CGROUP_PERF
        if (a->cgrp != b->cgrp)
                return false;
#endif

        /* If it's not a task event, we're machine wide */
        if (!(b->attach_state & PERF_ATTACH_TASK))
                return true;

        /*
         * Events that target same task are placed into the same cache group.
         */
        if (a->hw.target == b->hw.target)
                return true;

        /*
         * Are we an inherited event?
         */
        if (b->parent == a)
                return true;

        return false;
}

#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
        if (event->attach_state & PERF_ATTACH_TASK)
                return perf_cgroup_from_task(event->hw.target);

        return event->cgrp;
}
#endif

/*
 * Determine if @a's tasks intersect with @b's tasks
 *
 * There are combinations of events that we explicitly prohibit,
 *
 *                 PROHIBITS
 *   system-wide  ->  cgroup and task
 *   cgroup       ->  system-wide
 *                ->  task in cgroup
 *   task         ->  system-wide
 *                ->  task in cgroup
 *
 * Call this function before allocating an RMID.
 */
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
#ifdef CONFIG_CGROUP_PERF
        /*
         * We can have any number of cgroups but only one system-wide
         * event at a time.
         */
        if (a->cgrp && b->cgrp) {
                struct perf_cgroup *ac = a->cgrp;
                struct perf_cgroup *bc = b->cgrp;

                /*
                 * This condition should have been caught in
                 * __match_event() and we should be sharing an RMID.
                 */
                WARN_ON_ONCE(ac == bc);

                if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
                    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
                        return true;

                return false;
        }

        if (a->cgrp || b->cgrp) {
                struct perf_cgroup *ac, *bc;

                /*
                 * cgroup and system-wide events are mutually exclusive
                 */
                if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
                    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
                        return true;

                /*
                 * Ensure neither event is part of the other's cgroup
                 */
                ac = event_to_cgroup(a);
                bc = event_to_cgroup(b);
                if (ac == bc)
                        return true;

                /*
                 * Must have cgroup and non-intersecting task events.
                 */
                if (!ac || !bc)
                        return false;

                /*
                 * We have cgroup and task events, and the task belongs
                 * to a cgroup. Check for overlap.
                 */
                if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
                    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
                        return true;

                return false;
        }
#endif
        /*
         * If one of them is not a task, same story as above with cgroups.
         */
        if (!(a->attach_state & PERF_ATTACH_TASK) ||
            !(b->attach_state & PERF_ATTACH_TASK))
                return true;

        /*
         * Must be non-overlapping.
         */
        return false;
}

struct rmid_read {
        unsigned int rmid;
        atomic64_t value;
};

static void __intel_cqm_event_count(void *info);

/*
 * Exchange the RMID of a group of events.
 */
static unsigned int
intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
{
        struct perf_event *event;
        unsigned int old_rmid = group->hw.cqm_rmid;
        struct list_head *head = &group->hw.cqm_group_entry;

        lockdep_assert_held(&cache_mutex);

        /*
         * If our RMID is being deallocated, perform a read now.
         */
        if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
                struct rmid_read rr = {
                        .value = ATOMIC64_INIT(0),
                        .rmid = old_rmid,
                };

                on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
                                 &rr, 1);
                local64_set(&group->count, atomic64_read(&rr.value));
        }

        raw_spin_lock_irq(&cache_lock);

        group->hw.cqm_rmid = rmid;
        list_for_each_entry(event, head, hw.cqm_group_entry)
                event->hw.cqm_rmid = rmid;

        raw_spin_unlock_irq(&cache_lock);

        return old_rmid;
}

/*
 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
 * cachelines are still tagged with RMIDs in limbo, we progressively
 * increment the threshold until we find an RMID in limbo with <=
 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
 * problem where cachelines tagged with an RMID are not steadily being
 * evicted.
 *
 * On successful rotations we decrease the threshold back towards zero.
 *
 * __intel_cqm_max_threshold provides an upper bound on the threshold,
 * and is measured in bytes because it's exposed to userland.
 */
static unsigned int __intel_cqm_threshold;
static unsigned int __intel_cqm_max_threshold;

/*
 * Check whether the RMIDs on the limbo list have dropped to or below
 * __intel_cqm_threshold occupancy on this cpu; those still above the
 * threshold are marked dirty.
 */
static void intel_cqm_stable(void *arg)
{
        struct cqm_rmid_entry *entry;

        list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
                if (entry->state != RMID_AVAILABLE)
                        break;

                if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
                        entry->state = RMID_DIRTY;
        }
}

/*
 * If we have group events waiting for an RMID that don't conflict with
 * events already running, assign @rmid.
 */
static bool intel_cqm_sched_in_event(unsigned int rmid)
{
        struct perf_event *leader, *event;

        lockdep_assert_held(&cache_mutex);

        leader = list_first_entry(&cache_groups, struct perf_event,
                                  hw.cqm_groups_entry);
        event = leader;

        list_for_each_entry_continue(event, &cache_groups,
                                     hw.cqm_groups_entry) {
                if (__rmid_valid(event->hw.cqm_rmid))
                        continue;

                if (__conflict_event(event, leader))
                        continue;

                intel_cqm_xchg_rmid(event, rmid);
                return true;
        }

        return false;
}

/*
 * Initially use this constant for both the limbo queue time and the
 * rotation timer interval, pmu::hrtimer_interval_ms.
 *
 * They don't need to be the same, but the two are related since if you
 * rotate faster than you recycle RMIDs, you may run out of available
 * RMIDs.
 */
#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */

static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;

/*
 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 * @available: number of freeable RMIDs on the limbo list
 *
 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
 * cachelines are tagged with those RMIDs. After this we can reuse them
 * and know that the current set of active RMIDs is stable.
 *
 * Return %true or %false depending on whether stabilization needs to be
 * reattempted.
 *
 * If we return %true then @available is updated to indicate the
 * number of RMIDs on the limbo list that have been queued for the
 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
 * are above __intel_cqm_threshold.
 */
static bool intel_cqm_rmid_stabilize(unsigned int *available)
{
        struct cqm_rmid_entry *entry, *tmp;

        lockdep_assert_held(&cache_mutex);

        *available = 0;
        list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
                unsigned long min_queue_time;
                unsigned long now = jiffies;

                /*
                 * We hold RMIDs placed into limbo for a minimum queue
                 * time. Before the minimum queue time has elapsed we do
                 * not recycle RMIDs.
                 *
                 * The reasoning is that until a sufficient time has
                 * passed since we stopped using an RMID, any RMID
                 * placed onto the limbo list will likely still have
                 * data tagged in the cache, which means we'll probably
                 * fail to recycle it anyway.
                 *
                 * We can save ourselves an expensive IPI by skipping
                 * any RMIDs that have not been queued for the minimum
                 * time.
                 */
                min_queue_time = entry->queue_time +
                        msecs_to_jiffies(__rmid_queue_time_ms);

                if (time_after(min_queue_time, now))
                        break;

                entry->state = RMID_AVAILABLE;
                (*available)++;
        }

        /*
         * Fast return if none of the RMIDs on the limbo list have been
         * sitting on the queue for the minimum queue time.
         */
        if (!*available)
                return false;

        /*
         * Test whether an RMID is free for each package.
         */
        on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

        list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
                /*
                 * Exhausted all RMIDs that have waited min queue time.
                 */
                if (entry->state == RMID_YOUNG)
                        break;

                if (entry->state == RMID_DIRTY)
                        continue;

                list_del(&entry->list);	/* remove from limbo */

                /*
                 * The rotation RMID gets priority if it's currently
                 * invalid, in which case we skip adding this RMID to
                 * the free lru.
                 */
                if (!__rmid_valid(intel_cqm_rotation_rmid)) {
                        intel_cqm_rotation_rmid = entry->rmid;
                        continue;
                }

                /*
                 * If we have groups waiting for RMIDs, hand
                 * them one now provided they don't conflict.
                 */
                if (intel_cqm_sched_in_event(entry->rmid))
                        continue;

                /*
                 * Otherwise place it onto the free list.
                 */
                list_add_tail(&entry->list, &cqm_rmid_free_lru);
        }

        return __rmid_valid(intel_cqm_rotation_rmid);
}

/*
 * Pick a victim group and move it to the tail of the group list.
 * @next: The first group without an RMID
 */
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
        struct perf_event *rotor;
        unsigned int rmid;

        lockdep_assert_held(&cache_mutex);

        rotor = list_first_entry(&cache_groups, struct perf_event,
                                 hw.cqm_groups_entry);

        /*
         * The group at the front of the list should always have a valid
         * RMID. If it doesn't then no groups have RMIDs assigned and we
         * don't need to rotate the list.
         */
        if (next == rotor)
                return;

        rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
        __put_rmid(rmid);

        list_rotate_left(&cache_groups);
}

/*
 * Deallocate the RMIDs from any events that conflict with @event, and
 * place them on the back of the group list.
 */
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
        struct perf_event *group, *g;
        unsigned int rmid;

        lockdep_assert_held(&cache_mutex);

        list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
                if (group == event)
                        continue;

                rmid = group->hw.cqm_rmid;

                /*
                 * Skip events that don't have a valid RMID.
                 */
                if (!__rmid_valid(rmid))
                        continue;

                /*
                 * No conflict? No problem! Leave the event alone.
                 */
                if (!__conflict_event(group, event))
                        continue;

                intel_cqm_xchg_rmid(group, INVALID_RMID);
                __put_rmid(rmid);
        }
}

/*
 * Attempt to rotate the groups and assign new RMIDs.
 *
 * We rotate for two reasons,
 *   1. To handle the scheduling of conflicting events
 *   2. To recycle RMIDs
 *
 * Rotating RMIDs is complicated because the hardware doesn't give us
 * any clues.
 *
 * There are problems with the hardware interface; when you change the
 * task:RMID map cachelines retain their 'old' tags, giving a skewed
 * picture. In order to work around this, we must always keep one free
 * RMID - intel_cqm_rotation_rmid.
 *
 * Rotation works by taking away an RMID from a group (the old RMID),
 * and assigning the free RMID to another group (the new RMID). We must
 * then wait for the old RMID to not be used (no cachelines tagged).
 * This ensures that all cachelines are tagged with 'active' RMIDs. At
 * this point we can start reading values for the new RMID and treat the
 * old RMID as the free RMID for the next rotation.
 *
 * Return %true or %false depending on whether we did any rotating.
 */
static bool __intel_cqm_rmid_rotate(void)
{
        struct perf_event *group, *start = NULL;
        unsigned int threshold_limit;
        unsigned int nr_needed = 0;
        unsigned int nr_available;
        bool rotated = false;

        mutex_lock(&cache_mutex);

again:
        /*
         * Fast path through this function if there are no groups and no
         * RMIDs that need cleaning.
         */
        if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
                goto out;

        list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
                if (!__rmid_valid(group->hw.cqm_rmid)) {
                        if (!start)
                                start = group;
                        nr_needed++;
                }
        }

        /*
         * We have some event groups, but they all have RMIDs assigned
         * and no RMIDs need cleaning.
         */
        if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
                goto out;

        if (!nr_needed)
                goto stabilize;

        /*
         * We have more event groups without RMIDs than available RMIDs,
         * or we have event groups that conflict with the ones currently
         * scheduled.
         *
         * We force deallocate the rmid of the group at the head of
         * cache_groups. The first event group without an RMID then gets
         * assigned intel_cqm_rotation_rmid. This ensures we always make
         * forward progress.
         *
         * Rotate the cache_groups list so the previous head is now the
         * tail.
         */
        __intel_cqm_pick_and_rotate(start);

        /*
         * If the rotation is going to succeed, reduce the threshold so
         * that we don't needlessly reuse dirty RMIDs.
         */
        if (__rmid_valid(intel_cqm_rotation_rmid)) {
                intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
                intel_cqm_rotation_rmid = __get_rmid();

                intel_cqm_sched_out_conflicting_events(start);

                if (__intel_cqm_threshold)
                        __intel_cqm_threshold--;
        }

        rotated = true;

stabilize:
        /*
         * We now need to stabilize the RMID we freed above (if any) to
         * ensure that the next time we rotate we have an RMID with zero
         * occupancy value.
         *
         * Alternatively, if we didn't need to perform any rotation,
         * we'll have a bunch of RMIDs in limbo that need stabilizing.
         */
        threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;

        while (intel_cqm_rmid_stabilize(&nr_available) &&
               __intel_cqm_threshold < threshold_limit) {
                unsigned int steal_limit;

                /*
                 * Don't spin if nobody is actively waiting for an RMID,
                 * the rotation worker will be kicked as soon as an
                 * event needs an RMID anyway.
                 */
                if (!nr_needed)
                        break;

                /* Allow max 25% of RMIDs to be in limbo. */
                steal_limit = (cqm_max_rmid + 1) / 4;

                /*
                 * We failed to stabilize any RMIDs so our rotation
                 * logic is now stuck. In order to make forward progress
                 * we have a few options:
                 *
                 * 1. rotate ("steal") another RMID
                 * 2. increase the threshold
                 * 3. do nothing
                 *
                 * We do both of 1. and 2. until we hit the steal limit.
                 *
                 * The steal limit prevents all RMIDs ending up on the
                 * limbo list. This can happen if every RMID has a
                 * non-zero occupancy above threshold_limit, and the
                 * occupancy values aren't dropping fast enough.
                 *
                 * Note that there is prioritisation at work here - we'd
                 * rather increase the number of RMIDs on the limbo list
                 * than increase the threshold, because increasing the
                 * threshold skews the event data (because we reuse
                 * dirty RMIDs) - threshold bumps are a last resort.
                 */
                if (nr_available < steal_limit)
                        goto again;

                __intel_cqm_threshold++;
        }

out:
        mutex_unlock(&cache_mutex);
        return rotated;
}
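
/*
 * Illustrative numbers for the limits used above (the real RMID count
 * comes from CPUID; these values are only an example): with
 * cqm_max_rmid == 55, i.e. 56 RMIDs, steal_limit = 56 / 4 = 14, so once
 * 14 RMIDs have sat in limbo for the minimum queue time we stop stealing
 * further RMIDs and start bumping __intel_cqm_threshold instead.
 * __intel_cqm_max_threshold is kept in bytes for userspace; dividing by
 * cqm_l3_scale converts it into the cacheline-granular threshold_limit
 * used by the loop.
 */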

static void intel_cqm_rmid_rotate(struct work_struct *work);

static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);

static struct pmu intel_cqm_pmu;

static void intel_cqm_rmid_rotate(struct work_struct *work)
{
        unsigned long delay;

        __intel_cqm_rmid_rotate();

        delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
        schedule_delayed_work(&intel_cqm_rmid_work, delay);
}

/*
 * Find a group and setup RMID.
 *
 * If we're part of a group, we use the group's RMID.
 */
static void intel_cqm_setup_event(struct perf_event *event,
                                  struct perf_event **group)
{
        struct perf_event *iter;
        unsigned int rmid;
        bool conflict = false;

        list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
                rmid = iter->hw.cqm_rmid;

                if (__match_event(iter, event)) {
                        /* All tasks in a group share an RMID */
                        event->hw.cqm_rmid = rmid;
                        *group = iter;
                        return;
                }

                /*
                 * We only care about conflicts for events that are
                 * actually scheduled in (and hence have a valid RMID).
                 */
                if (__conflict_event(iter, event) && __rmid_valid(rmid))
                        conflict = true;
        }

        if (conflict)
                rmid = INVALID_RMID;
        else
                rmid = __get_rmid();

        event->hw.cqm_rmid = rmid;
}

static void intel_cqm_event_read(struct perf_event *event)
{
        unsigned long flags;
        unsigned int rmid;
        u64 val;

        /*
         * Task events are handled by intel_cqm_event_count().
         */
        if (event->cpu == -1)
                return;

        raw_spin_lock_irqsave(&cache_lock, flags);
        rmid = event->hw.cqm_rmid;

        if (!__rmid_valid(rmid))
                goto out;

        val = __rmid_read(rmid);

        /*
         * Ignore this reading on error states and do not update the value.
         */
        if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                goto out;

        local64_set(&event->count, val);
out:
        raw_spin_unlock_irqrestore(&cache_lock, flags);
}

static void __intel_cqm_event_count(void *info)
{
        struct rmid_read *rr = info;
        u64 val;

        val = __rmid_read(rr->rmid);

        if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                return;

        atomic64_add(val, &rr->value);
}

static inline bool cqm_group_leader(struct perf_event *event)
{
        return !list_empty(&event->hw.cqm_groups_entry);
}

static u64 intel_cqm_event_count(struct perf_event *event)
{
        unsigned long flags;
        struct rmid_read rr = {
                .value = ATOMIC64_INIT(0),
        };

        /*
         * We only need to worry about task events. System-wide events
         * are handled like usual, i.e. entirely with
         * intel_cqm_event_read().
         */
        if (event->cpu != -1)
                return __perf_event_count(event);

        /*
         * Only the group leader gets to report values. This stops us
         * reporting duplicate values to userspace, and gives us a clear
         * rule for which task gets to report the values.
         *
         * Note that it is impossible to attribute these values to
         * specific packages - we forfeit that ability when we create
         * task events.
         */
        if (!cqm_group_leader(event))
                return 0;

        /*
         * Getting up-to-date values requires an SMP IPI which is not
         * possible if we're being called in interrupt context. Return
         * the cached values instead.
         */
        if (unlikely(in_interrupt()))
                goto out;

        /*
         * Notice that we don't perform the reading of an RMID
         * atomically, because we can't hold a spin lock across the
         * IPIs.
         *
         * Speculatively perform the read, since @event might be
         * assigned a different (possibly invalid) RMID while we're
         * busy performing the IPI calls. It's therefore necessary to
         * check @event's RMID afterwards, and if it has changed,
         * discard the result of the read.
         */
        rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);

        if (!__rmid_valid(rr.rmid))
                goto out;

        on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);

        raw_spin_lock_irqsave(&cache_lock, flags);
        if (event->hw.cqm_rmid == rr.rmid)
                local64_set(&event->count, atomic64_read(&rr.value));
        raw_spin_unlock_irqrestore(&cache_lock, flags);
out:
        return __perf_event_count(event);
}

static void intel_cqm_event_start(struct perf_event *event, int mode)
{
        struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
        unsigned int rmid = event->hw.cqm_rmid;
        unsigned long flags;

        if (!(event->hw.cqm_state & PERF_HES_STOPPED))
                return;

        event->hw.cqm_state &= ~PERF_HES_STOPPED;

        raw_spin_lock_irqsave(&state->lock, flags);

        if (state->cnt++)
                WARN_ON_ONCE(state->rmid != rmid);
        else
                WARN_ON_ONCE(state->rmid);

        state->rmid = rmid;
        wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);

        raw_spin_unlock_irqrestore(&state->lock, flags);
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
        struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
        unsigned long flags;

        if (event->hw.cqm_state & PERF_HES_STOPPED)
                return;

        event->hw.cqm_state |= PERF_HES_STOPPED;

        raw_spin_lock_irqsave(&state->lock, flags);
        intel_cqm_event_read(event);

        if (!--state->cnt) {
                state->rmid = 0;
                wrmsrl(MSR_IA32_PQR_ASSOC, 0);
        } else {
                WARN_ON_ONCE(!state->rmid);
        }

        raw_spin_unlock_irqrestore(&state->lock, flags);
}

static int intel_cqm_event_add(struct perf_event *event, int mode)
{
        unsigned long flags;
        unsigned int rmid;

        raw_spin_lock_irqsave(&cache_lock, flags);

        event->hw.cqm_state = PERF_HES_STOPPED;
        rmid = event->hw.cqm_rmid;

        if (__rmid_valid(rmid) && (mode & PERF_EF_START))
                intel_cqm_event_start(event, mode);

        raw_spin_unlock_irqrestore(&cache_lock, flags);

        return 0;
}

static void intel_cqm_event_del(struct perf_event *event, int mode)
{
        intel_cqm_event_stop(event, mode);
}

static void intel_cqm_event_destroy(struct perf_event *event)
{
        struct perf_event *group_other = NULL;

        mutex_lock(&cache_mutex);

        /*
         * If there's another event in this group...
         */
        if (!list_empty(&event->hw.cqm_group_entry)) {
                group_other = list_first_entry(&event->hw.cqm_group_entry,
                                               struct perf_event,
                                               hw.cqm_group_entry);
                list_del(&event->hw.cqm_group_entry);
        }

        /*
         * And we're the group leader..
         */
        if (cqm_group_leader(event)) {
                /*
                 * If there was a group_other, make that leader, otherwise
                 * destroy the group and return the RMID.
                 */
                if (group_other) {
                        list_replace(&event->hw.cqm_groups_entry,
                                     &group_other->hw.cqm_groups_entry);
                } else {
                        unsigned int rmid = event->hw.cqm_rmid;

                        if (__rmid_valid(rmid))
                                __put_rmid(rmid);
                        list_del(&event->hw.cqm_groups_entry);
                }
        }

        mutex_unlock(&cache_mutex);
}

static int intel_cqm_event_init(struct perf_event *event)
{
        struct perf_event *group = NULL;
        bool rotate = false;

        if (event->attr.type != intel_cqm_pmu.type)
                return -ENOENT;

        if (event->attr.config & ~QOS_EVENT_MASK)
                return -EINVAL;

        /* unsupported modes and filters */
        if (event->attr.exclude_user   ||
            event->attr.exclude_kernel ||
            event->attr.exclude_hv     ||
            event->attr.exclude_idle   ||
            event->attr.exclude_host   ||
            event->attr.exclude_guest  ||
            event->attr.sample_period) /* no sampling */
                return -EINVAL;

        INIT_LIST_HEAD(&event->hw.cqm_group_entry);
        INIT_LIST_HEAD(&event->hw.cqm_groups_entry);

        event->destroy = intel_cqm_event_destroy;

        mutex_lock(&cache_mutex);

        /* Will also set rmid */
        intel_cqm_setup_event(event, &group);

        if (group) {
                list_add_tail(&event->hw.cqm_group_entry,
                              &group->hw.cqm_group_entry);
        } else {
                list_add_tail(&event->hw.cqm_groups_entry,
                              &cache_groups);

                /*
                 * All RMIDs are either in use or have recently been
                 * used. Kick the rotation worker to clean/free some.
                 *
                 * We only do this for the group leader, rather than for
                 * every event in a group to save on needless work.
                 */
                if (!__rmid_valid(event->hw.cqm_rmid))
                        rotate = true;
        }

        mutex_unlock(&cache_mutex);

        if (rotate)
                schedule_delayed_work(&intel_cqm_rmid_work, 0);

        return 0;
}

EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

static struct attribute *intel_cqm_events_attr[] = {
        EVENT_PTR(intel_cqm_llc),
        EVENT_PTR(intel_cqm_llc_pkg),
        EVENT_PTR(intel_cqm_llc_unit),
        EVENT_PTR(intel_cqm_llc_scale),
        EVENT_PTR(intel_cqm_llc_snapshot),
        NULL,
};

static struct attribute_group intel_cqm_events_group = {
        .name = "events",
        .attrs = intel_cqm_events_attr,
};
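
/*
 * Example userspace usage (illustrative only; it relies on the perf tool
 * resolving the "intel_cqm" PMU and the llc_occupancy alias exported
 * above):
 *
 *   perf stat -a -e intel_cqm/llc_occupancy/ -- sleep 1
 *
 * The "Bytes" unit and the scale string (set to cqm_l3_scale at init
 * time) let the tool turn the raw cacheline count into bytes.
 */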

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

static struct attribute_group intel_cqm_format_group = {
        .name = "format",
        .attrs = intel_cqm_formats_attr,
};

static ssize_t
max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
                           char *page)
{
        ssize_t rv;

        mutex_lock(&cache_mutex);
        rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
        mutex_unlock(&cache_mutex);

        return rv;
}

static ssize_t
max_recycle_threshold_store(struct device *dev,
                            struct device_attribute *attr,
                            const char *buf, size_t count)
{
        unsigned int bytes, cachelines;
        int ret;

        ret = kstrtouint(buf, 0, &bytes);
        if (ret)
                return ret;

        mutex_lock(&cache_mutex);

        __intel_cqm_max_threshold = bytes;
        cachelines = bytes / cqm_l3_scale;

        /*
         * The new maximum takes effect immediately.
         */
        if (__intel_cqm_threshold > cachelines)
                __intel_cqm_threshold = cachelines;

        mutex_unlock(&cache_mutex);

        return count;
}

static DEVICE_ATTR_RW(max_recycle_threshold);

static struct attribute *intel_cqm_attrs[] = {
        &dev_attr_max_recycle_threshold.attr,
        NULL,
};

static const struct attribute_group intel_cqm_group = {
        .attrs = intel_cqm_attrs,
};

static const struct attribute_group *intel_cqm_attr_groups[] = {
        &intel_cqm_events_group,
        &intel_cqm_format_group,
        &intel_cqm_group,
        NULL,
};

static struct pmu intel_cqm_pmu = {
        .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
        .attr_groups         = intel_cqm_attr_groups,
        .task_ctx_nr         = perf_sw_context,
        .event_init          = intel_cqm_event_init,
        .add                 = intel_cqm_event_add,
        .del                 = intel_cqm_event_del,
        .start               = intel_cqm_event_start,
        .stop                = intel_cqm_event_stop,
        .read                = intel_cqm_event_read,
        .count               = intel_cqm_event_count,
};

static inline void cqm_pick_event_reader(int cpu)
{
        int phys_id = topology_physical_package_id(cpu);
        int i;

        for_each_cpu(i, &cqm_cpumask) {
                if (phys_id == topology_physical_package_id(i))
                        return;	/* already got reader for this socket */
        }

        cpumask_set_cpu(cpu, &cqm_cpumask);
}

static void intel_cqm_cpu_prepare(unsigned int cpu)
{
        struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
        struct cpuinfo_x86 *c = &cpu_data(cpu);

        raw_spin_lock_init(&state->lock);
        state->rmid = 0;
        state->cnt = 0;

        WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
        WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
        int phys_id = topology_physical_package_id(cpu);
        int i;

        /*
         * Is @cpu a designated cqm reader?
         */
        if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
                return;

        for_each_online_cpu(i) {
                if (i == cpu)
                        continue;

                if (phys_id == topology_physical_package_id(i)) {
                        cpumask_set_cpu(i, &cqm_cpumask);
                        break;
                }
        }
}
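
/*
 * Example with a hypothetical topology (not something the code assumes):
 * with CPUs 0-3 on package 0 and CPUs 4-7 on package 1, cqm_cpumask ends
 * up containing one reader CPU per package, e.g. { 0, 4 }. If CPU 0 is
 * then taken offline, intel_cqm_cpu_exit() hands the reader role to
 * another online CPU in package 0, e.g. CPU 1.
 */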

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
                                  unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                intel_cqm_cpu_prepare(cpu);
                break;
        case CPU_DOWN_PREPARE:
                intel_cqm_cpu_exit(cpu);
                break;
        case CPU_STARTING:
                cqm_pick_event_reader(cpu);
                break;
        }

        return NOTIFY_OK;
}

static const struct x86_cpu_id intel_cqm_match[] = {
        { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
        {}
};

static int __init intel_cqm_init(void)
{
        char *str, scale[20];
        int i, cpu, ret;

        if (!x86_match_cpu(intel_cqm_match))
                return -ENODEV;

        cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

        /*
         * It's possible that not all resources support the same number
         * of RMIDs. Instead of making scheduling much more complicated
         * (where we have to match a task's RMID to a cpu that supports
         * that many RMIDs) just find the minimum number of RMIDs
         * supported across all cpus.
         *
         * Also, check that the scales match on all cpus.
         */
        cpu_notifier_register_begin();

        for_each_online_cpu(cpu) {
                struct cpuinfo_x86 *c = &cpu_data(cpu);

                if (c->x86_cache_max_rmid < cqm_max_rmid)
                        cqm_max_rmid = c->x86_cache_max_rmid;

                if (c->x86_cache_occ_scale != cqm_l3_scale) {
                        pr_err("Multiple LLC scale values, disabling\n");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        __intel_cqm_max_threshold =
                boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

        snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
        str = kstrdup(scale, GFP_KERNEL);
        if (!str) {
                ret = -ENOMEM;
                goto out;
        }

        event_attr_intel_cqm_llc_scale.event_str = str;

        ret = intel_cqm_setup_rmid_cache();
        if (ret)
                goto out;

        for_each_online_cpu(i) {
                intel_cqm_cpu_prepare(i);
                cqm_pick_event_reader(i);
        }

        __perf_cpu_notifier(intel_cqm_cpu_notifier);

        ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
        if (ret)
                pr_err("Intel CQM perf registration failed: %d\n", ret);
        else
                pr_info("Intel CQM monitoring enabled\n");

out:
        cpu_notifier_register_done();

        return ret;
}
device_initcall(intel_cqm_init);