1/* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ./hackbench 10 10 11 Time: 0.118 12 13 Performance counter stats for './hackbench 10': 14 15 1708.761321 task-clock # 11.037 CPUs utilized 16 41,190 context-switches # 0.024 M/sec 17 6,735 CPU-migrations # 0.004 M/sec 18 17,318 page-faults # 0.010 M/sec 19 5,205,202,243 cycles # 3.046 GHz 20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle 21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle 22 2,603,501,247 instructions # 0.50 insns per cycle 23 # 1.48 stalled cycles per insn 24 484,357,498 branches # 283.455 M/sec 25 6,388,934 branch-misses # 1.32% of all branches 26 27 0.154822978 seconds time elapsed 28 29 * 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com> 31 * 32 * Improvements and fixes by: 33 * 34 * Arjan van de Ven <arjan@linux.intel.com> 35 * Yanmin Zhang <yanmin.zhang@intel.com> 36 * Wu Fengguang <fengguang.wu@intel.com> 37 * Mike Galbraith <efault@gmx.de> 38 * Paul Mackerras <paulus@samba.org> 39 * Jaswinder Singh Rajput <jaswinder@kernel.org> 40 * 41 * Released under the GPL v2. (and only v2, not any later version) 42 */ 43 44#include "perf.h" 45#include "builtin.h" 46#include "util/cgroup.h" 47#include "util/util.h" 48#include "util/parse-options.h" 49#include "util/parse-events.h" 50#include "util/pmu.h" 51#include "util/event.h" 52#include "util/evlist.h" 53#include "util/evsel.h" 54#include "util/debug.h" 55#include "util/color.h" 56#include "util/stat.h" 57#include "util/header.h" 58#include "util/cpumap.h" 59#include "util/thread.h" 60#include "util/thread_map.h" 61 62#include <stdlib.h> 63#include <sys/prctl.h> 64#include <locale.h> 65 66#define DEFAULT_SEPARATOR " " 67#define CNTR_NOT_SUPPORTED "<not supported>" 68#define CNTR_NOT_COUNTED "<not counted>" 69 70static void print_stat(int argc, const char **argv); 71static void print_counter_aggr(struct perf_evsel *counter, char *prefix); 72static void print_counter(struct perf_evsel *counter, char *prefix); 73static void print_aggr(char *prefix); 74 75/* Default events used for perf stat -T */ 76static const char * const transaction_attrs[] = { 77 "task-clock", 78 "{" 79 "instructions," 80 "cycles," 81 "cpu/cycles-t/," 82 "cpu/tx-start/," 83 "cpu/el-start/," 84 "cpu/cycles-ct/" 85 "}" 86}; 87 88/* More limited version when the CPU does not have all events. */ 89static const char * const transaction_limited_attrs[] = { 90 "task-clock", 91 "{" 92 "instructions," 93 "cycles," 94 "cpu/cycles-t/," 95 "cpu/tx-start/" 96 "}" 97}; 98 99/* must match transaction_attrs and the beginning limited_attrs */ 100enum { 101 T_TASK_CLOCK, 102 T_INSTRUCTIONS, 103 T_CYCLES, 104 T_CYCLES_IN_TX, 105 T_TRANSACTION_START, 106 T_ELISION_START, 107 T_CYCLES_IN_TX_CP, 108}; 109 110static struct perf_evlist *evsel_list; 111 112static struct target target = { 113 .uid = UINT_MAX, 114}; 115 116enum aggr_mode { 117 AGGR_NONE, 118 AGGR_GLOBAL, 119 AGGR_SOCKET, 120 AGGR_CORE, 121}; 122 123static int run_count = 1; 124static bool no_inherit = false; 125static bool scale = true; 126static enum aggr_mode aggr_mode = AGGR_GLOBAL; 127static volatile pid_t child_pid = -1; 128static bool null_run = false; 129static int detailed_run = 0; 130static bool transaction_run; 131static bool big_num = true; 132static int big_num_opt = -1; 133static const char *csv_sep = NULL; 134static bool csv_output = false; 135static bool group = false; 136static FILE *output = NULL; 137static const char *pre_cmd = NULL; 138static const char *post_cmd = NULL; 139static bool sync_run = false; 140static unsigned int interval = 0; 141static unsigned int initial_delay = 0; 142static unsigned int unit_width = 4; /* strlen("unit") */ 143static bool forever = false; 144static struct timespec ref_time; 145static struct cpu_map *aggr_map; 146static int (*aggr_get_id)(struct cpu_map *m, int cpu); 147 148static volatile int done = 0; 149 150struct perf_stat { 151 struct stats res_stats[3]; 152}; 153 154static inline void diff_timespec(struct timespec *r, struct timespec *a, 155 struct timespec *b) 156{ 157 r->tv_sec = a->tv_sec - b->tv_sec; 158 if (a->tv_nsec < b->tv_nsec) { 159 r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec; 160 r->tv_sec--; 161 } else { 162 r->tv_nsec = a->tv_nsec - b->tv_nsec ; 163 } 164} 165 166static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel) 167{ 168 return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus; 169} 170 171static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel) 172{ 173 return perf_evsel__cpus(evsel)->nr; 174} 175 176static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) 177{ 178 int i; 179 struct perf_stat *ps = evsel->priv; 180 181 for (i = 0; i < 3; i++) 182 init_stats(&ps->res_stats[i]); 183} 184 185static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) 186{ 187 evsel->priv = zalloc(sizeof(struct perf_stat)); 188 if (evsel->priv == NULL) 189 return -ENOMEM; 190 perf_evsel__reset_stat_priv(evsel); 191 return 0; 192} 193 194static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) 195{ 196 zfree(&evsel->priv); 197} 198 199static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel) 200{ 201 void *addr; 202 size_t sz; 203 204 sz = sizeof(*evsel->counts) + 205 (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values)); 206 207 addr = zalloc(sz); 208 if (!addr) 209 return -ENOMEM; 210 211 evsel->prev_raw_counts = addr; 212 213 return 0; 214} 215 216static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel) 217{ 218 zfree(&evsel->prev_raw_counts); 219} 220 221static void perf_evlist__free_stats(struct perf_evlist *evlist) 222{ 223 struct perf_evsel *evsel; 224 225 evlist__for_each(evlist, evsel) { 226 perf_evsel__free_stat_priv(evsel); 227 perf_evsel__free_counts(evsel); 228 perf_evsel__free_prev_raw_counts(evsel); 229 } 230} 231 232static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw) 233{ 234 struct perf_evsel *evsel; 235 236 evlist__for_each(evlist, evsel) { 237 if (perf_evsel__alloc_stat_priv(evsel) < 0 || 238 perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 || 239 (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0)) 240 goto out_free; 241 } 242 243 return 0; 244 245out_free: 246 perf_evlist__free_stats(evlist); 247 return -1; 248} 249 250static struct stats runtime_nsecs_stats[MAX_NR_CPUS]; 251static struct stats runtime_cycles_stats[MAX_NR_CPUS]; 252static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; 253static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; 254static struct stats runtime_branches_stats[MAX_NR_CPUS]; 255static struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; 256static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; 257static struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; 258static struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; 259static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; 260static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; 261static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS]; 262static struct stats walltime_nsecs_stats; 263static struct stats runtime_transaction_stats[MAX_NR_CPUS]; 264static struct stats runtime_elision_stats[MAX_NR_CPUS]; 265 266static void perf_stat__reset_stats(struct perf_evlist *evlist) 267{ 268 struct perf_evsel *evsel; 269 270 evlist__for_each(evlist, evsel) { 271 perf_evsel__reset_stat_priv(evsel); 272 perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel)); 273 } 274 275 memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats)); 276 memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats)); 277 memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats)); 278 memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats)); 279 memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats)); 280 memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats)); 281 memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats)); 282 memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats)); 283 memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats)); 284 memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats)); 285 memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats)); 286 memset(runtime_cycles_in_tx_stats, 0, 287 sizeof(runtime_cycles_in_tx_stats)); 288 memset(runtime_transaction_stats, 0, 289 sizeof(runtime_transaction_stats)); 290 memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats)); 291 memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats)); 292} 293 294static int create_perf_stat_counter(struct perf_evsel *evsel) 295{ 296 struct perf_event_attr *attr = &evsel->attr; 297 298 if (scale) 299 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 300 PERF_FORMAT_TOTAL_TIME_RUNNING; 301 302 attr->inherit = !no_inherit; 303 304 if (target__has_cpu(&target)) 305 return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); 306 307 if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) { 308 attr->disabled = 1; 309 if (!initial_delay) 310 attr->enable_on_exec = 1; 311 } 312 313 return perf_evsel__open_per_thread(evsel, evsel_list->threads); 314} 315 316/* 317 * Does the counter have nsecs as a unit? 318 */ 319static inline int nsec_counter(struct perf_evsel *evsel) 320{ 321 if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || 322 perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 323 return 1; 324 325 return 0; 326} 327 328static struct perf_evsel *nth_evsel(int n) 329{ 330 static struct perf_evsel **array; 331 static int array_len; 332 struct perf_evsel *ev; 333 int j; 334 335 /* Assumes this only called when evsel_list does not change anymore. */ 336 if (!array) { 337 evlist__for_each(evsel_list, ev) 338 array_len++; 339 array = malloc(array_len * sizeof(void *)); 340 if (!array) 341 exit(ENOMEM); 342 j = 0; 343 evlist__for_each(evsel_list, ev) 344 array[j++] = ev; 345 } 346 if (n < array_len) 347 return array[n]; 348 return NULL; 349} 350 351/* 352 * Update various tracking values we maintain to print 353 * more semantic information such as miss/hit ratios, 354 * instruction rates, etc: 355 */ 356static void update_shadow_stats(struct perf_evsel *counter, u64 *count, 357 int cpu) 358{ 359 if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) 360 update_stats(&runtime_nsecs_stats[cpu], count[0]); 361 else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 362 update_stats(&runtime_cycles_stats[cpu], count[0]); 363 else if (transaction_run && 364 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX))) 365 update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]); 366 else if (transaction_run && 367 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START))) 368 update_stats(&runtime_transaction_stats[cpu], count[0]); 369 else if (transaction_run && 370 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START))) 371 update_stats(&runtime_elision_stats[cpu], count[0]); 372 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 373 update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]); 374 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 375 update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]); 376 else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 377 update_stats(&runtime_branches_stats[cpu], count[0]); 378 else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 379 update_stats(&runtime_cacherefs_stats[cpu], count[0]); 380 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 381 update_stats(&runtime_l1_dcache_stats[cpu], count[0]); 382 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 383 update_stats(&runtime_l1_icache_stats[cpu], count[0]); 384 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 385 update_stats(&runtime_ll_cache_stats[cpu], count[0]); 386 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 387 update_stats(&runtime_dtlb_cache_stats[cpu], count[0]); 388 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 389 update_stats(&runtime_itlb_cache_stats[cpu], count[0]); 390} 391 392static void zero_per_pkg(struct perf_evsel *counter) 393{ 394 if (counter->per_pkg_mask) 395 memset(counter->per_pkg_mask, 0, MAX_NR_CPUS); 396} 397 398static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip) 399{ 400 unsigned long *mask = counter->per_pkg_mask; 401 struct cpu_map *cpus = perf_evsel__cpus(counter); 402 int s; 403 404 *skip = false; 405 406 if (!counter->per_pkg) 407 return 0; 408 409 if (cpu_map__empty(cpus)) 410 return 0; 411 412 if (!mask) { 413 mask = zalloc(MAX_NR_CPUS); 414 if (!mask) 415 return -ENOMEM; 416 417 counter->per_pkg_mask = mask; 418 } 419 420 s = cpu_map__get_socket(cpus, cpu); 421 if (s < 0) 422 return -1; 423 424 *skip = test_and_set_bit(s, mask) == 1; 425 return 0; 426} 427 428static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused, 429 struct perf_counts_values *count) 430{ 431 struct perf_counts_values *aggr = &evsel->counts->aggr; 432 static struct perf_counts_values zero; 433 bool skip = false; 434 435 if (check_per_pkg(evsel, cpu, &skip)) { 436 pr_err("failed to read per-pkg counter\n"); 437 return -1; 438 } 439 440 if (skip) 441 count = &zero; 442 443 switch (aggr_mode) { 444 case AGGR_CORE: 445 case AGGR_SOCKET: 446 case AGGR_NONE: 447 if (!evsel->snapshot) 448 perf_evsel__compute_deltas(evsel, cpu, count); 449 perf_counts_values__scale(count, scale, NULL); 450 evsel->counts->cpu[cpu] = *count; 451 if (aggr_mode == AGGR_NONE) 452 update_shadow_stats(evsel, count->values, cpu); 453 break; 454 case AGGR_GLOBAL: 455 aggr->val += count->val; 456 if (scale) { 457 aggr->ena += count->ena; 458 aggr->run += count->run; 459 } 460 default: 461 break; 462 } 463 464 return 0; 465} 466 467static int read_counter(struct perf_evsel *counter); 468 469/* 470 * Read out the results of a single counter: 471 * aggregate counts across CPUs in system-wide mode 472 */ 473static int read_counter_aggr(struct perf_evsel *counter) 474{ 475 struct perf_counts_values *aggr = &counter->counts->aggr; 476 struct perf_stat *ps = counter->priv; 477 u64 *count = counter->counts->aggr.values; 478 int i; 479 480 aggr->val = aggr->ena = aggr->run = 0; 481 482 if (read_counter(counter)) 483 return -1; 484 485 if (!counter->snapshot) 486 perf_evsel__compute_deltas(counter, -1, aggr); 487 perf_counts_values__scale(aggr, scale, &counter->counts->scaled); 488 489 for (i = 0; i < 3; i++) 490 update_stats(&ps->res_stats[i], count[i]); 491 492 if (verbose) { 493 fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", 494 perf_evsel__name(counter), count[0], count[1], count[2]); 495 } 496 497 /* 498 * Save the full runtime - to allow normalization during printout: 499 */ 500 update_shadow_stats(counter, count, 0); 501 502 return 0; 503} 504 505/* 506 * Read out the results of a single counter: 507 * do not aggregate counts across CPUs in system-wide mode 508 */ 509static int read_counter(struct perf_evsel *counter) 510{ 511 int nthreads = thread_map__nr(evsel_list->threads); 512 int ncpus = perf_evsel__nr_cpus(counter); 513 int cpu, thread; 514 515 if (!counter->supported) 516 return -ENOENT; 517 518 if (counter->system_wide) 519 nthreads = 1; 520 521 if (counter->per_pkg) 522 zero_per_pkg(counter); 523 524 for (thread = 0; thread < nthreads; thread++) { 525 for (cpu = 0; cpu < ncpus; cpu++) { 526 if (perf_evsel__read_cb(counter, cpu, thread, read_cb)) 527 return -1; 528 } 529 } 530 531 return 0; 532} 533 534static void print_interval(void) 535{ 536 static int num_print_interval; 537 struct perf_evsel *counter; 538 struct perf_stat *ps; 539 struct timespec ts, rs; 540 char prefix[64]; 541 542 if (aggr_mode == AGGR_GLOBAL) { 543 evlist__for_each(evsel_list, counter) { 544 ps = counter->priv; 545 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 546 read_counter_aggr(counter); 547 } 548 } else { 549 evlist__for_each(evsel_list, counter) { 550 ps = counter->priv; 551 memset(ps->res_stats, 0, sizeof(ps->res_stats)); 552 read_counter(counter); 553 } 554 } 555 556 clock_gettime(CLOCK_MONOTONIC, &ts); 557 diff_timespec(&rs, &ts, &ref_time); 558 sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep); 559 560 if (num_print_interval == 0 && !csv_output) { 561 switch (aggr_mode) { 562 case AGGR_SOCKET: 563 fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); 564 break; 565 case AGGR_CORE: 566 fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit"); 567 break; 568 case AGGR_NONE: 569 fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit"); 570 break; 571 case AGGR_GLOBAL: 572 default: 573 fprintf(output, "# time counts %*s events\n", unit_width, "unit"); 574 } 575 } 576 577 if (++num_print_interval == 25) 578 num_print_interval = 0; 579 580 switch (aggr_mode) { 581 case AGGR_CORE: 582 case AGGR_SOCKET: 583 print_aggr(prefix); 584 break; 585 case AGGR_NONE: 586 evlist__for_each(evsel_list, counter) 587 print_counter(counter, prefix); 588 break; 589 case AGGR_GLOBAL: 590 default: 591 evlist__for_each(evsel_list, counter) 592 print_counter_aggr(counter, prefix); 593 } 594 595 fflush(output); 596} 597 598static void handle_initial_delay(void) 599{ 600 struct perf_evsel *counter; 601 602 if (initial_delay) { 603 const int ncpus = cpu_map__nr(evsel_list->cpus), 604 nthreads = thread_map__nr(evsel_list->threads); 605 606 usleep(initial_delay * 1000); 607 evlist__for_each(evsel_list, counter) 608 perf_evsel__enable(counter, ncpus, nthreads); 609 } 610} 611 612static volatile int workload_exec_errno; 613 614/* 615 * perf_evlist__prepare_workload will send a SIGUSR1 616 * if the fork fails, since we asked by setting its 617 * want_signal to true. 618 */ 619static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *info, 620 void *ucontext __maybe_unused) 621{ 622 workload_exec_errno = info->si_value.sival_int; 623} 624 625static int __run_perf_stat(int argc, const char **argv) 626{ 627 char msg[512]; 628 unsigned long long t0, t1; 629 struct perf_evsel *counter; 630 struct timespec ts; 631 size_t l; 632 int status = 0; 633 const bool forks = (argc > 0); 634 635 if (interval) { 636 ts.tv_sec = interval / 1000; 637 ts.tv_nsec = (interval % 1000) * 1000000; 638 } else { 639 ts.tv_sec = 1; 640 ts.tv_nsec = 0; 641 } 642 643 if (forks) { 644 if (perf_evlist__prepare_workload(evsel_list, &target, argv, false, 645 workload_exec_failed_signal) < 0) { 646 perror("failed to prepare workload"); 647 return -1; 648 } 649 child_pid = evsel_list->workload.pid; 650 } 651 652 if (group) 653 perf_evlist__set_leader(evsel_list); 654 655 evlist__for_each(evsel_list, counter) { 656 if (create_perf_stat_counter(counter) < 0) { 657 /* 658 * PPC returns ENXIO for HW counters until 2.6.37 659 * (behavior changed with commit b0a873e). 660 */ 661 if (errno == EINVAL || errno == ENOSYS || 662 errno == ENOENT || errno == EOPNOTSUPP || 663 errno == ENXIO) { 664 if (verbose) 665 ui__warning("%s event is not supported by the kernel.\n", 666 perf_evsel__name(counter)); 667 counter->supported = false; 668 continue; 669 } 670 671 perf_evsel__open_strerror(counter, &target, 672 errno, msg, sizeof(msg)); 673 ui__error("%s\n", msg); 674 675 if (child_pid != -1) 676 kill(child_pid, SIGTERM); 677 678 return -1; 679 } 680 counter->supported = true; 681 682 l = strlen(counter->unit); 683 if (l > unit_width) 684 unit_width = l; 685 } 686 687 if (perf_evlist__apply_filters(evsel_list, &counter)) { 688 error("failed to set filter \"%s\" on event %s with %d (%s)\n", 689 counter->filter, perf_evsel__name(counter), errno, 690 strerror_r(errno, msg, sizeof(msg))); 691 return -1; 692 } 693 694 /* 695 * Enable counters and exec the command: 696 */ 697 t0 = rdclock(); 698 clock_gettime(CLOCK_MONOTONIC, &ref_time); 699 700 if (forks) { 701 perf_evlist__start_workload(evsel_list); 702 handle_initial_delay(); 703 704 if (interval) { 705 while (!waitpid(child_pid, &status, WNOHANG)) { 706 nanosleep(&ts, NULL); 707 print_interval(); 708 } 709 } 710 wait(&status); 711 712 if (workload_exec_errno) { 713 const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg)); 714 pr_err("Workload failed: %s\n", emsg); 715 return -1; 716 } 717 718 if (WIFSIGNALED(status)) 719 psignal(WTERMSIG(status), argv[0]); 720 } else { 721 handle_initial_delay(); 722 while (!done) { 723 nanosleep(&ts, NULL); 724 if (interval) 725 print_interval(); 726 } 727 } 728 729 t1 = rdclock(); 730 731 update_stats(&walltime_nsecs_stats, t1 - t0); 732 733 if (aggr_mode == AGGR_GLOBAL) { 734 evlist__for_each(evsel_list, counter) { 735 read_counter_aggr(counter); 736 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 737 thread_map__nr(evsel_list->threads)); 738 } 739 } else { 740 evlist__for_each(evsel_list, counter) { 741 read_counter(counter); 742 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1); 743 } 744 } 745 746 return WEXITSTATUS(status); 747} 748 749static int run_perf_stat(int argc, const char **argv) 750{ 751 int ret; 752 753 if (pre_cmd) { 754 ret = system(pre_cmd); 755 if (ret) 756 return ret; 757 } 758 759 if (sync_run) 760 sync(); 761 762 ret = __run_perf_stat(argc, argv); 763 if (ret) 764 return ret; 765 766 if (post_cmd) { 767 ret = system(post_cmd); 768 if (ret) 769 return ret; 770 } 771 772 return ret; 773} 774 775static void print_running(u64 run, u64 ena) 776{ 777 if (csv_output) { 778 fprintf(output, "%s%" PRIu64 "%s%.2f", 779 csv_sep, 780 run, 781 csv_sep, 782 ena ? 100.0 * run / ena : 100.0); 783 } else if (run != ena) { 784 fprintf(output, " (%.2f%%)", 100.0 * run / ena); 785 } 786} 787 788static void print_noise_pct(double total, double avg) 789{ 790 double pct = rel_stddev_stats(total, avg); 791 792 if (csv_output) 793 fprintf(output, "%s%.2f%%", csv_sep, pct); 794 else if (pct) 795 fprintf(output, " ( +-%6.2f%% )", pct); 796} 797 798static void print_noise(struct perf_evsel *evsel, double avg) 799{ 800 struct perf_stat *ps; 801 802 if (run_count == 1) 803 return; 804 805 ps = evsel->priv; 806 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 807} 808 809static void aggr_printout(struct perf_evsel *evsel, int id, int nr) 810{ 811 switch (aggr_mode) { 812 case AGGR_CORE: 813 fprintf(output, "S%d-C%*d%s%*d%s", 814 cpu_map__id_to_socket(id), 815 csv_output ? 0 : -8, 816 cpu_map__id_to_cpu(id), 817 csv_sep, 818 csv_output ? 0 : 4, 819 nr, 820 csv_sep); 821 break; 822 case AGGR_SOCKET: 823 fprintf(output, "S%*d%s%*d%s", 824 csv_output ? 0 : -5, 825 id, 826 csv_sep, 827 csv_output ? 0 : 4, 828 nr, 829 csv_sep); 830 break; 831 case AGGR_NONE: 832 fprintf(output, "CPU%*d%s", 833 csv_output ? 0 : -4, 834 perf_evsel__cpus(evsel)->map[id], csv_sep); 835 break; 836 case AGGR_GLOBAL: 837 default: 838 break; 839 } 840} 841 842static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) 843{ 844 double msecs = avg / 1e6; 845 const char *fmt_v, *fmt_n; 846 char name[25]; 847 848 fmt_v = csv_output ? "%.6f%s" : "%18.6f%s"; 849 fmt_n = csv_output ? "%s" : "%-25s"; 850 851 aggr_printout(evsel, id, nr); 852 853 scnprintf(name, sizeof(name), "%s%s", 854 perf_evsel__name(evsel), csv_output ? "" : " (msec)"); 855 856 fprintf(output, fmt_v, msecs, csv_sep); 857 858 if (csv_output) 859 fprintf(output, "%s%s", evsel->unit, csv_sep); 860 else 861 fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep); 862 863 fprintf(output, fmt_n, name); 864 865 if (evsel->cgrp) 866 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 867 868 if (csv_output || interval) 869 return; 870 871 if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 872 fprintf(output, " # %8.3f CPUs utilized ", 873 avg / avg_stats(&walltime_nsecs_stats)); 874 else 875 fprintf(output, " "); 876} 877 878/* used for get_ratio_color() */ 879enum grc_type { 880 GRC_STALLED_CYCLES_FE, 881 GRC_STALLED_CYCLES_BE, 882 GRC_CACHE_MISSES, 883 GRC_MAX_NR 884}; 885 886static const char *get_ratio_color(enum grc_type type, double ratio) 887{ 888 static const double grc_table[GRC_MAX_NR][3] = { 889 [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, 890 [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, 891 [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, 892 }; 893 const char *color = PERF_COLOR_NORMAL; 894 895 if (ratio > grc_table[type][0]) 896 color = PERF_COLOR_RED; 897 else if (ratio > grc_table[type][1]) 898 color = PERF_COLOR_MAGENTA; 899 else if (ratio > grc_table[type][2]) 900 color = PERF_COLOR_YELLOW; 901 902 return color; 903} 904 905static void print_stalled_cycles_frontend(int cpu, 906 struct perf_evsel *evsel 907 __maybe_unused, double avg) 908{ 909 double total, ratio = 0.0; 910 const char *color; 911 912 total = avg_stats(&runtime_cycles_stats[cpu]); 913 914 if (total) 915 ratio = avg / total * 100.0; 916 917 color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); 918 919 fprintf(output, " # "); 920 color_fprintf(output, color, "%6.2f%%", ratio); 921 fprintf(output, " frontend cycles idle "); 922} 923 924static void print_stalled_cycles_backend(int cpu, 925 struct perf_evsel *evsel 926 __maybe_unused, double avg) 927{ 928 double total, ratio = 0.0; 929 const char *color; 930 931 total = avg_stats(&runtime_cycles_stats[cpu]); 932 933 if (total) 934 ratio = avg / total * 100.0; 935 936 color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); 937 938 fprintf(output, " # "); 939 color_fprintf(output, color, "%6.2f%%", ratio); 940 fprintf(output, " backend cycles idle "); 941} 942 943static void print_branch_misses(int cpu, 944 struct perf_evsel *evsel __maybe_unused, 945 double avg) 946{ 947 double total, ratio = 0.0; 948 const char *color; 949 950 total = avg_stats(&runtime_branches_stats[cpu]); 951 952 if (total) 953 ratio = avg / total * 100.0; 954 955 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 956 957 fprintf(output, " # "); 958 color_fprintf(output, color, "%6.2f%%", ratio); 959 fprintf(output, " of all branches "); 960} 961 962static void print_l1_dcache_misses(int cpu, 963 struct perf_evsel *evsel __maybe_unused, 964 double avg) 965{ 966 double total, ratio = 0.0; 967 const char *color; 968 969 total = avg_stats(&runtime_l1_dcache_stats[cpu]); 970 971 if (total) 972 ratio = avg / total * 100.0; 973 974 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 975 976 fprintf(output, " # "); 977 color_fprintf(output, color, "%6.2f%%", ratio); 978 fprintf(output, " of all L1-dcache hits "); 979} 980 981static void print_l1_icache_misses(int cpu, 982 struct perf_evsel *evsel __maybe_unused, 983 double avg) 984{ 985 double total, ratio = 0.0; 986 const char *color; 987 988 total = avg_stats(&runtime_l1_icache_stats[cpu]); 989 990 if (total) 991 ratio = avg / total * 100.0; 992 993 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 994 995 fprintf(output, " # "); 996 color_fprintf(output, color, "%6.2f%%", ratio); 997 fprintf(output, " of all L1-icache hits "); 998} 999 1000static void print_dtlb_cache_misses(int cpu, 1001 struct perf_evsel *evsel __maybe_unused, 1002 double avg) 1003{ 1004 double total, ratio = 0.0; 1005 const char *color; 1006 1007 total = avg_stats(&runtime_dtlb_cache_stats[cpu]); 1008 1009 if (total) 1010 ratio = avg / total * 100.0; 1011 1012 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 1013 1014 fprintf(output, " # "); 1015 color_fprintf(output, color, "%6.2f%%", ratio); 1016 fprintf(output, " of all dTLB cache hits "); 1017} 1018 1019static void print_itlb_cache_misses(int cpu, 1020 struct perf_evsel *evsel __maybe_unused, 1021 double avg) 1022{ 1023 double total, ratio = 0.0; 1024 const char *color; 1025 1026 total = avg_stats(&runtime_itlb_cache_stats[cpu]); 1027 1028 if (total) 1029 ratio = avg / total * 100.0; 1030 1031 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 1032 1033 fprintf(output, " # "); 1034 color_fprintf(output, color, "%6.2f%%", ratio); 1035 fprintf(output, " of all iTLB cache hits "); 1036} 1037 1038static void print_ll_cache_misses(int cpu, 1039 struct perf_evsel *evsel __maybe_unused, 1040 double avg) 1041{ 1042 double total, ratio = 0.0; 1043 const char *color; 1044 1045 total = avg_stats(&runtime_ll_cache_stats[cpu]); 1046 1047 if (total) 1048 ratio = avg / total * 100.0; 1049 1050 color = get_ratio_color(GRC_CACHE_MISSES, ratio); 1051 1052 fprintf(output, " # "); 1053 color_fprintf(output, color, "%6.2f%%", ratio); 1054 fprintf(output, " of all LL-cache hits "); 1055} 1056 1057static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) 1058{ 1059 double total, ratio = 0.0, total2; 1060 double sc = evsel->scale; 1061 const char *fmt; 1062 int cpu = cpu_map__id_to_cpu(id); 1063 1064 if (csv_output) { 1065 fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s"; 1066 } else { 1067 if (big_num) 1068 fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s"; 1069 else 1070 fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s"; 1071 } 1072 1073 aggr_printout(evsel, id, nr); 1074 1075 if (aggr_mode == AGGR_GLOBAL) 1076 cpu = 0; 1077 1078 fprintf(output, fmt, avg, csv_sep); 1079 1080 if (evsel->unit) 1081 fprintf(output, "%-*s%s", 1082 csv_output ? 0 : unit_width, 1083 evsel->unit, csv_sep); 1084 1085 fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel)); 1086 1087 if (evsel->cgrp) 1088 fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); 1089 1090 if (csv_output || interval) 1091 return; 1092 1093 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 1094 total = avg_stats(&runtime_cycles_stats[cpu]); 1095 if (total) { 1096 ratio = avg / total; 1097 fprintf(output, " # %5.2f insns per cycle ", ratio); 1098 } else { 1099 fprintf(output, " "); 1100 } 1101 total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); 1102 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); 1103 1104 if (total && avg) { 1105 ratio = total / avg; 1106 fprintf(output, "\n"); 1107 if (aggr_mode == AGGR_NONE) 1108 fprintf(output, " "); 1109 fprintf(output, " # %5.2f stalled cycles per insn", ratio); 1110 } 1111 1112 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && 1113 runtime_branches_stats[cpu].n != 0) { 1114 print_branch_misses(cpu, evsel, avg); 1115 } else if ( 1116 evsel->attr.type == PERF_TYPE_HW_CACHE && 1117 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | 1118 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1119 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1120 runtime_l1_dcache_stats[cpu].n != 0) { 1121 print_l1_dcache_misses(cpu, evsel, avg); 1122 } else if ( 1123 evsel->attr.type == PERF_TYPE_HW_CACHE && 1124 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | 1125 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1126 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1127 runtime_l1_icache_stats[cpu].n != 0) { 1128 print_l1_icache_misses(cpu, evsel, avg); 1129 } else if ( 1130 evsel->attr.type == PERF_TYPE_HW_CACHE && 1131 evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | 1132 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1133 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1134 runtime_dtlb_cache_stats[cpu].n != 0) { 1135 print_dtlb_cache_misses(cpu, evsel, avg); 1136 } else if ( 1137 evsel->attr.type == PERF_TYPE_HW_CACHE && 1138 evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | 1139 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1140 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1141 runtime_itlb_cache_stats[cpu].n != 0) { 1142 print_itlb_cache_misses(cpu, evsel, avg); 1143 } else if ( 1144 evsel->attr.type == PERF_TYPE_HW_CACHE && 1145 evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | 1146 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 1147 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 1148 runtime_ll_cache_stats[cpu].n != 0) { 1149 print_ll_cache_misses(cpu, evsel, avg); 1150 } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && 1151 runtime_cacherefs_stats[cpu].n != 0) { 1152 total = avg_stats(&runtime_cacherefs_stats[cpu]); 1153 1154 if (total) 1155 ratio = avg * 100 / total; 1156 1157 fprintf(output, " # %8.3f %% of all cache refs ", ratio); 1158 1159 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { 1160 print_stalled_cycles_frontend(cpu, evsel, avg); 1161 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { 1162 print_stalled_cycles_backend(cpu, evsel, avg); 1163 } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { 1164 total = avg_stats(&runtime_nsecs_stats[cpu]); 1165 1166 if (total) { 1167 ratio = avg / total; 1168 fprintf(output, " # %8.3f GHz ", ratio); 1169 } else { 1170 fprintf(output, " "); 1171 } 1172 } else if (transaction_run && 1173 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) { 1174 total = avg_stats(&runtime_cycles_stats[cpu]); 1175 if (total) 1176 fprintf(output, 1177 " # %5.2f%% transactional cycles ", 1178 100.0 * (avg / total)); 1179 } else if (transaction_run && 1180 perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) { 1181 total = avg_stats(&runtime_cycles_stats[cpu]); 1182 total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1183 if (total2 < avg) 1184 total2 = avg; 1185 if (total) 1186 fprintf(output, 1187 " # %5.2f%% aborted cycles ", 1188 100.0 * ((total2-avg) / total)); 1189 } else if (transaction_run && 1190 perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) && 1191 avg > 0 && 1192 runtime_cycles_in_tx_stats[cpu].n != 0) { 1193 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1194 1195 if (total) 1196 ratio = total / avg; 1197 1198 fprintf(output, " # %8.0f cycles / transaction ", ratio); 1199 } else if (transaction_run && 1200 perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) && 1201 avg > 0 && 1202 runtime_cycles_in_tx_stats[cpu].n != 0) { 1203 total = avg_stats(&runtime_cycles_in_tx_stats[cpu]); 1204 1205 if (total) 1206 ratio = total / avg; 1207 1208 fprintf(output, " # %8.0f cycles / elision ", ratio); 1209 } else if (runtime_nsecs_stats[cpu].n != 0) { 1210 char unit = 'M'; 1211 1212 total = avg_stats(&runtime_nsecs_stats[cpu]); 1213 1214 if (total) 1215 ratio = 1000.0 * avg / total; 1216 if (ratio < 0.001) { 1217 ratio *= 1000; 1218 unit = 'K'; 1219 } 1220 1221 fprintf(output, " # %8.3f %c/sec ", ratio, unit); 1222 } else { 1223 fprintf(output, " "); 1224 } 1225} 1226 1227static void print_aggr(char *prefix) 1228{ 1229 struct perf_evsel *counter; 1230 int cpu, s, s2, id, nr; 1231 double uval; 1232 u64 ena, run, val; 1233 1234 if (!(aggr_map || aggr_get_id)) 1235 return; 1236 1237 for (s = 0; s < aggr_map->nr; s++) { 1238 id = aggr_map->map[s]; 1239 evlist__for_each(evsel_list, counter) { 1240 val = ena = run = 0; 1241 nr = 0; 1242 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1243 s2 = aggr_get_id(perf_evsel__cpus(counter), cpu); 1244 if (s2 != id) 1245 continue; 1246 val += counter->counts->cpu[cpu].val; 1247 ena += counter->counts->cpu[cpu].ena; 1248 run += counter->counts->cpu[cpu].run; 1249 nr++; 1250 } 1251 if (prefix) 1252 fprintf(output, "%s", prefix); 1253 1254 if (run == 0 || ena == 0) { 1255 aggr_printout(counter, id, nr); 1256 1257 fprintf(output, "%*s%s", 1258 csv_output ? 0 : 18, 1259 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1260 csv_sep); 1261 1262 fprintf(output, "%-*s%s", 1263 csv_output ? 0 : unit_width, 1264 counter->unit, csv_sep); 1265 1266 fprintf(output, "%*s", 1267 csv_output ? 0 : -25, 1268 perf_evsel__name(counter)); 1269 1270 if (counter->cgrp) 1271 fprintf(output, "%s%s", 1272 csv_sep, counter->cgrp->name); 1273 1274 print_running(run, ena); 1275 fputc('\n', output); 1276 continue; 1277 } 1278 uval = val * counter->scale; 1279 1280 if (nsec_counter(counter)) 1281 nsec_printout(id, nr, counter, uval); 1282 else 1283 abs_printout(id, nr, counter, uval); 1284 1285 if (!csv_output) 1286 print_noise(counter, 1.0); 1287 1288 print_running(run, ena); 1289 fputc('\n', output); 1290 } 1291 } 1292} 1293 1294/* 1295 * Print out the results of a single counter: 1296 * aggregated counts in system-wide mode 1297 */ 1298static void print_counter_aggr(struct perf_evsel *counter, char *prefix) 1299{ 1300 struct perf_stat *ps = counter->priv; 1301 double avg = avg_stats(&ps->res_stats[0]); 1302 int scaled = counter->counts->scaled; 1303 double uval; 1304 double avg_enabled, avg_running; 1305 1306 avg_enabled = avg_stats(&ps->res_stats[1]); 1307 avg_running = avg_stats(&ps->res_stats[2]); 1308 1309 if (prefix) 1310 fprintf(output, "%s", prefix); 1311 1312 if (scaled == -1 || !counter->supported) { 1313 fprintf(output, "%*s%s", 1314 csv_output ? 0 : 18, 1315 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1316 csv_sep); 1317 fprintf(output, "%-*s%s", 1318 csv_output ? 0 : unit_width, 1319 counter->unit, csv_sep); 1320 fprintf(output, "%*s", 1321 csv_output ? 0 : -25, 1322 perf_evsel__name(counter)); 1323 1324 if (counter->cgrp) 1325 fprintf(output, "%s%s", csv_sep, counter->cgrp->name); 1326 1327 print_running(avg_running, avg_enabled); 1328 fputc('\n', output); 1329 return; 1330 } 1331 1332 uval = avg * counter->scale; 1333 1334 if (nsec_counter(counter)) 1335 nsec_printout(-1, 0, counter, uval); 1336 else 1337 abs_printout(-1, 0, counter, uval); 1338 1339 print_noise(counter, avg); 1340 1341 print_running(avg_running, avg_enabled); 1342 fprintf(output, "\n"); 1343} 1344 1345/* 1346 * Print out the results of a single counter: 1347 * does not use aggregated count in system-wide 1348 */ 1349static void print_counter(struct perf_evsel *counter, char *prefix) 1350{ 1351 u64 ena, run, val; 1352 double uval; 1353 int cpu; 1354 1355 for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { 1356 val = counter->counts->cpu[cpu].val; 1357 ena = counter->counts->cpu[cpu].ena; 1358 run = counter->counts->cpu[cpu].run; 1359 1360 if (prefix) 1361 fprintf(output, "%s", prefix); 1362 1363 if (run == 0 || ena == 0) { 1364 fprintf(output, "CPU%*d%s%*s%s", 1365 csv_output ? 0 : -4, 1366 perf_evsel__cpus(counter)->map[cpu], csv_sep, 1367 csv_output ? 0 : 18, 1368 counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, 1369 csv_sep); 1370 1371 fprintf(output, "%-*s%s", 1372 csv_output ? 0 : unit_width, 1373 counter->unit, csv_sep); 1374 1375 fprintf(output, "%*s", 1376 csv_output ? 0 : -25, 1377 perf_evsel__name(counter)); 1378 1379 if (counter->cgrp) 1380 fprintf(output, "%s%s", 1381 csv_sep, counter->cgrp->name); 1382 1383 print_running(run, ena); 1384 fputc('\n', output); 1385 continue; 1386 } 1387 1388 uval = val * counter->scale; 1389 1390 if (nsec_counter(counter)) 1391 nsec_printout(cpu, 0, counter, uval); 1392 else 1393 abs_printout(cpu, 0, counter, uval); 1394 1395 if (!csv_output) 1396 print_noise(counter, 1.0); 1397 print_running(run, ena); 1398 1399 fputc('\n', output); 1400 } 1401} 1402 1403static void print_stat(int argc, const char **argv) 1404{ 1405 struct perf_evsel *counter; 1406 int i; 1407 1408 fflush(stdout); 1409 1410 if (!csv_output) { 1411 fprintf(output, "\n"); 1412 fprintf(output, " Performance counter stats for "); 1413 if (target.system_wide) 1414 fprintf(output, "\'system wide"); 1415 else if (target.cpu_list) 1416 fprintf(output, "\'CPU(s) %s", target.cpu_list); 1417 else if (!target__has_task(&target)) { 1418 fprintf(output, "\'%s", argv[0]); 1419 for (i = 1; i < argc; i++) 1420 fprintf(output, " %s", argv[i]); 1421 } else if (target.pid) 1422 fprintf(output, "process id \'%s", target.pid); 1423 else 1424 fprintf(output, "thread id \'%s", target.tid); 1425 1426 fprintf(output, "\'"); 1427 if (run_count > 1) 1428 fprintf(output, " (%d runs)", run_count); 1429 fprintf(output, ":\n\n"); 1430 } 1431 1432 switch (aggr_mode) { 1433 case AGGR_CORE: 1434 case AGGR_SOCKET: 1435 print_aggr(NULL); 1436 break; 1437 case AGGR_GLOBAL: 1438 evlist__for_each(evsel_list, counter) 1439 print_counter_aggr(counter, NULL); 1440 break; 1441 case AGGR_NONE: 1442 evlist__for_each(evsel_list, counter) 1443 print_counter(counter, NULL); 1444 break; 1445 default: 1446 break; 1447 } 1448 1449 if (!csv_output) { 1450 if (!null_run) 1451 fprintf(output, "\n"); 1452 fprintf(output, " %17.9f seconds time elapsed", 1453 avg_stats(&walltime_nsecs_stats)/1e9); 1454 if (run_count > 1) { 1455 fprintf(output, " "); 1456 print_noise_pct(stddev_stats(&walltime_nsecs_stats), 1457 avg_stats(&walltime_nsecs_stats)); 1458 } 1459 fprintf(output, "\n\n"); 1460 } 1461} 1462 1463static volatile int signr = -1; 1464 1465static void skip_signal(int signo) 1466{ 1467 if ((child_pid == -1) || interval) 1468 done = 1; 1469 1470 signr = signo; 1471 /* 1472 * render child_pid harmless 1473 * won't send SIGTERM to a random 1474 * process in case of race condition 1475 * and fast PID recycling 1476 */ 1477 child_pid = -1; 1478} 1479 1480static void sig_atexit(void) 1481{ 1482 sigset_t set, oset; 1483 1484 /* 1485 * avoid race condition with SIGCHLD handler 1486 * in skip_signal() which is modifying child_pid 1487 * goal is to avoid send SIGTERM to a random 1488 * process 1489 */ 1490 sigemptyset(&set); 1491 sigaddset(&set, SIGCHLD); 1492 sigprocmask(SIG_BLOCK, &set, &oset); 1493 1494 if (child_pid != -1) 1495 kill(child_pid, SIGTERM); 1496 1497 sigprocmask(SIG_SETMASK, &oset, NULL); 1498 1499 if (signr == -1) 1500 return; 1501 1502 signal(signr, SIG_DFL); 1503 kill(getpid(), signr); 1504} 1505 1506static int stat__set_big_num(const struct option *opt __maybe_unused, 1507 const char *s __maybe_unused, int unset) 1508{ 1509 big_num_opt = unset ? 0 : 1; 1510 return 0; 1511} 1512 1513static int perf_stat_init_aggr_mode(void) 1514{ 1515 switch (aggr_mode) { 1516 case AGGR_SOCKET: 1517 if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) { 1518 perror("cannot build socket map"); 1519 return -1; 1520 } 1521 aggr_get_id = cpu_map__get_socket; 1522 break; 1523 case AGGR_CORE: 1524 if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) { 1525 perror("cannot build core map"); 1526 return -1; 1527 } 1528 aggr_get_id = cpu_map__get_core; 1529 break; 1530 case AGGR_NONE: 1531 case AGGR_GLOBAL: 1532 default: 1533 break; 1534 } 1535 return 0; 1536} 1537 1538static int setup_events(const char * const *attrs, unsigned len) 1539{ 1540 unsigned i; 1541 1542 for (i = 0; i < len; i++) { 1543 if (parse_events(evsel_list, attrs[i])) 1544 return -1; 1545 } 1546 return 0; 1547} 1548 1549/* 1550 * Add default attributes, if there were no attributes specified or 1551 * if -d/--detailed, -d -d or -d -d -d is used: 1552 */ 1553static int add_default_attributes(void) 1554{ 1555 struct perf_event_attr default_attrs[] = { 1556 1557 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 1558 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, 1559 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 1560 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 1561 1562 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 1563 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, 1564 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 1565 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 1566 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 1567 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, 1568 1569}; 1570 1571/* 1572 * Detailed stats (-d), covering the L1 and last level data caches: 1573 */ 1574 struct perf_event_attr detailed_attrs[] = { 1575 1576 { .type = PERF_TYPE_HW_CACHE, 1577 .config = 1578 PERF_COUNT_HW_CACHE_L1D << 0 | 1579 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1580 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1581 1582 { .type = PERF_TYPE_HW_CACHE, 1583 .config = 1584 PERF_COUNT_HW_CACHE_L1D << 0 | 1585 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1586 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1587 1588 { .type = PERF_TYPE_HW_CACHE, 1589 .config = 1590 PERF_COUNT_HW_CACHE_LL << 0 | 1591 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1592 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1593 1594 { .type = PERF_TYPE_HW_CACHE, 1595 .config = 1596 PERF_COUNT_HW_CACHE_LL << 0 | 1597 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1598 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1599}; 1600 1601/* 1602 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: 1603 */ 1604 struct perf_event_attr very_detailed_attrs[] = { 1605 1606 { .type = PERF_TYPE_HW_CACHE, 1607 .config = 1608 PERF_COUNT_HW_CACHE_L1I << 0 | 1609 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1610 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1611 1612 { .type = PERF_TYPE_HW_CACHE, 1613 .config = 1614 PERF_COUNT_HW_CACHE_L1I << 0 | 1615 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1616 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1617 1618 { .type = PERF_TYPE_HW_CACHE, 1619 .config = 1620 PERF_COUNT_HW_CACHE_DTLB << 0 | 1621 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1622 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1623 1624 { .type = PERF_TYPE_HW_CACHE, 1625 .config = 1626 PERF_COUNT_HW_CACHE_DTLB << 0 | 1627 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1628 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1629 1630 { .type = PERF_TYPE_HW_CACHE, 1631 .config = 1632 PERF_COUNT_HW_CACHE_ITLB << 0 | 1633 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1634 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1635 1636 { .type = PERF_TYPE_HW_CACHE, 1637 .config = 1638 PERF_COUNT_HW_CACHE_ITLB << 0 | 1639 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 1640 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1641 1642}; 1643 1644/* 1645 * Very, very detailed stats (-d -d -d), adding prefetch events: 1646 */ 1647 struct perf_event_attr very_very_detailed_attrs[] = { 1648 1649 { .type = PERF_TYPE_HW_CACHE, 1650 .config = 1651 PERF_COUNT_HW_CACHE_L1D << 0 | 1652 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1653 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 1654 1655 { .type = PERF_TYPE_HW_CACHE, 1656 .config = 1657 PERF_COUNT_HW_CACHE_L1D << 0 | 1658 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 1659 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 1660}; 1661 1662 /* Set attrs if no event is selected and !null_run: */ 1663 if (null_run) 1664 return 0; 1665 1666 if (transaction_run) { 1667 int err; 1668 if (pmu_have_event("cpu", "cycles-ct") && 1669 pmu_have_event("cpu", "el-start")) 1670 err = setup_events(transaction_attrs, 1671 ARRAY_SIZE(transaction_attrs)); 1672 else 1673 err = setup_events(transaction_limited_attrs, 1674 ARRAY_SIZE(transaction_limited_attrs)); 1675 if (err < 0) { 1676 fprintf(stderr, "Cannot set up transaction events\n"); 1677 return -1; 1678 } 1679 return 0; 1680 } 1681 1682 if (!evsel_list->nr_entries) { 1683 if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) 1684 return -1; 1685 } 1686 1687 /* Detailed events get appended to the event list: */ 1688 1689 if (detailed_run < 1) 1690 return 0; 1691 1692 /* Append detailed run extra attributes: */ 1693 if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) 1694 return -1; 1695 1696 if (detailed_run < 2) 1697 return 0; 1698 1699 /* Append very detailed run extra attributes: */ 1700 if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) 1701 return -1; 1702 1703 if (detailed_run < 3) 1704 return 0; 1705 1706 /* Append very, very detailed run extra attributes: */ 1707 return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); 1708} 1709 1710int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) 1711{ 1712 bool append_file = false; 1713 int output_fd = 0; 1714 const char *output_name = NULL; 1715 const struct option options[] = { 1716 OPT_BOOLEAN('T', "transaction", &transaction_run, 1717 "hardware transaction statistics"), 1718 OPT_CALLBACK('e', "event", &evsel_list, "event", 1719 "event selector. use 'perf list' to list available events", 1720 parse_events_option), 1721 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 1722 "event filter", parse_filter), 1723 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 1724 "child tasks do not inherit counters"), 1725 OPT_STRING('p', "pid", &target.pid, "pid", 1726 "stat events on existing process id"), 1727 OPT_STRING('t', "tid", &target.tid, "tid", 1728 "stat events on existing thread id"), 1729 OPT_BOOLEAN('a', "all-cpus", &target.system_wide, 1730 "system-wide collection from all CPUs"), 1731 OPT_BOOLEAN('g', "group", &group, 1732 "put the counters into a counter group"), 1733 OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), 1734 OPT_INCR('v', "verbose", &verbose, 1735 "be more verbose (show counter open errors, etc)"), 1736 OPT_INTEGER('r', "repeat", &run_count, 1737 "repeat command and print average + stddev (max: 100, forever: 0)"), 1738 OPT_BOOLEAN('n', "null", &null_run, 1739 "null run - dont start any counters"), 1740 OPT_INCR('d', "detailed", &detailed_run, 1741 "detailed run - start a lot of events"), 1742 OPT_BOOLEAN('S', "sync", &sync_run, 1743 "call sync() before starting a run"), 1744 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 1745 "print large numbers with thousands\' separators", 1746 stat__set_big_num), 1747 OPT_STRING('C', "cpu", &target.cpu_list, "cpu", 1748 "list of cpus to monitor in system-wide"), 1749 OPT_SET_UINT('A', "no-aggr", &aggr_mode, 1750 "disable CPU count aggregation", AGGR_NONE), 1751 OPT_STRING('x', "field-separator", &csv_sep, "separator", 1752 "print counts with custom separator"), 1753 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 1754 "monitor event in cgroup name only", parse_cgroups), 1755 OPT_STRING('o', "output", &output_name, "file", "output file name"), 1756 OPT_BOOLEAN(0, "append", &append_file, "append to the output file"), 1757 OPT_INTEGER(0, "log-fd", &output_fd, 1758 "log output to fd, instead of stderr"), 1759 OPT_STRING(0, "pre", &pre_cmd, "command", 1760 "command to run prior to the measured command"), 1761 OPT_STRING(0, "post", &post_cmd, "command", 1762 "command to run after to the measured command"), 1763 OPT_UINTEGER('I', "interval-print", &interval, 1764 "print counts at regular interval in ms (>= 100)"), 1765 OPT_SET_UINT(0, "per-socket", &aggr_mode, 1766 "aggregate counts per processor socket", AGGR_SOCKET), 1767 OPT_SET_UINT(0, "per-core", &aggr_mode, 1768 "aggregate counts per physical processor core", AGGR_CORE), 1769 OPT_UINTEGER('D', "delay", &initial_delay, 1770 "ms to wait before starting measurement after program start"), 1771 OPT_END() 1772 }; 1773 const char * const stat_usage[] = { 1774 "perf stat [<options>] [<command>]", 1775 NULL 1776 }; 1777 int status = -EINVAL, run_idx; 1778 const char *mode; 1779 1780 setlocale(LC_ALL, ""); 1781 1782 evsel_list = perf_evlist__new(); 1783 if (evsel_list == NULL) 1784 return -ENOMEM; 1785 1786 argc = parse_options(argc, argv, options, stat_usage, 1787 PARSE_OPT_STOP_AT_NON_OPTION); 1788 1789 output = stderr; 1790 if (output_name && strcmp(output_name, "-")) 1791 output = NULL; 1792 1793 if (output_name && output_fd) { 1794 fprintf(stderr, "cannot use both --output and --log-fd\n"); 1795 parse_options_usage(stat_usage, options, "o", 1); 1796 parse_options_usage(NULL, options, "log-fd", 0); 1797 goto out; 1798 } 1799 1800 if (output_fd < 0) { 1801 fprintf(stderr, "argument to --log-fd must be a > 0\n"); 1802 parse_options_usage(stat_usage, options, "log-fd", 0); 1803 goto out; 1804 } 1805 1806 if (!output) { 1807 struct timespec tm; 1808 mode = append_file ? "a" : "w"; 1809 1810 output = fopen(output_name, mode); 1811 if (!output) { 1812 perror("failed to create output file"); 1813 return -1; 1814 } 1815 clock_gettime(CLOCK_REALTIME, &tm); 1816 fprintf(output, "# started on %s\n", ctime(&tm.tv_sec)); 1817 } else if (output_fd > 0) { 1818 mode = append_file ? "a" : "w"; 1819 output = fdopen(output_fd, mode); 1820 if (!output) { 1821 perror("Failed opening logfd"); 1822 return -errno; 1823 } 1824 } 1825 1826 if (csv_sep) { 1827 csv_output = true; 1828 if (!strcmp(csv_sep, "\\t")) 1829 csv_sep = "\t"; 1830 } else 1831 csv_sep = DEFAULT_SEPARATOR; 1832 1833 /* 1834 * let the spreadsheet do the pretty-printing 1835 */ 1836 if (csv_output) { 1837 /* User explicitly passed -B? */ 1838 if (big_num_opt == 1) { 1839 fprintf(stderr, "-B option not supported with -x\n"); 1840 parse_options_usage(stat_usage, options, "B", 1); 1841 parse_options_usage(NULL, options, "x", 1); 1842 goto out; 1843 } else /* Nope, so disable big number formatting */ 1844 big_num = false; 1845 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1846 big_num = false; 1847 1848 if (!argc && target__none(&target)) 1849 usage_with_options(stat_usage, options); 1850 1851 if (run_count < 0) { 1852 pr_err("Run count must be a positive number\n"); 1853 parse_options_usage(stat_usage, options, "r", 1); 1854 goto out; 1855 } else if (run_count == 0) { 1856 forever = true; 1857 run_count = 1; 1858 } 1859 1860 /* no_aggr, cgroup are for system-wide only */ 1861 if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) && 1862 !target__has_cpu(&target)) { 1863 fprintf(stderr, "both cgroup and no-aggregation " 1864 "modes only available in system-wide mode\n"); 1865 1866 parse_options_usage(stat_usage, options, "G", 1); 1867 parse_options_usage(NULL, options, "A", 1); 1868 parse_options_usage(NULL, options, "a", 1); 1869 goto out; 1870 } 1871 1872 if (add_default_attributes()) 1873 goto out; 1874 1875 target__validate(&target); 1876 1877 if (perf_evlist__create_maps(evsel_list, &target) < 0) { 1878 if (target__has_task(&target)) { 1879 pr_err("Problems finding threads of monitor\n"); 1880 parse_options_usage(stat_usage, options, "p", 1); 1881 parse_options_usage(NULL, options, "t", 1); 1882 } else if (target__has_cpu(&target)) { 1883 perror("failed to parse CPUs map"); 1884 parse_options_usage(stat_usage, options, "C", 1); 1885 parse_options_usage(NULL, options, "a", 1); 1886 } 1887 goto out; 1888 } 1889 if (interval && interval < 100) { 1890 pr_err("print interval must be >= 100ms\n"); 1891 parse_options_usage(stat_usage, options, "I", 1); 1892 goto out; 1893 } 1894 1895 if (perf_evlist__alloc_stats(evsel_list, interval)) 1896 goto out; 1897 1898 if (perf_stat_init_aggr_mode()) 1899 goto out; 1900 1901 /* 1902 * We dont want to block the signals - that would cause 1903 * child tasks to inherit that and Ctrl-C would not work. 1904 * What we want is for Ctrl-C to work in the exec()-ed 1905 * task, but being ignored by perf stat itself: 1906 */ 1907 atexit(sig_atexit); 1908 if (!forever) 1909 signal(SIGINT, skip_signal); 1910 signal(SIGCHLD, skip_signal); 1911 signal(SIGALRM, skip_signal); 1912 signal(SIGABRT, skip_signal); 1913 1914 status = 0; 1915 for (run_idx = 0; forever || run_idx < run_count; run_idx++) { 1916 if (run_count != 1 && verbose) 1917 fprintf(output, "[ perf stat: executing run #%d ... ]\n", 1918 run_idx + 1); 1919 1920 status = run_perf_stat(argc, argv); 1921 if (forever && status != -1) { 1922 print_stat(argc, argv); 1923 perf_stat__reset_stats(evsel_list); 1924 } 1925 } 1926 1927 if (!forever && status != -1 && !interval) 1928 print_stat(argc, argv); 1929 1930 perf_evlist__free_stats(evsel_list); 1931out: 1932 perf_evlist__delete(evsel_list); 1933 return status; 1934} 1935