/*
 * Hypervisor supplied "24x7" performance counter support
 *
 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
 * Copyright 2014 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "hv-24x7: " fmt

#include <linux/perf_event.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/io.h>
#include <linux/byteorder/generic.h>

#include "hv-24x7.h"
#include "hv-24x7-catalog.h"
#include "hv-common.h"

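/*
 * The DOMAIN() x-macro below is expanded once for every entry in
 * hv-24x7-domains.h. As used in this file, only the first parameter (the
 * HV_PERF_DOMAIN_* name, 'n') and the last one ('c', whether the domain
 * counts as physical) are consumed; the middle two are ignored here.
 */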
static const char *event_domain_suffix(unsigned domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		return "__" #n;
#include "hv-24x7-domains.h"
#undef DOMAIN
	default:
		WARN(1, "unknown domain %d\n", domain);
		return "__UNKNOWN_DOMAIN_SUFFIX";
	}
}

static bool domain_is_valid(unsigned domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		/* fall through */
#include "hv-24x7-domains.h"
#undef DOMAIN
		return true;
	default:
		return false;
	}
}

static bool is_physical_domain(unsigned domain)
{
	switch (domain) {
#define DOMAIN(n, v, x, c)		\
	case HV_PERF_DOMAIN_##n:	\
		return c;
#include "hv-24x7-domains.h"
#undef DOMAIN
	default:
		return false;
	}
}

static bool catalog_entry_domain_is_valid(unsigned domain)
{
	return is_physical_domain(domain);
}

/*
 * TODO: Merging events:
 * - Think of the hcall as an interface to a 4d array of counters:
 *   - x = domains
 *   - y = indexes in the domain (core, chip, vcpu, node, etc)
 *   - z = offset into the counter space
 *   - w = lpars (guest vms, "logical partitions")
 * - A single request is: x,y,y_last,z,z_last,w,w_last
 *   - this means we can retrieve a rectangle of counters in y,z for a single x.
 *
 * - Things to consider (ignoring w):
 *   - input  cost_per_request = 16
 *   - output cost_per_result(ys,zs)  = 8 + 8 * ys + ys * zs
 *   - limited number of requests per hcall (must fit into 4K bytes)
 *     - 4k >= 16 [buffer header] + 16 [request size] * request_count
 *     - so at most 255 requests per hcall
 *   - sometimes it will be more efficient to read extra data and discard
 */
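/*
 * For example: with a 4096 byte request buffer, a 16 byte buffer header
 * and 16 bytes per request leave room for (4096 - 16) / 16 = 255 requests
 * in a single hcall, matching the limit noted above.
 */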

/*
 * Example usage:
 *  perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
 */
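/*
 * The named events generated from the catalog (see
 * create_events_from_catalog() below) expand to the same field encoding,
 * so an event can also be selected by its sysfs name, e.g. (with a
 * hypothetical event name):
 *  perf stat -e 'hv_24x7/SOME_EVENT__PHYS_CORE,core=0/'
 */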

/* u3 0-6, one of HV_24X7_PERF_DOMAIN */
EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
/* u32, see "data_offset" */
EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);

EVENT_DEFINE_RANGE(reserved1, config,   4, 15);
EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
EVENT_DEFINE_RANGE(reserved3, config2,  0, 63);

static struct attribute *format_attrs[] = {
	&format_attr_domain.attr,
	&format_attr_offset.attr,
	&format_attr_core.attr,
	&format_attr_vcpu.attr,
	&format_attr_lpar.attr,
	NULL,
};

static struct attribute_group format_group = {
	.name = "format",
	.attrs = format_attrs,
};

static struct attribute_group event_group = {
	.name = "events",
	/* .attrs is set in init */
};

static struct attribute_group event_desc_group = {
	.name = "event_descs",
	/* .attrs is set in init */
};

static struct attribute_group event_long_desc_group = {
	.name = "event_long_descs",
	/* .attrs is set in init */
};

static struct kmem_cache *hv_page_cache;

/*
 * request_buffer and result_buffer are not required to be 4k aligned,
 * but are not allowed to cross any 4k boundary. Aligning them to 4k is
 * the simplest way to ensure that.
 */
#define H24x7_DATA_BUFFER_SIZE	4096
DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);

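/*
 * Layout of the variable length tail of struct hv_24x7_event_data, as
 * decoded by the helpers below: each element is a 2 byte length (which
 * includes the length field itself) followed by the text: first the event
 * name (its length lives in the fixed portion as event_name_len), then the
 * description, then the long/detailed description.
 */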
static char *event_name(struct hv_24x7_event_data *ev, int *len)
{
	*len = be16_to_cpu(ev->event_name_len) - 2;
	return (char *)ev->remainder;
}

static char *event_desc(struct hv_24x7_event_data *ev, int *len)
{
	unsigned nl = be16_to_cpu(ev->event_name_len);
	__be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);

	*len = be16_to_cpu(*desc_len) - 2;
	return (char *)ev->remainder + nl;
}

static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
{
	unsigned nl = be16_to_cpu(ev->event_name_len);
	__be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
	unsigned desc_len = be16_to_cpu(*desc_len_);
	__be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);

	*len = be16_to_cpu(*long_desc_len) - 2;
	return (char *)ev->remainder + nl + desc_len;
}

static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
					  void *end)
{
	void *start = ev;

	return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
}

/*
 * Things we don't check:
 *  - padding for desc, name, and long/detailed desc is required to be '\0'
 *    bytes.
 *
 * Return NULL if we pass end; otherwise return the address of the byte just
 * following the event.
 */
static void *event_end(struct hv_24x7_event_data *ev, void *end)
{
	void *start = ev;
	__be16 *dl_, *ldl_;
	unsigned dl, ldl;
	unsigned nl = be16_to_cpu(ev->event_name_len);

	if (nl < 2) {
		pr_debug("%s: name length too short: %d", __func__, nl);
		return NULL;
	}

	if (start + nl > end) {
		pr_debug("%s: start=%p + nl=%u > end=%p",
				__func__, start, nl, end);
		return NULL;
	}

	dl_ = (__be16 *)(ev->remainder + nl - 2);
	if (!IS_ALIGNED((uintptr_t)dl_, 2))
		pr_warn("desc len not aligned %p", dl_);
	dl = be16_to_cpu(*dl_);
	if (dl < 2) {
		pr_debug("%s: desc len too short: %d", __func__, dl);
		return NULL;
	}

	if (start + nl + dl > end) {
		pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
				__func__, start, nl, dl, start + nl + dl, end);
		return NULL;
	}

	ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
	if (!IS_ALIGNED((uintptr_t)ldl_, 2))
		pr_warn("long desc len not aligned %p", ldl_);
	ldl = be16_to_cpu(*ldl_);
	if (ldl < 2) {
		pr_debug("%s: long desc len too short (ldl=%u)",
				__func__, ldl);
		return NULL;
	}

	if (start + nl + dl + ldl > end) {
		pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
				__func__, start, nl, dl, ldl, end);
		return NULL;
	}

	return start + nl + dl + ldl;
}

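/*
 * Fetch a single 4K page of the 24x7 catalog from the hypervisor into the
 * buffer at phys_4096. As used below, page 0 is requested with version 0;
 * its header then supplies the catalog version used to fetch the remaining
 * pages.
 */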
static unsigned long h_get_24x7_catalog_page_(unsigned long phys_4096,
					      unsigned long version,
					      unsigned long index)
{
	pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
			phys_4096, version, index);

	WARN_ON(!IS_ALIGNED(phys_4096, 4096));

	return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
			phys_4096, version, index);
}

static unsigned long h_get_24x7_catalog_page(char page[],
					     u64 version, u32 index)
{
	return h_get_24x7_catalog_page_(virt_to_phys(page),
					version, index);
}

static unsigned core_domains[] = {
	HV_PERF_DOMAIN_PHYS_CORE,
	HV_PERF_DOMAIN_VCPU_HOME_CORE,
	HV_PERF_DOMAIN_VCPU_HOME_CHIP,
	HV_PERF_DOMAIN_VCPU_HOME_NODE,
	HV_PERF_DOMAIN_VCPU_REMOTE_NODE,
};
/* chip event data always yields a single event, core yields multiple */
#define MAX_EVENTS_PER_EVENT_DATA ARRAY_SIZE(core_domains)

static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
{
	const char *sindex;
	const char *lpar;

	if (is_physical_domain(domain)) {
		lpar = "0x0";
		sindex = "core";
	} else {
		lpar = "?";
		sindex = "vcpu";
	}

	return kasprintf(GFP_KERNEL,
			"domain=0x%x,offset=0x%x,%s=?,lpar=%s",
			domain,
			be16_to_cpu(event->event_counter_offs) +
				be16_to_cpu(event->event_group_record_offs),
			sindex,
			lpar);
}

/* Avoid trusting fw to NUL terminate strings */
static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
{
	return kasprintf(gfp, "%.*s", max_len, maybe_str);
}

static ssize_t device_show_string(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dev_ext_attribute *d;

	d = container_of(attr, struct dev_ext_attribute, attr);

	return sprintf(buf, "%s\n", (char *)d->var);
}

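/*
 * Wrap a (name, string value) pair in a sysfs attribute. The attribute
 * keeps bare pointers to 'name' and 'str': both must be kmalloc'd, and
 * both are released by device_str_attr_destroy() along with the attribute
 * itself.
 */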
static struct attribute *device_str_attr_create_(char *name, char *str)
{
	struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);

	if (!attr)
		return NULL;

	attr->var = str;
	attr->attr.attr.name = name;
	attr->attr.attr.mode = 0444;
	attr->attr.show = device_show_string;

	return &attr->attr.attr;
}

static struct attribute *device_str_attr_create(char *name, int name_max,
						int name_nonce,
						char *str, size_t str_max)
{
	char *n;
	char *s = memdup_to_str(str, str_max, GFP_KERNEL);
	struct attribute *a;

	if (!s)
		return NULL;

	if (!name_nonce)
		n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
	else
		n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
					name_nonce);
	if (!n)
		goto out_s;

	a = device_str_attr_create_(n, s);
	if (!a)
		goto out_n;

	return a;
out_n:
	kfree(n);
out_s:
	kfree(s);
	return NULL;
}

static void device_str_attr_destroy(struct attribute *attr)
{
	struct dev_ext_attribute *d;

	d = container_of(attr, struct dev_ext_attribute, attr.attr);
	kfree(d->var);
	kfree(d->attr.attr.name);
	kfree(d);
}

static struct attribute *event_to_attr(unsigned ix,
				       struct hv_24x7_event_data *event,
				       unsigned domain,
				       int nonce)
{
	int event_name_len;
	char *ev_name, *a_ev_name, *val;
	const char *ev_suffix;
	struct attribute *attr;

	if (!domain_is_valid(domain)) {
		pr_warn("catalog event %u has invalid domain %u\n",
				ix, domain);
		return NULL;
	}

	val = event_fmt(event, domain);
	if (!val)
		return NULL;

	ev_suffix = event_domain_suffix(domain);
	ev_name = event_name(event, &event_name_len);
	if (!nonce)
		a_ev_name = kasprintf(GFP_KERNEL, "%.*s%s",
				(int)event_name_len, ev_name, ev_suffix);
	else
		a_ev_name = kasprintf(GFP_KERNEL, "%.*s%s__%d",
				(int)event_name_len, ev_name, ev_suffix, nonce);

	if (!a_ev_name)
		goto out_val;

	attr = device_str_attr_create_(a_ev_name, val);
	if (!attr)
		goto out_name;

	return attr;
out_name:
	kfree(a_ev_name);
out_val:
	kfree(val);
	return NULL;
}

static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
				int nonce)
{
	int nl, dl;
	char *name = event_name(event, &nl);
	char *desc = event_desc(event, &dl);

	/* If there isn't a description, don't create the sysfs file */
	if (!dl)
		return NULL;

	return device_str_attr_create(name, nl, nonce, desc, dl);
}

static struct attribute *
event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
{
	int nl, dl;
	char *name = event_name(event, &nl);
	char *desc = event_long_desc(event, &dl);

	/* If there isn't a description, don't create the sysfs file */
	if (!dl)
		return NULL;

	return device_str_attr_create(name, nl, nonce, desc, dl);
}

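/*
 * Expand one catalog event into sysfs event attributes: a chip event maps
 * to a single attribute, a core event to one attribute per entry in
 * core_domains[]. Returns the number of attributes written to 'attrs', or
 * -1 on failure (any attributes already created are destroyed again).
 */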
static ssize_t event_data_to_attrs(unsigned ix, struct attribute **attrs,
		struct hv_24x7_event_data *event, int nonce)
{
	unsigned i;

	switch (event->domain) {
	case HV_PERF_DOMAIN_PHYS_CHIP:
		*attrs = event_to_attr(ix, event, event->domain, nonce);
		return 1;
	case HV_PERF_DOMAIN_PHYS_CORE:
		for (i = 0; i < ARRAY_SIZE(core_domains); i++) {
			attrs[i] = event_to_attr(ix, event, core_domains[i],
						nonce);
			if (!attrs[i]) {
				pr_warn("catalog event %u: individual attr %u "
					"creation failure\n", ix, i);
				for (; i; i--)
					device_str_attr_destroy(attrs[i - 1]);
				return -1;
			}
		}
		return i;
	default:
		pr_warn("catalog event %u: domain %u is not allowed in the "
				"catalog\n", ix, event->domain);
		return -1;
	}
}

static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
{
	switch (event->domain) {
	case HV_PERF_DOMAIN_PHYS_CHIP:
		return 1;
	case HV_PERF_DOMAIN_PHYS_CORE:
		return ARRAY_SIZE(core_domains);
	default:
		return 0;
	}
}

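/*
 * The catalog's event data is vmalloc'd (it may span many pages that need
 * not be physically contiguous), so the physical address of each 4K chunk
 * has to be looked up individually before being handed to the hypervisor.
 */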
static unsigned long vmalloc_to_phys(void *v)
{
	struct page *p = vmalloc_to_page(v);

	BUG_ON(!p);
	return page_to_phys(p) + offset_in_page(v);
}

/*
 * Track event names we have already seen so that duplicates in the catalog
 * can be disambiguated; 'ct' counts repeats and is used as the "__<nonce>"
 * suffix appended to duplicate sysfs names.
 */
struct event_uniq {
	struct rb_node node;
	const char *name;
	int nl;
	unsigned ct;
	unsigned domain;
};

static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
{
	if (s1 < s2)
		return 1;
	if (s1 > s2)
		return -1;

	return memcmp(d1, d2, s1);
}

static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2,
					size_t s2, unsigned d2)
{
	int r = memord(v1, s1, v2, s2);

	if (r)
		return r;
	if (d1 > d2)
		return 1;
	if (d2 > d1)
		return -1;
	return 0;
}

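/*
 * Record a name/domain pair in the uniqueness tree. Returns 0 the first
 * time a pair is seen, the (positive) duplicate count for repeats (used
 * as the "__<n>" suffix when naming the sysfs attribute), or -ENOMEM.
 */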
static int event_uniq_add(struct rb_root *root, const char *name, int nl,
				unsigned domain)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct event_uniq *data;

	/* Figure out where to put new node */
	while (*new) {
		struct event_uniq *it;
		int result;

		it = container_of(*new, struct event_uniq, node);
		result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
					it->domain);

		parent = *new;
		if (result < 0)
			new = &((*new)->rb_left);
		else if (result > 0)
			new = &((*new)->rb_right);
		else {
			it->ct++;
			pr_info("found a duplicate event %.*s, ct=%u\n", nl,
						name, it->ct);
			return it->ct;
		}
	}

	data = kmalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	*data = (struct event_uniq) {
		.name = name,
		.nl = nl,
		.ct = 0,
		.domain = domain,
	};

	/* Add new node and rebalance tree. */
	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);

	/* data->ct */
	return 0;
}

static void event_uniq_destroy(struct rb_root *root)
{
	/*
	 * the strings we point to are in the giant block of memory filled by
	 * the catalog, and are freed separately.
	 */
	struct event_uniq *pos, *n;

	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
		kfree(pos);
}


/*
 * Ensure the event structure's sizes are self consistent and don't cause
 * us to read outside of the event.
 *
 * On success, return the event length in bytes.
 * Otherwise, return -1 (and print as appropriate).
 */
static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
					  size_t event_idx,
					  size_t event_data_bytes,
					  size_t event_entry_count,
					  size_t offset, void *end)
{
	ssize_t ev_len;
	void *ev_end, *calc_ev_end;

	if (offset >= event_data_bytes)
		return -1;

	if (event_idx >= event_entry_count) {
		pr_devel("catalog event data has %zu bytes of padding after last event\n",
				event_data_bytes - offset);
		return -1;
	}

	if (!event_fixed_portion_is_within(event, end)) {
		pr_warn("event %zu fixed portion is not within range\n",
				event_idx);
		return -1;
	}

	ev_len = be16_to_cpu(event->length);

	if (ev_len % 16)
		pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
				event_idx, ev_len, event);

	ev_end = (__u8 *)event + ev_len;
	if (ev_end > end) {
		pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
				event_idx, ev_len, ev_end, end,
				offset);
		return -1;
	}

	calc_ev_end = event_end(event, end);
	if (!calc_ev_end) {
		pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
			event_idx, event_data_bytes, event, end,
			offset);
		return -1;
	}

	if (calc_ev_end > ev_end) {
		pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
			event_idx, event, ev_end, offset, calc_ev_end);
		return -1;
	}

	return ev_len;
}

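/*
 * Largest number of 4K pages representable in a size_t; used to reject
 * catalog headers whose page counts would overflow the byte size
 * calculations below.
 */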
#define MAX_4K (SIZE_MAX / 4096)

static int create_events_from_catalog(struct attribute ***events_,
		struct attribute ***event_descs_,
		struct attribute ***event_long_descs_)
{
	unsigned long hret;
	size_t catalog_len, catalog_page_len, event_entry_count,
	       event_data_len, event_data_offs,
	       event_data_bytes, junk_events, event_idx, event_attr_ct, i,
	       attr_max, event_idx_last, desc_ct, long_desc_ct;
	ssize_t ct, ev_len;
	uint32_t catalog_version_num;
	struct attribute **events, **event_descs, **event_long_descs;
	struct hv_24x7_catalog_page_0 *page_0 =
		kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
	void *page = page_0;
	void *event_data, *end;
	struct hv_24x7_event_data *event;
	struct rb_root ev_uniq = RB_ROOT;
	int ret = 0;

	if (!page) {
		ret = -ENOMEM;
		goto e_out;
	}

	hret = h_get_24x7_catalog_page(page, 0, 0);
	if (hret) {
		ret = -EIO;
		goto e_free;
	}

	catalog_version_num = be64_to_cpu(page_0->version);
	catalog_page_len = be32_to_cpu(page_0->length);

	if (MAX_4K < catalog_page_len) {
		pr_err("invalid page count: %zu\n", catalog_page_len);
		ret = -EIO;
		goto e_free;
	}

	catalog_len = catalog_page_len * 4096;

	event_entry_count = be16_to_cpu(page_0->event_entry_count);
	event_data_offs   = be16_to_cpu(page_0->event_data_offs);
	event_data_len    = be16_to_cpu(page_0->event_data_len);

	pr_devel("cv %zu cl %zu eec %zu edo %zu edl %zu\n",
			(size_t)catalog_version_num, catalog_len,
			event_entry_count, event_data_offs, event_data_len);

	if ((MAX_4K < event_data_len)
			|| (MAX_4K < event_data_offs)
			|| (MAX_4K - event_data_offs < event_data_len)) {
		pr_err("invalid event data offs %zu and/or len %zu\n",
				event_data_offs, event_data_len);
		ret = -EIO;
		goto e_free;
	}

	if ((event_data_offs + event_data_len) > catalog_page_len) {
		pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
				event_data_offs,
				event_data_offs + event_data_len,
				catalog_page_len);
		ret = -EIO;
		goto e_free;
	}

	if (SIZE_MAX / MAX_EVENTS_PER_EVENT_DATA - 1 < event_entry_count) {
		pr_err("event_entry_count %zu is invalid\n",
				event_entry_count);
		ret = -EIO;
		goto e_free;
	}

	event_data_bytes = event_data_len * 4096;

	/*
	 * event data can span several pages, events can cross between these
	 * pages. Use vmalloc to make this easier.
	 */
	event_data = vmalloc(event_data_bytes);
	if (!event_data) {
		pr_err("could not allocate event data\n");
		ret = -ENOMEM;
		goto e_free;
	}

	end = event_data + event_data_bytes;

	/*
	 * using vmalloc_to_phys() like this only works if PAGE_SIZE is
	 * divisible by 4096
	 */
	BUILD_BUG_ON(PAGE_SIZE % 4096);

	for (i = 0; i < event_data_len; i++) {
		hret = h_get_24x7_catalog_page_(
				vmalloc_to_phys(event_data + i * 4096),
				catalog_version_num,
				i + event_data_offs);
		if (hret) {
			pr_err("failed to get event data in page %zu\n",
					i + event_data_offs);
			ret = -EIO;
			goto e_event_data;
		}
	}

	/*
	 * scan the catalog to determine the number of attributes we need, and
	 * verify it at the same time.
	 */
	for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
	     ;
	     event_idx++, event = (void *)event + ev_len) {
		size_t offset = (void *)event - (void *)event_data;
		char *name;
		int nl;

		ev_len = catalog_event_len_validate(event, event_idx,
						    event_data_bytes,
						    event_entry_count,
						    offset, end);
		if (ev_len < 0)
			break;

		name = event_name(event, &nl);

		if (event->event_group_record_len == 0) {
			pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
					event_idx, nl, name);
			junk_events++;
			continue;
		}

		if (!catalog_entry_domain_is_valid(event->domain)) {
			pr_info("event %zu (%.*s) has invalid domain %d\n",
					event_idx, nl, name, event->domain);
			junk_events++;
			continue;
		}

		attr_max += event_to_attr_ct(event);
	}

	event_idx_last = event_idx;
	if (event_idx_last != event_entry_count)
		pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
				event_idx_last, event_entry_count, junk_events);

	events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
	if (!events) {
		ret = -ENOMEM;
		goto e_event_data;
	}

	event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
				GFP_KERNEL);
	if (!event_descs) {
		ret = -ENOMEM;
		goto e_event_attrs;
	}

	event_long_descs = kmalloc_array(event_idx + 1,
			sizeof(*event_long_descs), GFP_KERNEL);
	if (!event_long_descs) {
		ret = -ENOMEM;
		goto e_event_descs;
	}

	/* Iterate over the catalog filling in the attribute vector */
	for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
				event = event_data, event_idx = 0;
			event_idx < event_idx_last;
			event_idx++, ev_len = be16_to_cpu(event->length),
				event = (void *)event + ev_len) {
		char *name;
		int nl;
		int nonce;
		/*
		 * these are the only "bad" events that are intermixed and that
		 * we can ignore without issue. make sure to skip them here
		 */
		if (event->event_group_record_len == 0)
			continue;
		if (!catalog_entry_domain_is_valid(event->domain))
			continue;

		name  = event_name(event, &nl);
		nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
		ct    = event_data_to_attrs(event_idx, events + event_attr_ct,
					    event, nonce);
		if (ct <= 0) {
			pr_warn("event %zu (%.*s) creation failure, skipping\n",
				event_idx, nl, name);
			junk_events++;
		} else {
			event_attr_ct += ct;
			event_descs[desc_ct] = event_to_desc_attr(event, nonce);
			if (event_descs[desc_ct])
				desc_ct++;
			event_long_descs[long_desc_ct] =
					event_to_long_desc_attr(event, nonce);
			if (event_long_descs[long_desc_ct])
				long_desc_ct++;
		}
	}

	pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
			event_idx, event_attr_ct, junk_events, desc_ct);

	events[event_attr_ct] = NULL;
	event_descs[desc_ct] = NULL;
	event_long_descs[long_desc_ct] = NULL;

	event_uniq_destroy(&ev_uniq);
	vfree(event_data);
	kmem_cache_free(hv_page_cache, page);

	*events_ = events;
	*event_descs_ = event_descs;
	*event_long_descs_ = event_long_descs;
	return 0;

e_event_descs:
	kfree(event_descs);
e_event_attrs:
	kfree(events);
e_event_data:
	vfree(event_data);
e_free:
	kmem_cache_free(hv_page_cache, page);
e_out:
	*events_ = NULL;
	*event_descs_ = NULL;
	*event_long_descs_ = NULL;
	return ret;
}

static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
			    struct bin_attribute *bin_attr, char *buf,
			    loff_t offset, size_t count)
{
	unsigned long hret;
	ssize_t ret = 0;
	size_t catalog_len = 0, catalog_page_len = 0;
	loff_t page_offset = 0;
	loff_t offset_in_page;
	size_t copy_len;
	uint64_t catalog_version_num = 0;
	void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
	struct hv_24x7_catalog_page_0 *page_0 = page;

	if (!page)
		return -ENOMEM;

	hret = h_get_24x7_catalog_page(page, 0, 0);
	if (hret) {
		ret = -EIO;
		goto e_free;
	}

	catalog_version_num = be64_to_cpu(page_0->version);
	catalog_page_len = be32_to_cpu(page_0->length);
	catalog_len = catalog_page_len * 4096;

	page_offset = offset / 4096;
	offset_in_page = offset % 4096;

	if (page_offset >= catalog_page_len)
		goto e_free;

	if (page_offset != 0) {
		hret = h_get_24x7_catalog_page(page, catalog_version_num,
					       page_offset);
		if (hret) {
			ret = -EIO;
			goto e_free;
		}
	}

	copy_len = 4096 - offset_in_page;
	if (copy_len > count)
		copy_len = count;

	memcpy(buf, page+offset_in_page, copy_len);
	ret = copy_len;

e_free:
	if (hret)
		pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
		       " rc=%ld\n",
		       catalog_version_num, page_offset, hret);
	kmem_cache_free(hv_page_cache, page);

	pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
			"catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
			count, catalog_len, catalog_page_len, ret);

	return ret;
}

#define PAGE_0_ATTR(_name, _fmt, _expr)				\
static ssize_t _name##_show(struct device *dev,			\
			    struct device_attribute *dev_attr,	\
			    char *buf)				\
{								\
	unsigned long hret;					\
	ssize_t ret = 0;					\
	void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);	\
	struct hv_24x7_catalog_page_0 *page_0 = page;		\
	if (!page)						\
		return -ENOMEM;					\
	hret = h_get_24x7_catalog_page(page, 0, 0);		\
	if (hret) {						\
		ret = -EIO;					\
		goto e_free;					\
	}							\
	ret = sprintf(buf, _fmt, _expr);			\
e_free:								\
	kmem_cache_free(hv_page_cache, page);			\
	return ret;						\
}								\
static DEVICE_ATTR_RO(_name)

PAGE_0_ATTR(catalog_version, "%lld\n",
		(unsigned long long)be64_to_cpu(page_0->version));
PAGE_0_ATTR(catalog_len, "%lld\n",
		(unsigned long long)be32_to_cpu(page_0->length) * 4096);
static BIN_ATTR_RO(catalog, 0/* real length varies */);

static struct bin_attribute *if_bin_attrs[] = {
	&bin_attr_catalog,
	NULL,
};

static struct attribute *if_attrs[] = {
	&dev_attr_catalog_len.attr,
	&dev_attr_catalog_version.attr,
	NULL,
};

static struct attribute_group if_group = {
	.name = "interface",
	.bin_attrs = if_bin_attrs,
	.attrs = if_attrs,
};

static const struct attribute_group *attr_groups[] = {
	&format_group,
	&event_group,
	&event_desc_group,
	&event_long_desc_group,
	&if_group,
	NULL,
};

static void log_24x7_hcall(struct hv_24x7_request_buffer *request_buffer,
			struct hv_24x7_data_result_buffer *result_buffer,
			unsigned long ret)
{
	struct hv_24x7_request *req;

	req = &request_buffer->requests[0];
	pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => "
			"ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
			req->performance_domain, req->data_offset,
			req->starting_ix, req->starting_lpar_ix, ret, ret,
			result_buffer->detailed_rc,
			result_buffer->failing_request_ix);
}

/*
 * Start the process for a new H_GET_24x7_DATA hcall.
 */
static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
			struct hv_24x7_data_result_buffer *result_buffer)
{
	memset(request_buffer, 0, 4096);
	memset(result_buffer, 0, 4096);

	request_buffer->interface_version = HV_24X7_IF_VERSION_CURRENT;
	/* memset above set request_buffer->num_requests to 0 */
}

/*
 * Commit (i.e. perform) the H_GET_24x7_DATA hcall using the data collected
 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
 */
static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
			struct hv_24x7_data_result_buffer *result_buffer)
{
	unsigned long ret;

	/*
	 * NOTE: Due to variable number of array elements in request and
	 *	 result buffer(s), sizeof() is not reliable. Use the actual
	 *	 allocated buffer size, H24x7_DATA_BUFFER_SIZE.
	 */
	ret = plpar_hcall_norets(H_GET_24X7_DATA,
			virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
			virt_to_phys(result_buffer),  H24x7_DATA_BUFFER_SIZE);

	if (ret)
		log_24x7_hcall(request_buffer, result_buffer, ret);

	return ret;
}

/*
 * Add the given @event to the next slot in the 24x7 request_buffer.
 *
 * Note that H_GET_24X7_DATA hcall allows reading several counters'
 * values in a single HCALL. We expect the caller to add events to the
 * request buffer one by one, make the HCALL and process the results.
 */
static int add_event_to_24x7_request(struct perf_event *event,
				struct hv_24x7_request_buffer *request_buffer)
{
	u16 idx;
	int i;
	struct hv_24x7_request *req;

	if (request_buffer->num_requests > 254) {
		pr_devel("Too many requests for 24x7 HCALL %d\n",
				request_buffer->num_requests);
		return -EINVAL;
	}

	if (is_physical_domain(event_get_domain(event)))
		idx = event_get_core(event);
	else
		idx = event_get_vcpu(event);

	i = request_buffer->num_requests++;
	req = &request_buffer->requests[i];

	req->performance_domain = event_get_domain(event);
	req->data_size = cpu_to_be16(8);
	req->data_offset = cpu_to_be32(event_get_offset(event));
	req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
	req->max_num_lpars = cpu_to_be16(1);
	req->starting_ix = cpu_to_be16(idx);
	req->max_ix = cpu_to_be16(1);

	return 0;
}

static unsigned long single_24x7_request(struct perf_event *event, u64 *count)
{
	unsigned long ret;
	struct hv_24x7_request_buffer *request_buffer;
	struct hv_24x7_data_result_buffer *result_buffer;
	struct hv_24x7_result *resb;

	BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
	BUILD_BUG_ON(sizeof(*result_buffer) > 4096);

	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
	result_buffer = (void *)get_cpu_var(hv_24x7_resb);

	init_24x7_request(request_buffer, result_buffer);

	ret = add_event_to_24x7_request(event, request_buffer);
	if (ret)
		goto out;

	ret = make_24x7_request(request_buffer, result_buffer);
	if (ret) {
		log_24x7_hcall(request_buffer, result_buffer, ret);
		goto out;
	}

	/* process result from hcall */
	resb = &result_buffer->results[0];
	*count = be64_to_cpu(resb->elements[0].element_data[0]);

out:
	put_cpu_var(hv_24x7_reqb);
	put_cpu_var(hv_24x7_resb);
	return ret;
}


static int h_24x7_event_init(struct perf_event *event)
{
	struct hv_perf_caps caps;
	unsigned domain;
	unsigned long hret;
	u64 ct;

	/* Not our event */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* Unused areas must be 0 */
	if (event_get_reserved1(event) ||
	    event_get_reserved2(event) ||
	    event_get_reserved3(event)) {
		pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
				event->attr.config,
				event_get_reserved1(event),
				event->attr.config1,
				event_get_reserved2(event),
				event->attr.config2,
				event_get_reserved3(event));
		return -EINVAL;
	}

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest)
		return -EINVAL;

	/* no branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	/* offset must be 8 byte aligned */
	if (event_get_offset(event) % 8) {
		pr_devel("bad alignment\n");
		return -EINVAL;
	}

	/* Domains above 6 are invalid */
	domain = event_get_domain(event);
	if (domain > 6) {
		pr_devel("invalid domain %d\n", domain);
		return -EINVAL;
	}

	hret = hv_perf_caps_get(&caps);
	if (hret) {
		pr_devel("could not get capabilities: rc=%ld\n", hret);
		return -EIO;
	}

	/* Physical domains & other lpars require extra capabilities */
	if (!caps.collect_privileged && (is_physical_domain(domain) ||
		(event_get_lpar(event) != event_get_lpar_max()))) {
		pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
				is_physical_domain(domain),
				event_get_lpar(event));
		return -EACCES;
	}

	/* see if the event complains */
	if (single_24x7_request(event, &ct)) {
		pr_devel("test hcall failed\n");
		return -EIO;
	}

	return 0;
}

static u64 h_24x7_get_value(struct perf_event *event)
{
	unsigned long ret;
	u64 ct;

	ret = single_24x7_request(event, &ct);
	if (ret)
		/* We checked this in event init, shouldn't fail here... */
		return 0;

	return ct;
}

static void update_event_count(struct perf_event *event, u64 now)
{
	s64 prev;

	prev = local64_xchg(&event->hw.prev_count, now);
	local64_add(now - prev, &event->count);
}

static void h_24x7_event_read(struct perf_event *event)
{
	u64 now;

	now = h_24x7_get_value(event);
	update_event_count(event, now);
}

static void h_24x7_event_start(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_RELOAD)
		local64_set(&event->hw.prev_count, h_24x7_get_value(event));
}

static void h_24x7_event_stop(struct perf_event *event, int flags)
{
	h_24x7_event_read(event);
}

static int h_24x7_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		h_24x7_event_start(event, flags);

	return 0;
}

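/*
 * task_ctx_nr = perf_invalid_context: these counters are hypervisor wide
 * rather than tied to any task, so only system-wide (per-CPU) events are
 * supported.
 */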
static struct pmu h_24x7_pmu = {
	.task_ctx_nr = perf_invalid_context,

	.name = "hv_24x7",
	.attr_groups = attr_groups,
	.event_init  = h_24x7_event_init,
	.add         = h_24x7_event_add,
	.del         = h_24x7_event_stop,
	.start       = h_24x7_event_start,
	.stop        = h_24x7_event_stop,
	.read        = h_24x7_event_read,
};

static int hv_24x7_init(void)
{
	int r;
	unsigned long hret;
	struct hv_perf_caps caps;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		pr_debug("not a virtualized system, not enabling\n");
		return -ENODEV;
	}

	hret = hv_perf_caps_get(&caps);
	if (hret) {
		pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
				hret);
		return -ENODEV;
	}

	hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
	if (!hv_page_cache)
		return -ENOMEM;

	/* sampling not supported */
	h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;

	r = create_events_from_catalog(&event_group.attrs,
				   &event_desc_group.attrs,
				   &event_long_desc_group.attrs);

	if (r)
		return r;

	r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
	if (r)
		return r;

	return 0;
}

device_initcall(hv_24x7_init);