cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

hv-24x7.c (45308B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Hypervisor supplied "24x7" performance counter support
      4 *
      5 * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
      6 * Copyright 2014 IBM Corporation.
      7 */
      8
      9#define pr_fmt(fmt) "hv-24x7: " fmt
     10
     11#include <linux/perf_event.h>
     12#include <linux/rbtree.h>
     13#include <linux/module.h>
     14#include <linux/slab.h>
     15#include <linux/vmalloc.h>
     16
     17#include <asm/cputhreads.h>
     18#include <asm/firmware.h>
     19#include <asm/hvcall.h>
     20#include <asm/io.h>
     21#include <linux/byteorder/generic.h>
     22
     23#include <asm/rtas.h>
     24#include "hv-24x7.h"
     25#include "hv-24x7-catalog.h"
     26#include "hv-common.h"
     27
     28/* Version of the 24x7 hypervisor API that we should use on this machine. */
     29static int interface_version;
     30
     31/* Whether we have to aggregate result data for some domains. */
     32static bool aggregate_result_elements;
     33
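/*
 * CPU currently used to service 24x7 hcalls for this PMU; on hotplug the
 * events are migrated to another online CPU (see ppc_hv_24x7_cpu_offline()
 * below).
 */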
     34static cpumask_t hv_24x7_cpumask;
     35
     36static bool domain_is_valid(unsigned int domain)
     37{
     38	switch (domain) {
     39#define DOMAIN(n, v, x, c)		\
     40	case HV_PERF_DOMAIN_##n:	\
     41		/* fall through */
     42#include "hv-24x7-domains.h"
     43#undef DOMAIN
     44		return true;
     45	default:
     46		return false;
     47	}
     48}
     49
     50static bool is_physical_domain(unsigned int domain)
     51{
     52	switch (domain) {
     53#define DOMAIN(n, v, x, c)		\
     54	case HV_PERF_DOMAIN_##n:	\
     55		return c;
     56#include "hv-24x7-domains.h"
     57#undef DOMAIN
     58	default:
     59		return false;
     60	}
     61}
     62
     63/*
     64 * The Processor Module Information system parameter allows the platform
     65 * to transfer certain processor module information to the OS.
     66 * Refer to the PAPR+ document; its parameter token value is 43.
     67 */
     68
     69#define PROCESSOR_MODULE_INFO   43
     70
     71static u32 phys_sockets;	/* Physical sockets */
     72static u32 phys_chipspersocket;	/* Physical chips per socket */
     73static u32 phys_coresperchip; /* Physical cores per chip */
     74
     75/*
     76 * read_24x7_sys_info()
     77 * Retrieve the number of sockets, chips per socket and cores per chip
     78 * through the ibm,get-system-parameter RTAS call.
     79 */
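/*
 * Rough layout of the returned parameter data, as implied by the byte
 * offsets parsed below (the PAPR+ specification is authoritative):
 *
 *	bytes 0-1: length of the returned data
 *	bytes 2-3: number of module types (ntypes)
 *	bytes 4-5: number of sockets
 *	bytes 6-7: chips per socket
 *	bytes 8-9: cores per chip
 */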
     80void read_24x7_sys_info(void)
     81{
     82	int call_status, len, ntypes;
     83
     84	spin_lock(&rtas_data_buf_lock);
     85
     86	/*
     87	 * Default the number of sockets, chips per socket and cores per
     88	 * chip to 1, in case the system parameter read below fails.
     89	 */
     90	phys_sockets = 1;
     91	phys_chipspersocket = 1;
     92	phys_coresperchip = 1;
     93
     94	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
     95				NULL,
     96				PROCESSOR_MODULE_INFO,
     97				__pa(rtas_data_buf),
     98				RTAS_DATA_BUF_SIZE);
     99
    100	if (call_status != 0) {
    101		pr_err("Error calling get-system-parameter %d\n",
    102		       call_status);
    103	} else {
    104		len = be16_to_cpup((__be16 *)&rtas_data_buf[0]);
    105		if (len < 8)
    106			goto out;
    107
    108		ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]);
    109
    110		if (!ntypes)
    111			goto out;
    112
    113		phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]);
    114		phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]);
    115		phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]);
    116	}
    117
    118out:
    119	spin_unlock(&rtas_data_buf_lock);
    120}
    121
    122/* Domains for which more than one result element is returned for each event. */
    123static bool domain_needs_aggregation(unsigned int domain)
    124{
    125	return aggregate_result_elements &&
    126			(domain == HV_PERF_DOMAIN_PHYS_CORE ||
    127			 (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
    128			  domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
    129}
    130
    131static const char *domain_name(unsigned int domain)
    132{
    133	if (!domain_is_valid(domain))
    134		return NULL;
    135
    136	switch (domain) {
    137	case HV_PERF_DOMAIN_PHYS_CHIP:		return "Physical Chip";
    138	case HV_PERF_DOMAIN_PHYS_CORE:		return "Physical Core";
    139	case HV_PERF_DOMAIN_VCPU_HOME_CORE:	return "VCPU Home Core";
    140	case HV_PERF_DOMAIN_VCPU_HOME_CHIP:	return "VCPU Home Chip";
    141	case HV_PERF_DOMAIN_VCPU_HOME_NODE:	return "VCPU Home Node";
    142	case HV_PERF_DOMAIN_VCPU_REMOTE_NODE:	return "VCPU Remote Node";
    143	}
    144
    145	WARN_ON_ONCE(domain);
    146	return NULL;
    147}
    148
    149static bool catalog_entry_domain_is_valid(unsigned int domain)
    150{
    151	/* POWER8 doesn't support virtual domains. */
    152	if (interface_version == 1)
    153		return is_physical_domain(domain);
    154	else
    155		return domain_is_valid(domain);
    156}
    157
    158/*
    159 * TODO: Merging events:
    160 * - Think of the hcall as an interface to a 4d array of counters:
    161 *   - x = domains
    162 *   - y = indexes in the domain (core, chip, vcpu, node, etc)
    163 *   - z = offset into the counter space
    164 *   - w = lpars (guest vms, "logical partitions")
    165 * - A single request is: x,y,y_last,z,z_last,w,w_last
    166 *   - this means we can retrieve a rectangle of counters in y,z for a single x.
    167 *
    168 * - Things to consider (ignoring w):
    169 *   - input  cost_per_request = 16
    170 *   - output cost_per_result(ys,zs)  = 8 + 8 * ys + ys * zs
    171 *   - limited number of requests per hcall (must fit into 4K bytes)
    172 *     - 4k >= 16 [buffer header] + 16 [request size] * request_count
    173 *     - 255 requests per hcall
    174 *   - sometimes it will be more efficient to read extra data and discard
    175 */
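/*
 * Illustrative arithmetic for the request limit above, assuming the 16-byte
 * request size quoted there (v2 requests are larger, see
 * H24x7_REQUEST_SIZE()):
 *
 *	(4096 byte buffer - 16 byte header) / 16 bytes per request = 255
 *
 * max_num_requests() below derives the same bound from the actual structure
 * sizes for the interface version in use.
 */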
    176
    177/*
    178 * Example usage:
    179 *  perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
    180 */
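/*
 * Each field in the example above is one of the format attributes declared
 * below, so (using the usual perf format-attr bit numbering) domain=2
 * occupies config bits 0-3, vcpu=0 bits 16-31 and offset=8 bits 32-63,
 * giving config = (8ULL << 32) | 2 = 0x0000000800000002, while lpar is
 * encoded separately in config1 bits 0-15.
 */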
    181
    182/* u3 0-6, one of HV_24X7_PERF_DOMAIN */
    183EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
    184/* u16 */
    185EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
    186EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
    187EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
    188/* u32, see "data_offset" */
    189EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
    190/* u16 */
    191EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);
    192
    193EVENT_DEFINE_RANGE(reserved1, config,   4, 15);
    194EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
    195EVENT_DEFINE_RANGE(reserved3, config2,  0, 63);
    196
    197static struct attribute *format_attrs[] = {
    198	&format_attr_domain.attr,
    199	&format_attr_offset.attr,
    200	&format_attr_core.attr,
    201	&format_attr_chip.attr,
    202	&format_attr_vcpu.attr,
    203	&format_attr_lpar.attr,
    204	NULL,
    205};
    206
    207static const struct attribute_group format_group = {
    208	.name = "format",
    209	.attrs = format_attrs,
    210};
    211
    212static struct attribute_group event_group = {
    213	.name = "events",
    214	/* .attrs is set in init */
    215};
    216
    217static struct attribute_group event_desc_group = {
    218	.name = "event_descs",
    219	/* .attrs is set in init */
    220};
    221
    222static struct attribute_group event_long_desc_group = {
    223	.name = "event_long_descs",
    224	/* .attrs is set in init */
    225};
    226
    227static struct kmem_cache *hv_page_cache;
    228
    229static DEFINE_PER_CPU(int, hv_24x7_txn_flags);
    230static DEFINE_PER_CPU(int, hv_24x7_txn_err);
    231
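/*
 * Per-CPU map from a request's index in the current READ transaction to the
 * perf_event it was added for; filled by h_24x7_event_read() and used by
 * h_24x7_event_commit_txn() to attribute results. The array size matches the
 * "255 requests per hcall" limit noted above.
 */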
    232struct hv_24x7_hw {
    233	struct perf_event *events[255];
    234};
    235
    236static DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);
    237
    238/*
    239 * request_buffer and result_buffer are not required to be 4k aligned,
    240 * but are not allowed to cross any 4k boundary. Aligning them to 4k is
    241 * the simplest way to ensure that.
    242 */
    243#define H24x7_DATA_BUFFER_SIZE	4096
    244static DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
    245static DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
    246
    247static unsigned int max_num_requests(int interface_version)
    248{
    249	return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
    250		/ H24x7_REQUEST_SIZE(interface_version);
    251}
    252
    253static char *event_name(struct hv_24x7_event_data *ev, int *len)
    254{
    255	*len = be16_to_cpu(ev->event_name_len) - 2;
    256	return (char *)ev->remainder;
    257}
    258
    259static char *event_desc(struct hv_24x7_event_data *ev, int *len)
    260{
    261	unsigned int nl = be16_to_cpu(ev->event_name_len);
    262	__be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);
    263
    264	*len = be16_to_cpu(*desc_len) - 2;
    265	return (char *)ev->remainder + nl;
    266}
    267
    268static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
    269{
    270	unsigned int nl = be16_to_cpu(ev->event_name_len);
    271	__be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
    272	unsigned int desc_len = be16_to_cpu(*desc_len_);
    273	__be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);
    274
    275	*len = be16_to_cpu(*long_desc_len) - 2;
    276	return (char *)ev->remainder + nl + desc_len;
    277}
    278
    279static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
    280					  void *end)
    281{
    282	void *start = ev;
    283
    284	return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
    285}
    286
    287/*
    288 * Things we don't check:
    289 *  - padding for desc, name, and long/detailed desc is required to be '\0'
    290 *    bytes.
    291 *
    292 *  Return NULL if we pass end; otherwise return the address of the byte
    293 *  just following the event.
    294 */
    295static void *event_end(struct hv_24x7_event_data *ev, void *end)
    296{
    297	void *start = ev;
    298	__be16 *dl_, *ldl_;
    299	unsigned int dl, ldl;
    300	unsigned int nl = be16_to_cpu(ev->event_name_len);
    301
    302	if (nl < 2) {
    303		pr_debug("%s: name length too short: %d", __func__, nl);
    304		return NULL;
    305	}
    306
    307	if (start + nl > end) {
    308		pr_debug("%s: start=%p + nl=%u > end=%p",
    309				__func__, start, nl, end);
    310		return NULL;
    311	}
    312
    313	dl_ = (__be16 *)(ev->remainder + nl - 2);
    314	if (!IS_ALIGNED((uintptr_t)dl_, 2))
    315		pr_warn("desc len not aligned %p", dl_);
    316	dl = be16_to_cpu(*dl_);
    317	if (dl < 2) {
    318		pr_debug("%s: desc len too short: %d", __func__, dl);
    319		return NULL;
    320	}
    321
    322	if (start + nl + dl > end) {
    323		pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
    324				__func__, start, nl, dl, start + nl + dl, end);
    325		return NULL;
    326	}
    327
    328	ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
    329	if (!IS_ALIGNED((uintptr_t)ldl_, 2))
    330		pr_warn("long desc len not aligned %p", ldl_);
    331	ldl = be16_to_cpu(*ldl_);
    332	if (ldl < 2) {
    333		pr_debug("%s: long desc len too short (ldl=%u)",
    334				__func__, ldl);
    335		return NULL;
    336	}
    337
    338	if (start + nl + dl + ldl > end) {
    339		pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
    340				__func__, start, nl, dl, ldl, end);
    341		return NULL;
    342	}
    343
    344	return start + nl + dl + ldl;
    345}
    346
    347static long h_get_24x7_catalog_page_(unsigned long phys_4096,
    348				     unsigned long version, unsigned long index)
    349{
    350	pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
    351			phys_4096, version, index);
    352
    353	WARN_ON(!IS_ALIGNED(phys_4096, 4096));
    354
    355	return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
    356			phys_4096, version, index);
    357}
    358
    359static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
    360{
    361	return h_get_24x7_catalog_page_(virt_to_phys(page),
    362					version, index);
    363}
    364
    365/*
    366 * Each event we find in the catalog will have a sysfs entry. Format the
    367 * data for this sysfs entry based on the event's domain.
    368 *
    369 * Events belonging to the Chip domain can only be monitored in that domain.
    370 * i.e. the domain for these events is a fixed/known value.
    371 *
    372 * Events belonging to the Core domain can be monitored either in the physical
    373 * core or in one of the virtual CPU domains. So the domain value for these
    374 * events must be specified by the user (i.e. it is a required parameter). Format
    375 * the Core events with 'domain=?' so the perf-tool can error check required
    376 * parameters.
    377 *
    378 * NOTE: For the Core domain events, rather than making domain a required
    379 *	 parameter we could default it to PHYS_CORE and allow users to
    380 *	 override the domain to one of the VCPU domains.
    381 *
    382 *	 However, this can make the interface a little inconsistent.
    383 *
    384 *	 If we set domain=2 (PHYS_CHIP) and allow the user to override this
    385 *	 field, the user may be tempted to also modify the "offset=x" field,
    386 *	 which can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
    387 *	 HPM_INST (offset=0x20) events. With:
    388 *
    389 *		perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
    390 *
    391 *	we end up monitoring HPM_INST, while the command line has HPM_PCYC.
    392 *
    393 *	By not assigning a default value to the domain for the Core events,
    394 *	we can have simple guidelines:
    395 *
    396 *		- Specifying values for parameters with "=?" is required.
    397 *
    398 *		- Specifying (i.e. overriding) values for other parameters
    399 *		  is undefined.
    400 */
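/*
 * For instance, event_fmt() below turns a hypothetical core-domain event
 * whose combined offset (event_counter_offs + event_group_record_offs) is
 * 0x20 into "domain=?,offset=0x20,core=?,lpar=0x0", leaving domain and the
 * core index for the user to supply.
 */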
    401static char *event_fmt(struct hv_24x7_event_data *event, unsigned int domain)
    402{
    403	const char *sindex;
    404	const char *lpar;
    405	const char *domain_str;
    406	char buf[8];
    407
    408	switch (domain) {
    409	case HV_PERF_DOMAIN_PHYS_CHIP:
    410		snprintf(buf, sizeof(buf), "%d", domain);
    411		domain_str = buf;
    412		lpar = "0x0";
    413		sindex = "chip";
    414		break;
    415	case HV_PERF_DOMAIN_PHYS_CORE:
    416		domain_str = "?";
    417		lpar = "0x0";
    418		sindex = "core";
    419		break;
    420	default:
    421		domain_str = "?";
    422		lpar = "?";
    423		sindex = "vcpu";
    424	}
    425
    426	return kasprintf(GFP_KERNEL,
    427			"domain=%s,offset=0x%x,%s=?,lpar=%s",
    428			domain_str,
    429			be16_to_cpu(event->event_counter_offs) +
    430				be16_to_cpu(event->event_group_record_offs),
    431			sindex,
    432			lpar);
    433}
    434
    435/* Avoid trusting fw to NUL terminate strings */
    436static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
    437{
    438	return kasprintf(gfp, "%.*s", max_len, maybe_str);
    439}
    440
    441static ssize_t device_show_string(struct device *dev,
    442		struct device_attribute *attr, char *buf)
    443{
    444	struct dev_ext_attribute *d;
    445
    446	d = container_of(attr, struct dev_ext_attribute, attr);
    447
    448	return sprintf(buf, "%s\n", (char *)d->var);
    449}
    450
    451static ssize_t cpumask_show(struct device *dev,
    452			    struct device_attribute *attr, char *buf)
    453{
    454	return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask);
    455}
    456
    457static ssize_t sockets_show(struct device *dev,
    458			    struct device_attribute *attr, char *buf)
    459{
    460	return sprintf(buf, "%d\n", phys_sockets);
    461}
    462
    463static ssize_t chipspersocket_show(struct device *dev,
    464				   struct device_attribute *attr, char *buf)
    465{
    466	return sprintf(buf, "%d\n", phys_chipspersocket);
    467}
    468
    469static ssize_t coresperchip_show(struct device *dev,
    470				 struct device_attribute *attr, char *buf)
    471{
    472	return sprintf(buf, "%d\n", phys_coresperchip);
    473}
    474
    475static struct attribute *device_str_attr_create_(char *name, char *str)
    476{
    477	struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
    478
    479	if (!attr)
    480		return NULL;
    481
    482	sysfs_attr_init(&attr->attr.attr);
    483
    484	attr->var = str;
    485	attr->attr.attr.name = name;
    486	attr->attr.attr.mode = 0444;
    487	attr->attr.show = device_show_string;
    488
    489	return &attr->attr.attr;
    490}
    491
    492/*
    493 * Allocate and initialize strings representing event attributes.
    494 *
    495 * NOTE: The strings allocated here are never destroyed and continue to
    496 *	 exist until shutdown. This is to allow us to create as many events
    497 *	 from the catalog as possible, even if we encounter errors with some.
    498 *	 In case of changes to error paths in the future, these may need to be
    499 *	 freed by the caller.
    500 */
    501static struct attribute *device_str_attr_create(char *name, int name_max,
    502						int name_nonce,
    503						char *str, size_t str_max)
    504{
    505	char *n;
    506	char *s = memdup_to_str(str, str_max, GFP_KERNEL);
    507	struct attribute *a;
    508
    509	if (!s)
    510		return NULL;
    511
    512	if (!name_nonce)
    513		n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
    514	else
    515		n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
    516					name_nonce);
    517	if (!n)
    518		goto out_s;
    519
    520	a = device_str_attr_create_(n, s);
    521	if (!a)
    522		goto out_n;
    523
    524	return a;
    525out_n:
    526	kfree(n);
    527out_s:
    528	kfree(s);
    529	return NULL;
    530}
    531
    532static struct attribute *event_to_attr(unsigned int ix,
    533				       struct hv_24x7_event_data *event,
    534				       unsigned int domain,
    535				       int nonce)
    536{
    537	int event_name_len;
    538	char *ev_name, *a_ev_name, *val;
    539	struct attribute *attr;
    540
    541	if (!domain_is_valid(domain)) {
    542		pr_warn("catalog event %u has invalid domain %u\n",
    543				ix, domain);
    544		return NULL;
    545	}
    546
    547	val = event_fmt(event, domain);
    548	if (!val)
    549		return NULL;
    550
    551	ev_name = event_name(event, &event_name_len);
    552	if (!nonce)
    553		a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
    554				(int)event_name_len, ev_name);
    555	else
    556		a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
    557				(int)event_name_len, ev_name, nonce);
    558
    559	if (!a_ev_name)
    560		goto out_val;
    561
    562	attr = device_str_attr_create_(a_ev_name, val);
    563	if (!attr)
    564		goto out_name;
    565
    566	return attr;
    567out_name:
    568	kfree(a_ev_name);
    569out_val:
    570	kfree(val);
    571	return NULL;
    572}
    573
    574static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
    575					    int nonce)
    576{
    577	int nl, dl;
    578	char *name = event_name(event, &nl);
    579	char *desc = event_desc(event, &dl);
    580
    581	/* If there isn't a description, don't create the sysfs file */
    582	if (!dl)
    583		return NULL;
    584
    585	return device_str_attr_create(name, nl, nonce, desc, dl);
    586}
    587
    588static struct attribute *
    589event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
    590{
    591	int nl, dl;
    592	char *name = event_name(event, &nl);
    593	char *desc = event_long_desc(event, &dl);
    594
    595	/* If there isn't a description, don't create the sysfs file */
    596	if (!dl)
    597		return NULL;
    598
    599	return device_str_attr_create(name, nl, nonce, desc, dl);
    600}
    601
    602static int event_data_to_attrs(unsigned int ix, struct attribute **attrs,
    603			       struct hv_24x7_event_data *event, int nonce)
    604{
    605	*attrs = event_to_attr(ix, event, event->domain, nonce);
    606	if (!*attrs)
    607		return -1;
    608
    609	return 0;
    610}
    611
    612/* */
    613struct event_uniq {
    614	struct rb_node node;
    615	const char *name;
    616	int nl;
    617	unsigned int ct;
    618	unsigned int domain;
    619};
    620
    621static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
    622{
    623	if (s1 < s2)
    624		return 1;
    625	if (s1 > s2)
    626		return -1;
    627
    628	return memcmp(d1, d2, s1);
    629}
    630
    631static int ev_uniq_ord(const void *v1, size_t s1, unsigned int d1,
    632		       const void *v2, size_t s2, unsigned int d2)
    633{
    634	int r = memord(v1, s1, v2, s2);
    635
    636	if (r)
    637		return r;
    638	if (d1 > d2)
    639		return 1;
    640	if (d2 > d1)
    641		return -1;
    642	return 0;
    643}
    644
    645static int event_uniq_add(struct rb_root *root, const char *name, int nl,
    646			  unsigned int domain)
    647{
    648	struct rb_node **new = &(root->rb_node), *parent = NULL;
    649	struct event_uniq *data;
    650
    651	/* Figure out where to put new node */
    652	while (*new) {
    653		struct event_uniq *it;
    654		int result;
    655
    656		it = rb_entry(*new, struct event_uniq, node);
    657		result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
    658					it->domain);
    659
    660		parent = *new;
    661		if (result < 0)
    662			new = &((*new)->rb_left);
    663		else if (result > 0)
    664			new = &((*new)->rb_right);
    665		else {
    666			it->ct++;
    667			pr_info("found a duplicate event %.*s, ct=%u\n", nl,
    668						name, it->ct);
    669			return it->ct;
    670		}
    671	}
    672
    673	data = kmalloc(sizeof(*data), GFP_KERNEL);
    674	if (!data)
    675		return -ENOMEM;
    676
    677	*data = (struct event_uniq) {
    678		.name = name,
    679		.nl = nl,
    680		.ct = 0,
    681		.domain = domain,
    682	};
    683
    684	/* Add new node and rebalance tree. */
    685	rb_link_node(&data->node, parent, new);
    686	rb_insert_color(&data->node, root);
    687
    688	/* A newly added (unique) name: data->ct is 0 */
    689	return 0;
    690}
    691
    692static void event_uniq_destroy(struct rb_root *root)
    693{
    694	/*
    695	 * the strings we point to are in the giant block of memory filled by
    696	 * the catalog, and are freed separately.
    697	 */
    698	struct event_uniq *pos, *n;
    699
    700	rbtree_postorder_for_each_entry_safe(pos, n, root, node)
    701		kfree(pos);
    702}
    703
    704
    705/*
    706 * Ensure the event structure's sizes are self-consistent and don't cause us to
    707 * read outside of the event
    708 *
    709 * On success, return the event length in bytes.
    710 * Otherwise, return -1 (and print as appropriate).
    711 */
    712static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
    713					  size_t event_idx,
    714					  size_t event_data_bytes,
    715					  size_t event_entry_count,
    716					  size_t offset, void *end)
    717{
    718	ssize_t ev_len;
    719	void *ev_end, *calc_ev_end;
    720
    721	if (offset >= event_data_bytes)
    722		return -1;
    723
    724	if (event_idx >= event_entry_count) {
    725		pr_devel("catalog event data has %zu bytes of padding after last event\n",
    726				event_data_bytes - offset);
    727		return -1;
    728	}
    729
    730	if (!event_fixed_portion_is_within(event, end)) {
    731		pr_warn("event %zu fixed portion is not within range\n",
    732				event_idx);
    733		return -1;
    734	}
    735
    736	ev_len = be16_to_cpu(event->length);
    737
    738	if (ev_len % 16)
    739		pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
    740				event_idx, ev_len, event);
    741
    742	ev_end = (__u8 *)event + ev_len;
    743	if (ev_end > end) {
    744		pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
    745				event_idx, ev_len, ev_end, end,
    746				offset);
    747		return -1;
    748	}
    749
    750	calc_ev_end = event_end(event, end);
    751	if (!calc_ev_end) {
    752		pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
    753			event_idx, event_data_bytes, event, end,
    754			offset);
    755		return -1;
    756	}
    757
    758	if (calc_ev_end > ev_end) {
    759		pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
    760			event_idx, event, ev_end, offset, calc_ev_end);
    761		return -1;
    762	}
    763
    764	return ev_len;
    765}
    766
    767/*
    768 * Return true in case of invalid or dummy events with names like RESERVED*
    769 */
    770static bool ignore_event(const char *name)
    771{
    772	return strncmp(name, "RESERVED", 8) == 0;
    773}
    774
    775#define MAX_4K (SIZE_MAX / 4096)
    776
    777static int create_events_from_catalog(struct attribute ***events_,
    778				      struct attribute ***event_descs_,
    779				      struct attribute ***event_long_descs_)
    780{
    781	long hret;
    782	size_t catalog_len, catalog_page_len, event_entry_count,
    783	       event_data_len, event_data_offs,
    784	       event_data_bytes, junk_events, event_idx, event_attr_ct, i,
    785	       attr_max, event_idx_last, desc_ct, long_desc_ct;
    786	ssize_t ct, ev_len;
    787	uint64_t catalog_version_num;
    788	struct attribute **events, **event_descs, **event_long_descs;
    789	struct hv_24x7_catalog_page_0 *page_0 =
    790		kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
    791	void *page = page_0;
    792	void *event_data, *end;
    793	struct hv_24x7_event_data *event;
    794	struct rb_root ev_uniq = RB_ROOT;
    795	int ret = 0;
    796
    797	if (!page) {
    798		ret = -ENOMEM;
    799		goto e_out;
    800	}
    801
    802	hret = h_get_24x7_catalog_page(page, 0, 0);
    803	if (hret) {
    804		ret = -EIO;
    805		goto e_free;
    806	}
    807
    808	catalog_version_num = be64_to_cpu(page_0->version);
    809	catalog_page_len = be32_to_cpu(page_0->length);
    810
    811	if (MAX_4K < catalog_page_len) {
    812		pr_err("invalid page count: %zu\n", catalog_page_len);
    813		ret = -EIO;
    814		goto e_free;
    815	}
    816
    817	catalog_len = catalog_page_len * 4096;
    818
    819	event_entry_count = be16_to_cpu(page_0->event_entry_count);
    820	event_data_offs   = be16_to_cpu(page_0->event_data_offs);
    821	event_data_len    = be16_to_cpu(page_0->event_data_len);
    822
    823	pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n",
    824			catalog_version_num, catalog_len,
    825			event_entry_count, event_data_offs, event_data_len);
    826
    827	if ((MAX_4K < event_data_len)
    828			|| (MAX_4K < event_data_offs)
    829			|| (MAX_4K - event_data_offs < event_data_len)) {
    830		pr_err("invalid event data offs %zu and/or len %zu\n",
    831				event_data_offs, event_data_len);
    832		ret = -EIO;
    833		goto e_free;
    834	}
    835
    836	if ((event_data_offs + event_data_len) > catalog_page_len) {
    837		pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
    838				event_data_offs,
    839				event_data_offs + event_data_len,
    840				catalog_page_len);
    841		ret = -EIO;
    842		goto e_free;
    843	}
    844
    845	if (SIZE_MAX - 1 < event_entry_count) {
    846		pr_err("event_entry_count %zu is invalid\n", event_entry_count);
    847		ret = -EIO;
    848		goto e_free;
    849	}
    850
    851	event_data_bytes = event_data_len * 4096;
    852
    853	/*
    854	 * Event data can span several pages, and events can cross page
    855	 * boundaries. Use vmalloc to make this easier.
    856	 */
    857	event_data = vmalloc(event_data_bytes);
    858	if (!event_data) {
    859		pr_err("could not allocate event data\n");
    860		ret = -ENOMEM;
    861		goto e_free;
    862	}
    863
    864	end = event_data + event_data_bytes;
    865
    866	/*
    867	 * using vmalloc_to_phys() like this only works if PAGE_SIZE is
    868	 * divisible by 4096
    869	 */
    870	BUILD_BUG_ON(PAGE_SIZE % 4096);
    871
    872	for (i = 0; i < event_data_len; i++) {
    873		hret = h_get_24x7_catalog_page_(
    874				vmalloc_to_phys(event_data + i * 4096),
    875				catalog_version_num,
    876				i + event_data_offs);
    877		if (hret) {
    878			pr_err("Failed to get event data in page %zu: rc=%ld\n",
    879			       i + event_data_offs, hret);
    880			ret = -EIO;
    881			goto e_event_data;
    882		}
    883	}
    884
    885	/*
    886	 * scan the catalog to determine the number of attributes we need, and
    887	 * verify it at the same time.
    888	 */
    889	for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
    890	     ;
    891	     event_idx++, event = (void *)event + ev_len) {
    892		size_t offset = (void *)event - (void *)event_data;
    893		char *name;
    894		int nl;
    895
    896		ev_len = catalog_event_len_validate(event, event_idx,
    897						    event_data_bytes,
    898						    event_entry_count,
    899						    offset, end);
    900		if (ev_len < 0)
    901			break;
    902
    903		name = event_name(event, &nl);
    904
    905		if (ignore_event(name)) {
    906			junk_events++;
    907			continue;
    908		}
    909		if (event->event_group_record_len == 0) {
    910			pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
    911					event_idx, nl, name);
    912			junk_events++;
    913			continue;
    914		}
    915
    916		if (!catalog_entry_domain_is_valid(event->domain)) {
    917			pr_info("event %zu (%.*s) has invalid domain %d\n",
    918					event_idx, nl, name, event->domain);
    919			junk_events++;
    920			continue;
    921		}
    922
    923		attr_max++;
    924	}
    925
    926	event_idx_last = event_idx;
    927	if (event_idx_last != event_entry_count)
    928		pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
    929				event_idx_last, event_entry_count, junk_events);
    930
    931	events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
    932	if (!events) {
    933		ret = -ENOMEM;
    934		goto e_event_data;
    935	}
    936
    937	event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
    938				GFP_KERNEL);
    939	if (!event_descs) {
    940		ret = -ENOMEM;
    941		goto e_event_attrs;
    942	}
    943
    944	event_long_descs = kmalloc_array(event_idx + 1,
    945			sizeof(*event_long_descs), GFP_KERNEL);
    946	if (!event_long_descs) {
    947		ret = -ENOMEM;
    948		goto e_event_descs;
    949	}
    950
    951	/* Iterate over the catalog filling in the attribute vector */
    952	for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
    953				event = event_data, event_idx = 0;
    954			event_idx < event_idx_last;
    955			event_idx++, ev_len = be16_to_cpu(event->length),
    956				event = (void *)event + ev_len) {
    957		char *name;
    958		int nl;
    959		int nonce;
    960		/*
    961		 * These are the only "bad" events that are intermixed and that
    962		 * we can ignore without issue. Make sure to skip them here.
    963		 */
    964		if (event->event_group_record_len == 0)
    965			continue;
    966		if (!catalog_entry_domain_is_valid(event->domain))
    967			continue;
    968
    969		name  = event_name(event, &nl);
    970		if (ignore_event(name))
    971			continue;
    972
    973		nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
    974		ct    = event_data_to_attrs(event_idx, events + event_attr_ct,
    975					    event, nonce);
    976		if (ct < 0) {
    977			pr_warn("event %zu (%.*s) creation failure, skipping\n",
    978				event_idx, nl, name);
    979			junk_events++;
    980		} else {
    981			event_attr_ct++;
    982			event_descs[desc_ct] = event_to_desc_attr(event, nonce);
    983			if (event_descs[desc_ct])
    984				desc_ct++;
    985			event_long_descs[long_desc_ct] =
    986					event_to_long_desc_attr(event, nonce);
    987			if (event_long_descs[long_desc_ct])
    988				long_desc_ct++;
    989		}
    990	}
    991
    992	pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
    993			event_idx, event_attr_ct, junk_events, desc_ct);
    994
    995	events[event_attr_ct] = NULL;
    996	event_descs[desc_ct] = NULL;
    997	event_long_descs[long_desc_ct] = NULL;
    998
    999	event_uniq_destroy(&ev_uniq);
   1000	vfree(event_data);
   1001	kmem_cache_free(hv_page_cache, page);
   1002
   1003	*events_ = events;
   1004	*event_descs_ = event_descs;
   1005	*event_long_descs_ = event_long_descs;
   1006	return 0;
   1007
   1008e_event_descs:
   1009	kfree(event_descs);
   1010e_event_attrs:
   1011	kfree(events);
   1012e_event_data:
   1013	vfree(event_data);
   1014e_free:
   1015	kmem_cache_free(hv_page_cache, page);
   1016e_out:
   1017	*events_ = NULL;
   1018	*event_descs_ = NULL;
   1019	*event_long_descs_ = NULL;
   1020	return ret;
   1021}
   1022
   1023static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
   1024			    struct bin_attribute *bin_attr, char *buf,
   1025			    loff_t offset, size_t count)
   1026{
   1027	long hret;
   1028	ssize_t ret = 0;
   1029	size_t catalog_len = 0, catalog_page_len = 0;
   1030	loff_t page_offset = 0;
   1031	loff_t offset_in_page;
   1032	size_t copy_len;
   1033	uint64_t catalog_version_num = 0;
   1034	void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
   1035	struct hv_24x7_catalog_page_0 *page_0 = page;
   1036
   1037	if (!page)
   1038		return -ENOMEM;
   1039
   1040	hret = h_get_24x7_catalog_page(page, 0, 0);
   1041	if (hret) {
   1042		ret = -EIO;
   1043		goto e_free;
   1044	}
   1045
   1046	catalog_version_num = be64_to_cpu(page_0->version);
   1047	catalog_page_len = be32_to_cpu(page_0->length);
   1048	catalog_len = catalog_page_len * 4096;
   1049
   1050	page_offset = offset / 4096;
   1051	offset_in_page = offset % 4096;
   1052
   1053	if (page_offset >= catalog_page_len)
   1054		goto e_free;
   1055
   1056	if (page_offset != 0) {
   1057		hret = h_get_24x7_catalog_page(page, catalog_version_num,
   1058					       page_offset);
   1059		if (hret) {
   1060			ret = -EIO;
   1061			goto e_free;
   1062		}
   1063	}
   1064
   1065	copy_len = 4096 - offset_in_page;
   1066	if (copy_len > count)
   1067		copy_len = count;
   1068
   1069	memcpy(buf, page+offset_in_page, copy_len);
   1070	ret = copy_len;
   1071
   1072e_free:
   1073	if (hret)
   1074		pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
   1075		       " rc=%ld\n",
   1076		       catalog_version_num, page_offset, hret);
   1077	kmem_cache_free(hv_page_cache, page);
   1078
   1079	pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
   1080			"catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
   1081			count, catalog_len, catalog_page_len, ret);
   1082
   1083	return ret;
   1084}
   1085
   1086static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
   1087			    char *page)
   1088{
   1089	int d, n, count = 0;
   1090	const char *str;
   1091
   1092	for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
   1093		str = domain_name(d);
   1094		if (!str)
   1095			continue;
   1096
   1097		n = sprintf(page, "%d: %s\n", d, str);
   1098		if (n < 0)
   1099			break;
   1100
   1101		count += n;
   1102		page += n;
   1103	}
   1104	return count;
   1105}
   1106
   1107#define PAGE_0_ATTR(_name, _fmt, _expr)				\
   1108static ssize_t _name##_show(struct device *dev,			\
   1109			    struct device_attribute *dev_attr,	\
   1110			    char *buf)				\
   1111{								\
   1112	long hret;						\
   1113	ssize_t ret = 0;					\
   1114	void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);	\
   1115	struct hv_24x7_catalog_page_0 *page_0 = page;		\
   1116	if (!page)						\
   1117		return -ENOMEM;					\
   1118	hret = h_get_24x7_catalog_page(page, 0, 0);		\
   1119	if (hret) {						\
   1120		ret = -EIO;					\
   1121		goto e_free;					\
   1122	}							\
   1123	ret = sprintf(buf, _fmt, _expr);			\
   1124e_free:								\
   1125	kmem_cache_free(hv_page_cache, page);			\
   1126	return ret;						\
   1127}								\
   1128static DEVICE_ATTR_RO(_name)
   1129
   1130PAGE_0_ATTR(catalog_version, "%lld\n",
   1131		(unsigned long long)be64_to_cpu(page_0->version));
   1132PAGE_0_ATTR(catalog_len, "%lld\n",
   1133		(unsigned long long)be32_to_cpu(page_0->length) * 4096);
   1134static BIN_ATTR_RO(catalog, 0/* real length varies */);
   1135static DEVICE_ATTR_RO(domains);
   1136static DEVICE_ATTR_RO(sockets);
   1137static DEVICE_ATTR_RO(chipspersocket);
   1138static DEVICE_ATTR_RO(coresperchip);
   1139static DEVICE_ATTR_RO(cpumask);
   1140
   1141static struct bin_attribute *if_bin_attrs[] = {
   1142	&bin_attr_catalog,
   1143	NULL,
   1144};
   1145
   1146static struct attribute *cpumask_attrs[] = {
   1147	&dev_attr_cpumask.attr,
   1148	NULL,
   1149};
   1150
   1151static const struct attribute_group cpumask_attr_group = {
   1152	.attrs = cpumask_attrs,
   1153};
   1154
   1155static struct attribute *if_attrs[] = {
   1156	&dev_attr_catalog_len.attr,
   1157	&dev_attr_catalog_version.attr,
   1158	&dev_attr_domains.attr,
   1159	&dev_attr_sockets.attr,
   1160	&dev_attr_chipspersocket.attr,
   1161	&dev_attr_coresperchip.attr,
   1162	NULL,
   1163};
   1164
   1165static const struct attribute_group if_group = {
   1166	.name = "interface",
   1167	.bin_attrs = if_bin_attrs,
   1168	.attrs = if_attrs,
   1169};
   1170
   1171static const struct attribute_group *attr_groups[] = {
   1172	&format_group,
   1173	&event_group,
   1174	&event_desc_group,
   1175	&event_long_desc_group,
   1176	&if_group,
   1177	&cpumask_attr_group,
   1178	NULL,
   1179};
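/*
 * These groups surface under the PMU's sysfs directory, typically
 * /sys/bus/event_source/devices/hv_24x7/ (exact paths depend on the perf
 * core), e.g. .../format/domain, .../events/, .../interface/catalog and
 * .../interface/sockets.
 */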
   1180
   1181/*
   1182 * Start the process for a new H_GET_24x7_DATA hcall.
   1183 */
   1184static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
   1185			      struct hv_24x7_data_result_buffer *result_buffer)
   1186{
   1187
   1188	memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE);
   1189	memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE);
   1190
   1191	request_buffer->interface_version = interface_version;
   1192	/* memset above set request_buffer->num_requests to 0 */
   1193}
   1194
   1195/*
   1196 * Commit (i.e. perform) the H_GET_24x7_DATA hcall using the data collected
   1197 * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
   1198 */
   1199static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
   1200			     struct hv_24x7_data_result_buffer *result_buffer)
   1201{
   1202	long ret;
   1203
   1204	/*
   1205	 * NOTE: Due to variable number of array elements in request and
   1206	 *	 result buffer(s), sizeof() is not reliable. Use the actual
   1207	 *	 allocated buffer size, H24x7_DATA_BUFFER_SIZE.
   1208	 */
   1209	ret = plpar_hcall_norets(H_GET_24X7_DATA,
   1210			virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
   1211			virt_to_phys(result_buffer),  H24x7_DATA_BUFFER_SIZE);
   1212
   1213	if (ret) {
   1214		struct hv_24x7_request *req;
   1215
   1216		req = request_buffer->requests;
   1217		pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
   1218				      req->performance_domain, req->data_offset,
   1219				      req->starting_ix, req->starting_lpar_ix,
   1220				      ret, ret, result_buffer->detailed_rc,
   1221				      result_buffer->failing_request_ix);
   1222		return -EIO;
   1223	}
   1224
   1225	return 0;
   1226}
   1227
   1228/*
   1229 * Add the given @event to the next slot in the 24x7 request_buffer.
   1230 *
   1231 * Note that the H_GET_24X7_DATA hcall allows reading several counters'
   1232 * values in a single HCALL. We expect the caller to add events to the
   1233 * request buffer one by one, make the HCALL and process the results.
   1234 */
   1235static int add_event_to_24x7_request(struct perf_event *event,
   1236				struct hv_24x7_request_buffer *request_buffer)
   1237{
   1238	u16 idx;
   1239	int i;
   1240	size_t req_size;
   1241	struct hv_24x7_request *req;
   1242
   1243	if (request_buffer->num_requests >=
   1244	    max_num_requests(request_buffer->interface_version)) {
   1245		pr_devel("Too many requests for 24x7 HCALL %d\n",
   1246				request_buffer->num_requests);
   1247		return -EINVAL;
   1248	}
   1249
   1250	switch (event_get_domain(event)) {
   1251	case HV_PERF_DOMAIN_PHYS_CHIP:
   1252		idx = event_get_chip(event);
   1253		break;
   1254	case HV_PERF_DOMAIN_PHYS_CORE:
   1255		idx = event_get_core(event);
   1256		break;
   1257	default:
   1258		idx = event_get_vcpu(event);
   1259	}
   1260
   1261	req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
   1262
   1263	i = request_buffer->num_requests++;
   1264	req = (void *) request_buffer->requests + i * req_size;
   1265
   1266	req->performance_domain = event_get_domain(event);
   1267	req->data_size = cpu_to_be16(8);
   1268	req->data_offset = cpu_to_be32(event_get_offset(event));
   1269	req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
   1270	req->max_num_lpars = cpu_to_be16(1);
   1271	req->starting_ix = cpu_to_be16(idx);
   1272	req->max_ix = cpu_to_be16(1);
   1273
   1274	if (request_buffer->interface_version > 1) {
   1275		if (domain_needs_aggregation(req->performance_domain))
   1276			req->max_num_thread_groups = -1;
   1277		else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) {
   1278			req->starting_thread_group_ix = idx % 2;
   1279			req->max_num_thread_groups = 1;
   1280		}
   1281	}
   1282
   1283	return 0;
   1284}
   1285
   1286/**
   1287 * get_count_from_result - get event count from all result elements in result
   1288 *
   1289 * If the event corresponding to this result needs aggregation of the result
   1290 * element values, then this function does that.
   1291 *
   1292 * @event:	Event associated with @res.
   1293 * @resb:	Result buffer containing @res.
   1294 * @res:	Result to work on.
   1295 * @countp:	Output variable containing the event count.
   1296 * @next:	Optional output variable pointing to the next result in @resb.
   1297 */
   1298static int get_count_from_result(struct perf_event *event,
   1299				 struct hv_24x7_data_result_buffer *resb,
   1300				 struct hv_24x7_result *res, u64 *countp,
   1301				 struct hv_24x7_result **next)
   1302{
   1303	u16 num_elements = be16_to_cpu(res->num_elements_returned);
   1304	u16 data_size = be16_to_cpu(res->result_element_data_size);
   1305	unsigned int data_offset;
   1306	void *element_data;
   1307	int i;
   1308	u64 count;
   1309
   1310	/*
   1311	 * We can bail out early if the result is empty.
   1312	 */
   1313	if (!num_elements) {
   1314		pr_debug("Result of request %hhu is empty, nothing to do\n",
   1315			 res->result_ix);
   1316
   1317		if (next)
   1318			*next = (struct hv_24x7_result *) res->elements;
   1319
   1320		return -ENODATA;
   1321	}
   1322
   1323	/*
   1324	 * Since we always specify 1 as the maximum for the smallest resource
   1325	 * we're requesting, there should be only one element per result,
   1326	 * except when an event needs aggregation, in which case there are more.
   1327	 */
   1328	if (num_elements != 1 &&
   1329	    !domain_needs_aggregation(event_get_domain(event))) {
   1330		pr_err("Error: result of request %hhu has %hu elements\n",
   1331		       res->result_ix, num_elements);
   1332
   1333		return -EIO;
   1334	}
   1335
   1336	if (data_size != sizeof(u64)) {
   1337		pr_debug("Error: result of request %hhu has data of %hu bytes\n",
   1338			 res->result_ix, data_size);
   1339
   1340		return -ENOTSUPP;
   1341	}
   1342
   1343	if (resb->interface_version == 1)
   1344		data_offset = offsetof(struct hv_24x7_result_element_v1,
   1345				       element_data);
   1346	else
   1347		data_offset = offsetof(struct hv_24x7_result_element_v2,
   1348				       element_data);
   1349
   1350	/* Go through the result elements in the result. */
   1351	for (i = count = 0, element_data = res->elements + data_offset;
   1352	     i < num_elements;
   1353	     i++, element_data += data_size + data_offset)
   1354		count += be64_to_cpu(*((u64 *) element_data));
   1355
   1356	*countp = count;
   1357
   1358	/* The next result is after the last result element. */
   1359	if (next)
   1360		*next = element_data - data_offset;
   1361
   1362	return 0;
   1363}
   1364
   1365static int single_24x7_request(struct perf_event *event, u64 *count)
   1366{
   1367	int ret;
   1368	struct hv_24x7_request_buffer *request_buffer;
   1369	struct hv_24x7_data_result_buffer *result_buffer;
   1370
   1371	BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
   1372	BUILD_BUG_ON(sizeof(*result_buffer) > 4096);
   1373
   1374	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
   1375	result_buffer = (void *)get_cpu_var(hv_24x7_resb);
   1376
   1377	init_24x7_request(request_buffer, result_buffer);
   1378
   1379	ret = add_event_to_24x7_request(event, request_buffer);
   1380	if (ret)
   1381		goto out;
   1382
   1383	ret = make_24x7_request(request_buffer, result_buffer);
   1384	if (ret)
   1385		goto out;
   1386
   1387	/* process result from hcall */
   1388	ret = get_count_from_result(event, result_buffer,
   1389				    result_buffer->results, count, NULL);
   1390
   1391out:
   1392	put_cpu_var(hv_24x7_reqb);
   1393	put_cpu_var(hv_24x7_resb);
   1394	return ret;
   1395}
   1396
   1397
   1398static int h_24x7_event_init(struct perf_event *event)
   1399{
   1400	struct hv_perf_caps caps;
   1401	unsigned int domain;
   1402	unsigned long hret;
   1403	u64 ct;
   1404
   1405	/* Not our event */
   1406	if (event->attr.type != event->pmu->type)
   1407		return -ENOENT;
   1408
   1409	/* Unused areas must be 0 */
   1410	if (event_get_reserved1(event) ||
   1411	    event_get_reserved2(event) ||
   1412	    event_get_reserved3(event)) {
   1413		pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
   1414				event->attr.config,
   1415				event_get_reserved1(event),
   1416				event->attr.config1,
   1417				event_get_reserved2(event),
   1418				event->attr.config2,
   1419				event_get_reserved3(event));
   1420		return -EINVAL;
   1421	}
   1422
   1423	/* no branch sampling */
   1424	if (has_branch_stack(event))
   1425		return -EOPNOTSUPP;
   1426
   1427	/* offset must be 8 byte aligned */
   1428	if (event_get_offset(event) % 8) {
   1429		pr_devel("bad alignment\n");
   1430		return -EINVAL;
   1431	}
   1432
   1433	domain = event_get_domain(event);
   1434	if (domain >= HV_PERF_DOMAIN_MAX) {
   1435		pr_devel("invalid domain %d\n", domain);
   1436		return -EINVAL;
   1437	}
   1438
   1439	hret = hv_perf_caps_get(&caps);
   1440	if (hret) {
   1441		pr_devel("could not get capabilities: rc=%ld\n", hret);
   1442		return -EIO;
   1443	}
   1444
   1445	/* Physical domains & other lpars require extra capabilities */
   1446	if (!caps.collect_privileged && (is_physical_domain(domain) ||
   1447		(event_get_lpar(event) != event_get_lpar_max()))) {
   1448		pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
   1449				is_physical_domain(domain),
   1450				event_get_lpar(event));
   1451		return -EACCES;
   1452	}
   1453
   1454	/* Get the initial value of the counter for this event */
   1455	if (single_24x7_request(event, &ct)) {
   1456		pr_devel("test hcall failed\n");
   1457		return -EIO;
   1458	}
   1459	(void)local64_xchg(&event->hw.prev_count, ct);
   1460
   1461	return 0;
   1462}
   1463
   1464static u64 h_24x7_get_value(struct perf_event *event)
   1465{
   1466	u64 ct;
   1467
   1468	if (single_24x7_request(event, &ct))
   1469		/* We checked this in event init, shouldn't fail here... */
   1470		return 0;
   1471
   1472	return ct;
   1473}
   1474
   1475static void update_event_count(struct perf_event *event, u64 now)
   1476{
   1477	s64 prev;
   1478
   1479	prev = local64_xchg(&event->hw.prev_count, now);
   1480	local64_add(now - prev, &event->count);
   1481}
   1482
   1483static void h_24x7_event_read(struct perf_event *event)
   1484{
   1485	u64 now;
   1486	struct hv_24x7_request_buffer *request_buffer;
   1487	struct hv_24x7_hw *h24x7hw;
   1488	int txn_flags;
   1489
   1490	txn_flags = __this_cpu_read(hv_24x7_txn_flags);
   1491
   1492	/*
   1493	 * If in a READ transaction, add this counter to the list of
   1494	 * counters to read during the next HCALL (i.e. commit_txn()).
   1495	 * If not in a READ transaction, go ahead and make the HCALL
   1496	 * to read this counter by itself.
   1497	 */
   1498
   1499	if (txn_flags & PERF_PMU_TXN_READ) {
   1500		int i;
   1501		int ret;
   1502
   1503		if (__this_cpu_read(hv_24x7_txn_err))
   1504			return;
   1505
   1506		request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
   1507
   1508		ret = add_event_to_24x7_request(event, request_buffer);
   1509		if (ret) {
   1510			__this_cpu_write(hv_24x7_txn_err, ret);
   1511		} else {
   1512			/*
   1513			 * Associate the event with the HCALL request index,
   1514			 * so ->commit_txn() can quickly find/update count.
   1515			 */
   1516			i = request_buffer->num_requests - 1;
   1517
   1518			h24x7hw = &get_cpu_var(hv_24x7_hw);
   1519			h24x7hw->events[i] = event;
   1520			put_cpu_var(h24x7hw);
   1521		}
   1522
   1523		put_cpu_var(hv_24x7_reqb);
   1524	} else {
   1525		now = h_24x7_get_value(event);
   1526		update_event_count(event, now);
   1527	}
   1528}
   1529
   1530static void h_24x7_event_start(struct perf_event *event, int flags)
   1531{
   1532	if (flags & PERF_EF_RELOAD)
   1533		local64_set(&event->hw.prev_count, h_24x7_get_value(event));
   1534}
   1535
   1536static void h_24x7_event_stop(struct perf_event *event, int flags)
   1537{
   1538	h_24x7_event_read(event);
   1539}
   1540
   1541static int h_24x7_event_add(struct perf_event *event, int flags)
   1542{
   1543	if (flags & PERF_EF_START)
   1544		h_24x7_event_start(event, flags);
   1545
   1546	return 0;
   1547}
   1548
   1549/*
   1550 * 24x7 counters only support READ transactions. They are
   1551 * always counting and don't need/support ADD transactions.
   1552 * Cache the flags, but otherwise ignore transactions that
   1553 * are not PERF_PMU_TXN_READ.
   1554 */
   1555static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
   1556{
   1557	struct hv_24x7_request_buffer *request_buffer;
   1558	struct hv_24x7_data_result_buffer *result_buffer;
   1559
   1560	/* We should not be called if we are already in a txn */
   1561	WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
   1562
   1563	__this_cpu_write(hv_24x7_txn_flags, flags);
   1564	if (flags & ~PERF_PMU_TXN_READ)
   1565		return;
   1566
   1567	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
   1568	result_buffer = (void *)get_cpu_var(hv_24x7_resb);
   1569
   1570	init_24x7_request(request_buffer, result_buffer);
   1571
   1572	put_cpu_var(hv_24x7_resb);
   1573	put_cpu_var(hv_24x7_reqb);
   1574}
   1575
   1576/*
   1577 * Clean up transaction state.
   1578 *
   1579 * NOTE: Ignore state of request and result buffers for now.
   1580 *	 We will initialize them during the next read/txn.
   1581 */
   1582static void reset_txn(void)
   1583{
   1584	__this_cpu_write(hv_24x7_txn_flags, 0);
   1585	__this_cpu_write(hv_24x7_txn_err, 0);
   1586}
   1587
   1588/*
   1589 * 24x7 counters only support READ transactions. They are always counting
   1590 * and don't need/support ADD transactions. Clear ->txn_flags but otherwise
   1591 * ignore transactions that are not of type PERF_PMU_TXN_READ.
   1592 *
   1593 * For READ transactions, submit all pending 24x7 requests (i.e. requests
   1594 * that were queued by h_24x7_event_read()), to the hypervisor and update
   1595 * the event counts.
   1596 */
   1597static int h_24x7_event_commit_txn(struct pmu *pmu)
   1598{
   1599	struct hv_24x7_request_buffer *request_buffer;
   1600	struct hv_24x7_data_result_buffer *result_buffer;
   1601	struct hv_24x7_result *res, *next_res;
   1602	u64 count;
   1603	int i, ret, txn_flags;
   1604	struct hv_24x7_hw *h24x7hw;
   1605
   1606	txn_flags = __this_cpu_read(hv_24x7_txn_flags);
   1607	WARN_ON_ONCE(!txn_flags);
   1608
   1609	ret = 0;
   1610	if (txn_flags & ~PERF_PMU_TXN_READ)
   1611		goto out;
   1612
   1613	ret = __this_cpu_read(hv_24x7_txn_err);
   1614	if (ret)
   1615		goto out;
   1616
   1617	request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
   1618	result_buffer = (void *)get_cpu_var(hv_24x7_resb);
   1619
   1620	ret = make_24x7_request(request_buffer, result_buffer);
   1621	if (ret)
   1622		goto put_reqb;
   1623
   1624	h24x7hw = &get_cpu_var(hv_24x7_hw);
   1625
   1626	/* Go through results in the result buffer to update event counts. */
   1627	for (i = 0, res = result_buffer->results;
   1628	     i < result_buffer->num_results; i++, res = next_res) {
   1629		struct perf_event *event = h24x7hw->events[res->result_ix];
   1630
   1631		ret = get_count_from_result(event, result_buffer, res, &count,
   1632					    &next_res);
   1633		if (ret)
   1634			break;
   1635
   1636		update_event_count(event, count);
   1637	}
   1638
   1639	put_cpu_var(hv_24x7_hw);
   1640
   1641put_reqb:
   1642	put_cpu_var(hv_24x7_resb);
   1643	put_cpu_var(hv_24x7_reqb);
   1644out:
   1645	reset_txn();
   1646	return ret;
   1647}
   1648
   1649/*
   1650 * 24x7 counters only support READ transactions. They are always counting
   1651 * and don't need/support ADD transactions. However, regardless of the type
   1652 * of transaction, all we need to do is cleanup, so we don't have to check
   1653 * the type of transaction.
   1654 */
   1655static void h_24x7_event_cancel_txn(struct pmu *pmu)
   1656{
   1657	WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
   1658	reset_txn();
   1659}
   1660
   1661static struct pmu h_24x7_pmu = {
   1662	.task_ctx_nr = perf_invalid_context,
   1663
   1664	.name = "hv_24x7",
   1665	.attr_groups = attr_groups,
   1666	.event_init  = h_24x7_event_init,
   1667	.add         = h_24x7_event_add,
   1668	.del         = h_24x7_event_stop,
   1669	.start       = h_24x7_event_start,
   1670	.stop        = h_24x7_event_stop,
   1671	.read        = h_24x7_event_read,
   1672	.start_txn   = h_24x7_event_start_txn,
   1673	.commit_txn  = h_24x7_event_commit_txn,
   1674	.cancel_txn  = h_24x7_event_cancel_txn,
   1675	.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
   1676};
   1677
   1678static int ppc_hv_24x7_cpu_online(unsigned int cpu)
   1679{
   1680	if (cpumask_empty(&hv_24x7_cpumask))
   1681		cpumask_set_cpu(cpu, &hv_24x7_cpumask);
   1682
   1683	return 0;
   1684}
   1685
   1686static int ppc_hv_24x7_cpu_offline(unsigned int cpu)
   1687{
   1688	int target;
   1689
   1690	/* Check if exiting cpu is used for collecting 24x7 events */
   1691	if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask))
   1692		return 0;
   1693
   1694	/* Find a new cpu to collect 24x7 events */
   1695	target = cpumask_last(cpu_active_mask);
   1696
   1697	if (target < 0 || target >= nr_cpu_ids) {
   1698		pr_err("hv_24x7: CPU hotplug init failed\n");
   1699		return -1;
   1700	}
   1701
   1702	/* Migrate 24x7 events to the new target */
   1703	cpumask_set_cpu(target, &hv_24x7_cpumask);
   1704	perf_pmu_migrate_context(&h_24x7_pmu, cpu, target);
   1705
   1706	return 0;
   1707}
   1708
   1709static int hv_24x7_cpu_hotplug_init(void)
   1710{
   1711	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
   1712			  "perf/powerpc/hv_24x7:online",
   1713			  ppc_hv_24x7_cpu_online,
   1714			  ppc_hv_24x7_cpu_offline);
   1715}
   1716
   1717static int hv_24x7_init(void)
   1718{
   1719	int r;
   1720	unsigned long hret;
   1721	struct hv_perf_caps caps;
   1722
   1723	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
   1724		pr_debug("not a virtualized system, not enabling\n");
   1725		return -ENODEV;
   1726	} else if (!cur_cpu_spec->oprofile_cpu_type)
   1727		return -ENODEV;
   1728
   1729	/* POWER8 only supports v1, while POWER9 only supports v2. */
   1730	if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
   1731		interface_version = 1;
   1732	else {
   1733		interface_version = 2;
   1734
   1735		/* SMT8 in POWER9 needs to aggregate result elements. */
   1736		if (threads_per_core == 8)
   1737			aggregate_result_elements = true;
   1738	}
   1739
   1740	hret = hv_perf_caps_get(&caps);
   1741	if (hret) {
   1742		pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
   1743				hret);
   1744		return -ENODEV;
   1745	}
   1746
   1747	hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
   1748	if (!hv_page_cache)
   1749		return -ENOMEM;
   1750
   1751	/* sampling not supported */
   1752	h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
   1753
   1754	r = create_events_from_catalog(&event_group.attrs,
   1755				   &event_desc_group.attrs,
   1756				   &event_long_desc_group.attrs);
   1757
   1758	if (r)
   1759		return r;
   1760
   1761	/* init cpuhotplug */
   1762	r = hv_24x7_cpu_hotplug_init();
   1763	if (r)
   1764		return r;
   1765
   1766	r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
   1767	if (r)
   1768		return r;
   1769
   1770	read_24x7_sys_info();
   1771
   1772	return 0;
   1773}
   1774
   1775device_initcall(hv_24x7_init);