cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

imc-pmu.c (47630B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * In-Memory Collection (IMC) Performance Monitor counter support.
      4 *
      5 * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
      6 *           (C) 2017 Anju T Sudhakar, IBM Corporation.
      7 *           (C) 2017 Hemant K Shaw, IBM Corporation.
      8 */
      9#include <linux/of.h>
     10#include <linux/perf_event.h>
     11#include <linux/slab.h>
     12#include <asm/opal.h>
     13#include <asm/imc-pmu.h>
     14#include <asm/cputhreads.h>
     15#include <asm/smp.h>
     16#include <linux/string.h>
     17
     18/* Nest IMC data structures and variables */
     19
     20/*
     21 * Used to avoid races in counting the nest-pmu units during hotplug
     22 * register and unregister
     23 */
     24static DEFINE_MUTEX(nest_init_lock);
     25static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc);
     26static struct imc_pmu **per_nest_pmu_arr;
     27static cpumask_t nest_imc_cpumask;
     28static struct imc_pmu_ref *nest_imc_refc;
     29static int nest_pmus;
     30
     31/* Core IMC data structures and variables */
     32
     33static cpumask_t core_imc_cpumask;
     34static struct imc_pmu_ref *core_imc_refc;
     35static struct imc_pmu *core_imc_pmu;
     36
     37/* Thread IMC data structures and variables */
     38
     39static DEFINE_PER_CPU(u64 *, thread_imc_mem);
     40static struct imc_pmu *thread_imc_pmu;
     41static int thread_imc_mem_size;
     42
     43/* Trace IMC data structures */
     44static DEFINE_PER_CPU(u64 *, trace_imc_mem);
     45static struct imc_pmu_ref *trace_imc_refc;
     46static int trace_imc_mem_size;
     47
     48/*
      49 * Global data structure used to avoid races between thread, core
      50 * and trace-imc; only one of these IMC domains can be active at a time.
     51 */
     52static struct imc_pmu_ref imc_global_refc = {
     53	.lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
     54	.id = 0,
     55	.refc = 0,
     56};
     57
     58static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
     59{
     60	return container_of(event->pmu, struct imc_pmu, pmu);
     61}
     62
     63PMU_FORMAT_ATTR(event, "config:0-61");
     64PMU_FORMAT_ATTR(offset, "config:0-31");
     65PMU_FORMAT_ATTR(rvalue, "config:32");
     66PMU_FORMAT_ATTR(mode, "config:33-40");
     67static struct attribute *imc_format_attrs[] = {
     68	&format_attr_event.attr,
     69	&format_attr_offset.attr,
     70	&format_attr_rvalue.attr,
     71	&format_attr_mode.attr,
     72	NULL,
     73};
     74
     75static const struct attribute_group imc_format_group = {
     76	.name = "format",
     77	.attrs = imc_format_attrs,
     78};
     79
     80/* Format attribute for imc trace-mode */
     81PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
     82PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
     83PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
     84PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
     85static struct attribute *trace_imc_format_attrs[] = {
     86	&format_attr_event.attr,
     87	&format_attr_cpmc_reserved.attr,
     88	&format_attr_cpmc_event.attr,
     89	&format_attr_cpmc_samplesel.attr,
     90	&format_attr_cpmc_load.attr,
     91	NULL,
     92};
     93
     94static const struct attribute_group trace_imc_format_group = {
      95	.name = "format",
      96	.attrs = trace_imc_format_attrs,
     97};
     98
     99/* Get the cpumask printed to a buffer "buf" */
    100static ssize_t imc_pmu_cpumask_get_attr(struct device *dev,
    101					struct device_attribute *attr,
    102					char *buf)
    103{
    104	struct pmu *pmu = dev_get_drvdata(dev);
    105	struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu);
    106	cpumask_t *active_mask;
    107
     108	switch (imc_pmu->domain) {
    109	case IMC_DOMAIN_NEST:
    110		active_mask = &nest_imc_cpumask;
    111		break;
    112	case IMC_DOMAIN_CORE:
    113		active_mask = &core_imc_cpumask;
    114		break;
    115	default:
    116		return 0;
    117	}
    118
    119	return cpumap_print_to_pagebuf(true, buf, active_mask);
    120}
    121
    122static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL);
    123
    124static struct attribute *imc_pmu_cpumask_attrs[] = {
    125	&dev_attr_cpumask.attr,
    126	NULL,
    127};
    128
    129static const struct attribute_group imc_pmu_cpumask_attr_group = {
    130	.attrs = imc_pmu_cpumask_attrs,
    131};
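
/*
 * Illustrative only (not part of the original file): a minimal user-space
 * sketch of how the "format" and "cpumask" attributes exported above are
 * consumed. The PMU type is read from /sys/bus/event_source/devices/<pmu>/type,
 * the event offset from the "event=0x..." strings under events/, and the
 * cpumask file names the designated cpu(s) to open the event on. PMU name
 * and offset values here are placeholders, not taken from this file.
 */
#if 0	/* example, never compiled */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_imc_event(int pmu_type, __u64 event_offset, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;		/* from the PMU's sysfs "type" file */
	attr.config = event_offset;	/* "event=0x..." (config:0-61) */
	attr.disabled = 1;

	/* nest/core imc events are opened system-wide per cpu: pid = -1 */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}
#endif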
    132
    133/* device_str_attr_create : Populate event "name" and string "str" in attribute */
    134static struct attribute *device_str_attr_create(const char *name, const char *str)
    135{
    136	struct perf_pmu_events_attr *attr;
    137
    138	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
    139	if (!attr)
    140		return NULL;
    141	sysfs_attr_init(&attr->attr.attr);
    142
    143	attr->event_str = str;
    144	attr->attr.attr.name = name;
    145	attr->attr.attr.mode = 0444;
    146	attr->attr.show = perf_event_sysfs_show;
    147
    148	return &attr->attr.attr;
    149}
    150
    151static int imc_parse_event(struct device_node *np, const char *scale,
    152				  const char *unit, const char *prefix,
    153				  u32 base, struct imc_events *event)
    154{
    155	const char *s;
    156	u32 reg;
    157
    158	if (of_property_read_u32(np, "reg", &reg))
    159		goto error;
    160	/* Add the base_reg value to the "reg" */
    161	event->value = base + reg;
    162
    163	if (of_property_read_string(np, "event-name", &s))
    164		goto error;
    165
    166	event->name = kasprintf(GFP_KERNEL, "%s%s", prefix, s);
    167	if (!event->name)
    168		goto error;
    169
    170	if (of_property_read_string(np, "scale", &s))
    171		s = scale;
    172
    173	if (s) {
    174		event->scale = kstrdup(s, GFP_KERNEL);
    175		if (!event->scale)
    176			goto error;
    177	}
    178
    179	if (of_property_read_string(np, "unit", &s))
    180		s = unit;
    181
    182	if (s) {
    183		event->unit = kstrdup(s, GFP_KERNEL);
    184		if (!event->unit)
    185			goto error;
    186	}
    187
    188	return 0;
    189error:
    190	kfree(event->unit);
    191	kfree(event->scale);
    192	kfree(event->name);
    193	return -EINVAL;
    194}
    195
    196/*
     197 * imc_free_events: Function to clean up the events list, which has
     198 * 		    "nr_entries" entries.
    199 */
    200static void imc_free_events(struct imc_events *events, int nr_entries)
    201{
    202	int i;
    203
    204	/* Nothing to clean, return */
    205	if (!events)
    206		return;
    207	for (i = 0; i < nr_entries; i++) {
    208		kfree(events[i].unit);
    209		kfree(events[i].scale);
    210		kfree(events[i].name);
    211	}
    212
    213	kfree(events);
    214}
    215
    216/*
    217 * update_events_in_group: Update the "events" information in an attr_group
    218 *                         and assign the attr_group to the pmu "pmu".
    219 */
    220static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu)
    221{
    222	struct attribute_group *attr_group;
    223	struct attribute **attrs, *dev_str;
    224	struct device_node *np, *pmu_events;
    225	u32 handle, base_reg;
    226	int i = 0, j = 0, ct, ret;
    227	const char *prefix, *g_scale, *g_unit;
    228	const char *ev_val_str, *ev_scale_str, *ev_unit_str;
    229
    230	if (!of_property_read_u32(node, "events", &handle))
    231		pmu_events = of_find_node_by_phandle(handle);
    232	else
    233		return 0;
    234
    235	/* Did not find any node with a given phandle */
    236	if (!pmu_events)
    237		return 0;
    238
    239	/* Get a count of number of child nodes */
    240	ct = of_get_child_count(pmu_events);
    241
    242	/* Get the event prefix */
    243	if (of_property_read_string(node, "events-prefix", &prefix))
    244		return 0;
    245
    246	/* Get a global unit and scale data if available */
    247	if (of_property_read_string(node, "scale", &g_scale))
    248		g_scale = NULL;
    249
    250	if (of_property_read_string(node, "unit", &g_unit))
    251		g_unit = NULL;
    252
    253	/* "reg" property gives out the base offset of the counters data */
    254	of_property_read_u32(node, "reg", &base_reg);
    255
    256	/* Allocate memory for the events */
    257	pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL);
    258	if (!pmu->events)
    259		return -ENOMEM;
    260
    261	ct = 0;
    262	/* Parse the events and update the struct */
    263	for_each_child_of_node(pmu_events, np) {
    264		ret = imc_parse_event(np, g_scale, g_unit, prefix, base_reg, &pmu->events[ct]);
    265		if (!ret)
    266			ct++;
    267	}
    268
    269	/* Allocate memory for attribute group */
    270	attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL);
    271	if (!attr_group) {
    272		imc_free_events(pmu->events, ct);
    273		return -ENOMEM;
    274	}
    275
    276	/*
    277	 * Allocate memory for attributes.
    278	 * Since we have count of events for this pmu, we also allocate
    279	 * memory for the scale and unit attribute for now.
    280	 * "ct" has the total event structs added from the events-parent node.
    281	 * So allocate three times the "ct" (this includes event, event_scale and
    282	 * event_unit).
    283	 */
    284	attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL);
    285	if (!attrs) {
    286		kfree(attr_group);
    287		imc_free_events(pmu->events, ct);
    288		return -ENOMEM;
    289	}
    290
    291	attr_group->name = "events";
    292	attr_group->attrs = attrs;
    293	do {
    294		ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i].value);
    295		dev_str = device_str_attr_create(pmu->events[i].name, ev_val_str);
    296		if (!dev_str)
    297			continue;
    298
    299		attrs[j++] = dev_str;
    300		if (pmu->events[i].scale) {
    301			ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale", pmu->events[i].name);
    302			dev_str = device_str_attr_create(ev_scale_str, pmu->events[i].scale);
    303			if (!dev_str)
    304				continue;
    305
    306			attrs[j++] = dev_str;
    307		}
    308
    309		if (pmu->events[i].unit) {
    310			ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit", pmu->events[i].name);
    311			dev_str = device_str_attr_create(ev_unit_str, pmu->events[i].unit);
    312			if (!dev_str)
    313				continue;
    314
    315			attrs[j++] = dev_str;
    316		}
    317	} while (++i < ct);
    318
    319	/* Save the event attribute */
    320	pmu->attr_groups[IMC_EVENT_ATTR] = attr_group;
    321
    322	return 0;
    323}
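
/*
 * Illustrative only (not part of the original file): a rough sketch of the
 * device-tree layout that imc_parse_event() and update_events_in_group()
 * expect. All node names and values below are made up; the real nodes are
 * provided by firmware.
 *
 *	imc-pmu-unit {
 *		events-prefix = "PM_";
 *		reg = <0x1000>;			// base offset of the counter data
 *		scale = "...";			// optional global scale
 *		unit = "...";			// optional global unit
 *		events = <&imc_events>;		// phandle to the events parent node
 *	};
 *
 *	imc_events: events {
 *		event@0 {
 *			event-name = "SOME_EVENT";
 *			reg = <0x0>;		// added to the base offset above
 *			scale = "...";		// optional per-event override
 *			unit = "...";		// optional per-event override
 *		};
 *	};
 */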
    324
    325/* get_nest_pmu_ref: Return the imc_pmu_ref struct for the given node */
    326static struct imc_pmu_ref *get_nest_pmu_ref(int cpu)
    327{
    328	return per_cpu(local_nest_imc_refc, cpu);
    329}
    330
    331static void nest_change_cpu_context(int old_cpu, int new_cpu)
    332{
    333	struct imc_pmu **pn = per_nest_pmu_arr;
    334
    335	if (old_cpu < 0 || new_cpu < 0)
    336		return;
    337
    338	while (*pn) {
    339		perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu);
    340		pn++;
    341	}
    342}
    343
    344static int ppc_nest_imc_cpu_offline(unsigned int cpu)
    345{
    346	int nid, target = -1;
    347	const struct cpumask *l_cpumask;
    348	struct imc_pmu_ref *ref;
    349
    350	/*
     351	 * Check in the designated list for this cpu. Don't bother
     352	 * if it is not one of them.
    353	 */
    354	if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
    355		return 0;
    356
    357	/*
    358	 * Check whether nest_imc is registered. We could end up here if the
     359	 * cpuhotplug callback registration fails, i.e., the callback invokes the
    360	 * offline path for all successfully registered nodes. At this stage,
    361	 * nest_imc pmu will not be registered and we should return here.
    362	 *
    363	 * We return with a zero since this is not an offline failure. And
    364	 * cpuhp_setup_state() returns the actual failure reason to the caller,
    365	 * which in turn will call the cleanup routine.
    366	 */
    367	if (!nest_pmus)
    368		return 0;
    369
    370	/*
     371	 * Now that this cpu is one of the designated ones,
     372	 * find the next cpu which is a) online and b) in the same chip.
    373	 */
    374	nid = cpu_to_node(cpu);
    375	l_cpumask = cpumask_of_node(nid);
    376	target = cpumask_last(l_cpumask);
    377
    378	/*
    379	 * If this(target) is the last cpu in the cpumask for this chip,
    380	 * check for any possible online cpu in the chip.
    381	 */
    382	if (unlikely(target == cpu))
    383		target = cpumask_any_but(l_cpumask, cpu);
    384
    385	/*
    386	 * Update the cpumask with the target cpu and
    387	 * migrate the context if needed
    388	 */
    389	if (target >= 0 && target < nr_cpu_ids) {
    390		cpumask_set_cpu(target, &nest_imc_cpumask);
    391		nest_change_cpu_context(cpu, target);
    392	} else {
    393		opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
    394				       get_hard_smp_processor_id(cpu));
    395		/*
     396		 * If this is the last cpu in this chip, then skip taking the
     397		 * reference count mutex lock and directly zero the reference count.
    398		 */
    399		ref = get_nest_pmu_ref(cpu);
    400		if (!ref)
    401			return -EINVAL;
    402
    403		ref->refc = 0;
    404	}
    405	return 0;
    406}
    407
    408static int ppc_nest_imc_cpu_online(unsigned int cpu)
    409{
    410	const struct cpumask *l_cpumask;
    411	static struct cpumask tmp_mask;
    412	int res;
    413
    414	/* Get the cpumask of this node */
    415	l_cpumask = cpumask_of_node(cpu_to_node(cpu));
    416
    417	/*
    418	 * If this is not the first online CPU on this node, then
    419	 * just return.
    420	 */
    421	if (cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask))
    422		return 0;
    423
    424	/*
     425	 * If this is the first online cpu on this node,
    426	 * disable the nest counters by making an OPAL call.
    427	 */
    428	res = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
    429				     get_hard_smp_processor_id(cpu));
    430	if (res)
    431		return res;
    432
    433	/* Make this CPU the designated target for counter collection */
    434	cpumask_set_cpu(cpu, &nest_imc_cpumask);
    435	return 0;
    436}
    437
    438static int nest_pmu_cpumask_init(void)
    439{
    440	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
    441				 "perf/powerpc/imc:online",
    442				 ppc_nest_imc_cpu_online,
    443				 ppc_nest_imc_cpu_offline);
    444}
    445
    446static void nest_imc_counters_release(struct perf_event *event)
    447{
    448	int rc, node_id;
    449	struct imc_pmu_ref *ref;
    450
    451	if (event->cpu < 0)
    452		return;
    453
    454	node_id = cpu_to_node(event->cpu);
    455
    456	/*
    457	 * See if we need to disable the nest PMU.
    458	 * If no events are currently in use, then we have to take a
     459	 * mutex to ensure that we don't race with another task enabling
     460	 * or disabling the nest counters.
    461	 */
    462	ref = get_nest_pmu_ref(event->cpu);
    463	if (!ref)
    464		return;
    465
    466	/* Take the mutex lock for this node and then decrement the reference count */
    467	mutex_lock(&ref->lock);
    468	if (ref->refc == 0) {
    469		/*
     470		 * This is true when a perf session is started and then all
     471		 * cpus in a given node are offlined.
     472		 *
     473		 * In the cpuhotplug offline path, ppc_nest_imc_cpu_offline()
     474		 * sets ref->refc to zero and makes an OPAL call to disable the
     475		 * engine in that node, if the cpu going offline is the last
     476		 * cpu in that node.
    477		 *
    478		 */
    479		mutex_unlock(&ref->lock);
    480		return;
    481	}
    482	ref->refc--;
    483	if (ref->refc == 0) {
    484		rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
    485					    get_hard_smp_processor_id(event->cpu));
    486		if (rc) {
    487			mutex_unlock(&ref->lock);
     488			pr_err("nest-imc: Unable to stop the counters for node %d\n", node_id);
    489			return;
    490		}
    491	} else if (ref->refc < 0) {
    492		WARN(1, "nest-imc: Invalid event reference count\n");
    493		ref->refc = 0;
    494	}
    495	mutex_unlock(&ref->lock);
    496}
    497
    498static int nest_imc_event_init(struct perf_event *event)
    499{
    500	int chip_id, rc, node_id;
    501	u32 l_config, config = event->attr.config;
    502	struct imc_mem_info *pcni;
    503	struct imc_pmu *pmu;
    504	struct imc_pmu_ref *ref;
    505	bool flag = false;
    506
    507	if (event->attr.type != event->pmu->type)
    508		return -ENOENT;
    509
    510	/* Sampling not supported */
    511	if (event->hw.sample_period)
    512		return -EINVAL;
    513
    514	if (event->cpu < 0)
    515		return -EINVAL;
    516
    517	pmu = imc_event_to_pmu(event);
    518
    519	/* Sanity check for config (event offset) */
    520	if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)
    521		return -EINVAL;
    522
    523	/*
    524	 * Nest HW counter memory resides in a per-chip reserve-memory (HOMER).
    525	 * Get the base memory address for this cpu.
    526	 */
    527	chip_id = cpu_to_chip_id(event->cpu);
    528
    529	/* Return, if chip_id is not valid */
    530	if (chip_id < 0)
    531		return -ENODEV;
    532
    533	pcni = pmu->mem_info;
    534	do {
    535		if (pcni->id == chip_id) {
    536			flag = true;
    537			break;
    538		}
    539		pcni++;
    540	} while (pcni->vbase != 0);
    541
    542	if (!flag)
    543		return -ENODEV;
    544
    545	/*
    546	 * Add the event offset to the base address.
    547	 */
    548	l_config = config & IMC_EVENT_OFFSET_MASK;
    549	event->hw.event_base = (u64)pcni->vbase + l_config;
    550	node_id = cpu_to_node(event->cpu);
    551
    552	/*
    553	 * Get the imc_pmu_ref struct for this node.
    554	 * Take the mutex lock and then increment the count of nest pmu events
     555	 * initialized.
    556	 */
    557	ref = get_nest_pmu_ref(event->cpu);
    558	if (!ref)
    559		return -EINVAL;
    560
    561	mutex_lock(&ref->lock);
    562	if (ref->refc == 0) {
    563		rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST,
    564					     get_hard_smp_processor_id(event->cpu));
    565		if (rc) {
    566			mutex_unlock(&ref->lock);
    567			pr_err("nest-imc: Unable to start the counters for node %d\n",
    568									node_id);
    569			return rc;
    570		}
    571	}
    572	++ref->refc;
    573	mutex_unlock(&ref->lock);
    574
    575	event->destroy = nest_imc_counters_release;
    576	return 0;
    577}
    578
    579/*
    580 * core_imc_mem_init : Initializes memory for the current core.
    581 *
     582 * Allocates memory with alloc_pages_node() and passes the returned address
     583 * to an OPAL call to configure the PDBAR. The address is converted to a
     584 * physical address before the OPAL call is made. This is the
    585 * base address at which the core imc counters are populated.
    586 */
    587static int core_imc_mem_init(int cpu, int size)
    588{
    589	int nid, rc = 0, core_id = (cpu / threads_per_core);
    590	struct imc_mem_info *mem_info;
    591	struct page *page;
    592
    593	/*
    594	 * alloc_pages_node() will allocate memory for core in the
    595	 * local node only.
    596	 */
    597	nid = cpu_to_node(cpu);
    598	mem_info = &core_imc_pmu->mem_info[core_id];
    599	mem_info->id = core_id;
    600
    601	/* We need only vbase for core counters */
    602	page = alloc_pages_node(nid,
    603				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
    604				__GFP_NOWARN, get_order(size));
    605	if (!page)
    606		return -ENOMEM;
    607	mem_info->vbase = page_address(page);
    608
    609	/* Init the mutex */
    610	core_imc_refc[core_id].id = core_id;
    611	mutex_init(&core_imc_refc[core_id].lock);
    612
    613	rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE,
    614				__pa((void *)mem_info->vbase),
    615				get_hard_smp_processor_id(cpu));
    616	if (rc) {
    617		free_pages((u64)mem_info->vbase, get_order(size));
    618		mem_info->vbase = NULL;
    619	}
    620
    621	return rc;
    622}
    623
    624static bool is_core_imc_mem_inited(int cpu)
    625{
    626	struct imc_mem_info *mem_info;
    627	int core_id = (cpu / threads_per_core);
    628
    629	mem_info = &core_imc_pmu->mem_info[core_id];
    630	if (!mem_info->vbase)
    631		return false;
    632
    633	return true;
    634}
    635
    636static int ppc_core_imc_cpu_online(unsigned int cpu)
    637{
    638	const struct cpumask *l_cpumask;
    639	static struct cpumask tmp_mask;
    640	int ret = 0;
    641
    642	/* Get the cpumask for this core */
    643	l_cpumask = cpu_sibling_mask(cpu);
    644
     645	/* If a cpu for this core is already set, then don't do anything */
    646	if (cpumask_and(&tmp_mask, l_cpumask, &core_imc_cpumask))
    647		return 0;
    648
    649	if (!is_core_imc_mem_inited(cpu)) {
    650		ret = core_imc_mem_init(cpu, core_imc_pmu->counter_mem_size);
    651		if (ret) {
    652			pr_info("core_imc memory allocation for cpu %d failed\n", cpu);
    653			return ret;
    654		}
    655	}
    656
    657	/* set the cpu in the mask */
    658	cpumask_set_cpu(cpu, &core_imc_cpumask);
    659	return 0;
    660}
    661
    662static int ppc_core_imc_cpu_offline(unsigned int cpu)
    663{
    664	unsigned int core_id;
    665	int ncpu;
    666	struct imc_pmu_ref *ref;
    667
    668	/*
    669	 * clear this cpu out of the mask, if not present in the mask,
    670	 * don't bother doing anything.
    671	 */
    672	if (!cpumask_test_and_clear_cpu(cpu, &core_imc_cpumask))
    673		return 0;
    674
    675	/*
    676	 * Check whether core_imc is registered. We could end up here
     677	 * if the cpuhotplug callback registration fails, i.e., the callback
    678	 * invokes the offline path for all successfully registered cpus.
    679	 * At this stage, core_imc pmu will not be registered and we
    680	 * should return here.
    681	 *
    682	 * We return with a zero since this is not an offline failure.
    683	 * And cpuhp_setup_state() returns the actual failure reason
     684	 * to the caller, which in turn will call the cleanup routine.
    685	 */
    686	if (!core_imc_pmu->pmu.event_init)
    687		return 0;
    688
    689	/* Find any online cpu in that core except the current "cpu" */
    690	ncpu = cpumask_last(cpu_sibling_mask(cpu));
    691
    692	if (unlikely(ncpu == cpu))
    693		ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
    694
    695	if (ncpu >= 0 && ncpu < nr_cpu_ids) {
    696		cpumask_set_cpu(ncpu, &core_imc_cpumask);
    697		perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu);
    698	} else {
    699		/*
     700		 * If this is the last cpu in this core, then skip taking the reference
    701		 * count mutex lock for this core and directly zero "refc" for
    702		 * this core.
    703		 */
    704		opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
    705				       get_hard_smp_processor_id(cpu));
    706		core_id = cpu / threads_per_core;
    707		ref = &core_imc_refc[core_id];
    708		if (!ref)
    709			return -EINVAL;
    710
    711		ref->refc = 0;
    712		/*
    713		 * Reduce the global reference count, if this is the
     714		 * last cpu in this core and a core-imc event is running
     715		 * on this cpu.
    716		 */
    717		mutex_lock(&imc_global_refc.lock);
    718		if (imc_global_refc.id == IMC_DOMAIN_CORE)
    719			imc_global_refc.refc--;
    720
    721		mutex_unlock(&imc_global_refc.lock);
    722	}
    723	return 0;
    724}
    725
    726static int core_imc_pmu_cpumask_init(void)
    727{
    728	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
    729				 "perf/powerpc/imc_core:online",
    730				 ppc_core_imc_cpu_online,
    731				 ppc_core_imc_cpu_offline);
    732}
    733
    734static void reset_global_refc(struct perf_event *event)
    735{
     736	mutex_lock(&imc_global_refc.lock);
     737	imc_global_refc.refc--;
     738
     739	/*
     740	 * If no other thread is running any
     741	 * event for this domain (thread/core/trace),
     742	 * set the global id to zero.
     743	 */
     744	if (imc_global_refc.refc <= 0) {
     745		imc_global_refc.refc = 0;
     746		imc_global_refc.id = 0;
     747	}
     748	mutex_unlock(&imc_global_refc.lock);
    749}
    750
    751static void core_imc_counters_release(struct perf_event *event)
    752{
    753	int rc, core_id;
    754	struct imc_pmu_ref *ref;
    755
    756	if (event->cpu < 0)
    757		return;
    758	/*
    759	 * See if we need to disable the IMC PMU.
    760	 * If no events are currently in use, then we have to take a
     761	 * mutex to ensure that we don't race with another task enabling
     762	 * or disabling the core counters.
    763	 */
    764	core_id = event->cpu / threads_per_core;
    765
     766	/* Take the mutex lock and decrement the reference count for this core */
    767	ref = &core_imc_refc[core_id];
    768	if (!ref)
    769		return;
    770
    771	mutex_lock(&ref->lock);
    772	if (ref->refc == 0) {
    773		/*
     774		 * This is true when a perf session is started and then all
     775		 * cpus in a given core are offlined.
     776		 *
     777		 * In the cpuhotplug offline path, ppc_core_imc_cpu_offline()
     778		 * sets ref->refc to zero and makes an OPAL call to disable the
     779		 * engine in that core, if the cpu going offline is the last
     780		 * cpu in that core.
    781		 *
    782		 */
    783		mutex_unlock(&ref->lock);
    784		return;
    785	}
    786	ref->refc--;
    787	if (ref->refc == 0) {
    788		rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
    789					    get_hard_smp_processor_id(event->cpu));
    790		if (rc) {
    791			mutex_unlock(&ref->lock);
    792			pr_err("IMC: Unable to stop the counters for core %d\n", core_id);
    793			return;
    794		}
    795	} else if (ref->refc < 0) {
    796		WARN(1, "core-imc: Invalid event reference count\n");
    797		ref->refc = 0;
    798	}
    799	mutex_unlock(&ref->lock);
    800
    801	reset_global_refc(event);
    802}
    803
    804static int core_imc_event_init(struct perf_event *event)
    805{
    806	int core_id, rc;
    807	u64 config = event->attr.config;
    808	struct imc_mem_info *pcmi;
    809	struct imc_pmu *pmu;
    810	struct imc_pmu_ref *ref;
    811
    812	if (event->attr.type != event->pmu->type)
    813		return -ENOENT;
    814
    815	/* Sampling not supported */
    816	if (event->hw.sample_period)
    817		return -EINVAL;
    818
    819	if (event->cpu < 0)
    820		return -EINVAL;
    821
    822	event->hw.idx = -1;
    823	pmu = imc_event_to_pmu(event);
    824
    825	/* Sanity check for config (event offset) */
    826	if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size))
    827		return -EINVAL;
    828
    829	if (!is_core_imc_mem_inited(event->cpu))
    830		return -ENODEV;
    831
    832	core_id = event->cpu / threads_per_core;
    833	pcmi = &core_imc_pmu->mem_info[core_id];
     834	if (!pcmi->vbase)
    835		return -ENODEV;
    836
    837	/* Get the core_imc mutex for this core */
    838	ref = &core_imc_refc[core_id];
    839	if (!ref)
    840		return -EINVAL;
    841
    842	/*
     843	 * Core pmu units are enabled only when they are used.
    844	 * See if this is triggered for the first time.
    845	 * If yes, take the mutex lock and enable the core counters.
    846	 * If not, just increment the count in core_imc_refc struct.
    847	 */
    848	mutex_lock(&ref->lock);
    849	if (ref->refc == 0) {
    850		rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
    851					     get_hard_smp_processor_id(event->cpu));
    852		if (rc) {
    853			mutex_unlock(&ref->lock);
    854			pr_err("core-imc: Unable to start the counters for core %d\n",
    855									core_id);
    856			return rc;
    857		}
    858	}
    859	++ref->refc;
    860	mutex_unlock(&ref->lock);
    861
    862	/*
     863	 * Since the system can run in either accumulation or trace mode
    864	 * of IMC at a time, core-imc events are allowed only if no other
    865	 * trace/thread imc events are enabled/monitored.
    866	 *
    867	 * Take the global lock, and check the refc.id
    868	 * to know whether any other trace/thread imc
    869	 * events are running.
    870	 */
    871	mutex_lock(&imc_global_refc.lock);
    872	if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
    873		/*
    874		 * No other trace/thread imc events are running in
    875		 * the system, so set the refc.id to core-imc.
    876		 */
    877		imc_global_refc.id = IMC_DOMAIN_CORE;
    878		imc_global_refc.refc++;
    879	} else {
    880		mutex_unlock(&imc_global_refc.lock);
    881		return -EBUSY;
    882	}
    883	mutex_unlock(&imc_global_refc.lock);
    884
    885	event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
    886	event->destroy = core_imc_counters_release;
    887	return 0;
    888}
    889
    890/*
     891 * Allocates a page of memory for each of the online cpus, and loads
    892 * LDBAR with 0.
    893 * The physical base address of the page allocated for a cpu will be
    894 * written to the LDBAR for that cpu, when the thread-imc event
    895 * is added.
    896 *
    897 * LDBAR Register Layout:
    898 *
    899 *  0          4         8         12        16        20        24        28
    900 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
    901 *   | |       [   ]    [                   Counter Address [8:50]
    902 *   | * Mode    |
    903 *   |           * PB Scope
    904 *   * Enable/Disable
    905 *
    906 *  32        36        40        44        48        52        56        60
    907 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
    908 *           Counter Address [8:50]              ]
    909 *
    910 */
    911static int thread_imc_mem_alloc(int cpu_id, int size)
    912{
    913	u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
    914	int nid = cpu_to_node(cpu_id);
    915
    916	if (!local_mem) {
    917		struct page *page;
    918		/*
     919		 * This case could happen only once at start, since we don't
     920		 * free the memory in the cpu offline path.
    921		 */
    922		page = alloc_pages_node(nid,
    923				  GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
    924				  __GFP_NOWARN, get_order(size));
    925		if (!page)
    926			return -ENOMEM;
    927		local_mem = page_address(page);
    928
    929		per_cpu(thread_imc_mem, cpu_id) = local_mem;
    930	}
    931
    932	mtspr(SPRN_LDBAR, 0);
    933	return 0;
    934}
    935
    936static int ppc_thread_imc_cpu_online(unsigned int cpu)
    937{
    938	return thread_imc_mem_alloc(cpu, thread_imc_mem_size);
    939}
    940
    941static int ppc_thread_imc_cpu_offline(unsigned int cpu)
    942{
    943	/*
     944	 * Set bit 0 of LDBAR to zero.
    945	 *
    946	 * If bit 0 of LDBAR is unset, it will stop posting
    947	 * the counter data to memory.
    948	 * For thread-imc, bit 0 of LDBAR will be set to 1 in the
    949	 * event_add function. So reset this bit here, to stop the updates
    950	 * to memory in the cpu_offline path.
    951	 */
    952	mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
    953
     954	/* Reduce the refc if a thread-imc event is running on this cpu */
    955	mutex_lock(&imc_global_refc.lock);
    956	if (imc_global_refc.id == IMC_DOMAIN_THREAD)
    957		imc_global_refc.refc--;
    958	mutex_unlock(&imc_global_refc.lock);
    959
    960	return 0;
    961}
    962
    963static int thread_imc_cpu_init(void)
    964{
    965	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
    966			  "perf/powerpc/imc_thread:online",
    967			  ppc_thread_imc_cpu_online,
    968			  ppc_thread_imc_cpu_offline);
    969}
    970
    971static int thread_imc_event_init(struct perf_event *event)
    972{
    973	u32 config = event->attr.config;
    974	struct task_struct *target;
    975	struct imc_pmu *pmu;
    976
    977	if (event->attr.type != event->pmu->type)
    978		return -ENOENT;
    979
    980	if (!perfmon_capable())
    981		return -EACCES;
    982
    983	/* Sampling not supported */
    984	if (event->hw.sample_period)
    985		return -EINVAL;
    986
    987	event->hw.idx = -1;
    988	pmu = imc_event_to_pmu(event);
    989
    990	/* Sanity check for config offset */
    991	if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size))
    992		return -EINVAL;
    993
    994	target = event->hw.target;
    995	if (!target)
    996		return -EINVAL;
    997
    998	mutex_lock(&imc_global_refc.lock);
    999	/*
   1000	 * Check if any other trace/core imc events are running in the
    1001	 * system; if not, set the global id to thread-imc.
   1002	 */
   1003	if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) {
   1004		imc_global_refc.id = IMC_DOMAIN_THREAD;
   1005		imc_global_refc.refc++;
   1006	} else {
   1007		mutex_unlock(&imc_global_refc.lock);
   1008		return -EBUSY;
   1009	}
   1010	mutex_unlock(&imc_global_refc.lock);
   1011
   1012	event->pmu->task_ctx_nr = perf_sw_context;
   1013	event->destroy = reset_global_refc;
   1014	return 0;
   1015}
   1016
   1017static bool is_thread_imc_pmu(struct perf_event *event)
   1018{
   1019	if (!strncmp(event->pmu->name, "thread_imc", strlen("thread_imc")))
   1020		return true;
   1021
   1022	return false;
   1023}
   1024
    1025static u64 *get_event_base_addr(struct perf_event *event)
   1026{
   1027	u64 addr;
   1028
   1029	if (is_thread_imc_pmu(event)) {
   1030		addr = (u64)per_cpu(thread_imc_mem, smp_processor_id());
   1031		return (u64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK));
   1032	}
   1033
   1034	return (u64 *)event->hw.event_base;
   1035}
   1036
   1037static void thread_imc_pmu_start_txn(struct pmu *pmu,
   1038				     unsigned int txn_flags)
   1039{
   1040	if (txn_flags & ~PERF_PMU_TXN_ADD)
   1041		return;
   1042	perf_pmu_disable(pmu);
   1043}
   1044
   1045static void thread_imc_pmu_cancel_txn(struct pmu *pmu)
   1046{
   1047	perf_pmu_enable(pmu);
   1048}
   1049
   1050static int thread_imc_pmu_commit_txn(struct pmu *pmu)
   1051{
   1052	perf_pmu_enable(pmu);
   1053	return 0;
   1054}
   1055
   1056static u64 imc_read_counter(struct perf_event *event)
   1057{
   1058	u64 *addr, data;
   1059
   1060	/*
   1061	 * In-Memory Collection (IMC) counters are free flowing counters.
   1062	 * So we take a snapshot of the counter value on enable and save it
    1063	 * to calculate the delta at a later stage to present the event counter
   1064	 * value.
   1065	 */
   1066	addr = get_event_base_addr(event);
   1067	data = be64_to_cpu(READ_ONCE(*addr));
   1068	local64_set(&event->hw.prev_count, data);
   1069
   1070	return data;
   1071}
   1072
   1073static void imc_event_update(struct perf_event *event)
   1074{
   1075	u64 counter_prev, counter_new, final_count;
   1076
   1077	counter_prev = local64_read(&event->hw.prev_count);
   1078	counter_new = imc_read_counter(event);
   1079	final_count = counter_new - counter_prev;
   1080
   1081	/* Update the delta to the event count */
   1082	local64_add(final_count, &event->count);
   1083}
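
/*
 * Illustrative only (not part of the original file): a worked example of the
 * snapshot/delta accounting above, with made-up numbers. If the snapshot
 * taken when the event was started (hw.prev_count) was 1000 and the
 * free-flowing in-memory counter now reads 1250, imc_event_update() adds the
 * delta of 250 to event->count, and imc_read_counter() stores 1250 as the
 * new prev_count for the next update.
 */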
   1084
   1085static void imc_event_start(struct perf_event *event, int flags)
   1086{
   1087	/*
   1088	 * In Memory Counters are free flowing counters. HW or the microcode
    1089	 * keeps adding to the counter offset in memory. To get the event
    1090	 * counter value, we snapshot the value here and calculate the
    1091	 * delta at a later point.
   1092	 */
   1093	imc_read_counter(event);
   1094}
   1095
   1096static void imc_event_stop(struct perf_event *event, int flags)
   1097{
   1098	/*
   1099	 * Take a snapshot and calculate the delta and update
   1100	 * the event counter values.
   1101	 */
   1102	imc_event_update(event);
   1103}
   1104
   1105static int imc_event_add(struct perf_event *event, int flags)
   1106{
   1107	if (flags & PERF_EF_START)
   1108		imc_event_start(event, flags);
   1109
   1110	return 0;
   1111}
   1112
   1113static int thread_imc_event_add(struct perf_event *event, int flags)
   1114{
   1115	int core_id;
   1116	struct imc_pmu_ref *ref;
   1117	u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id());
   1118
   1119	if (flags & PERF_EF_START)
   1120		imc_event_start(event, flags);
   1121
   1122	if (!is_core_imc_mem_inited(smp_processor_id()))
   1123		return -EINVAL;
   1124
   1125	core_id = smp_processor_id() / threads_per_core;
   1126	ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
   1127	mtspr(SPRN_LDBAR, ldbar_value);
   1128
   1129	/*
    1130	 * imc pmus are enabled only when they are used.
   1131	 * See if this is triggered for the first time.
   1132	 * If yes, take the mutex lock and enable the counters.
   1133	 * If not, just increment the count in ref count struct.
   1134	 */
   1135	ref = &core_imc_refc[core_id];
   1136	if (!ref)
   1137		return -EINVAL;
   1138
   1139	mutex_lock(&ref->lock);
   1140	if (ref->refc == 0) {
   1141		if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
   1142		    get_hard_smp_processor_id(smp_processor_id()))) {
   1143			mutex_unlock(&ref->lock);
    1144			pr_err("thread-imc: Unable to start the counter for core %d\n",
    1145			       core_id);
   1146			return -EINVAL;
   1147		}
   1148	}
   1149	++ref->refc;
   1150	mutex_unlock(&ref->lock);
   1151	return 0;
   1152}
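
/*
 * Illustrative only (not part of the original file): how the ldbar_value
 * written above is composed, following the LDBAR layout documented before
 * thread_imc_mem_alloc(). Bit 0 of LDBAR (IBM bit numbering, i.e. the MSB,
 * 1UL << 63 as a plain u64) is the enable bit, which is why the offline and
 * disable paths clear exactly that bit.
 */
#if 0	/* example, never compiled */
static u64 example_thread_ldbar(u64 counter_addr, bool enable)
{
	/* keep only the counter address field */
	u64 v = counter_addr & THREAD_IMC_LDBAR_MASK;

	if (enable)
		v |= THREAD_IMC_ENABLE;		/* set bit 0 (the MSB) */
	return v;
}
#endif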
   1153
   1154static void thread_imc_event_del(struct perf_event *event, int flags)
   1155{
   1156
   1157	int core_id;
   1158	struct imc_pmu_ref *ref;
   1159
   1160	core_id = smp_processor_id() / threads_per_core;
   1161	ref = &core_imc_refc[core_id];
   1162	if (!ref) {
   1163		pr_debug("imc: Failed to get event reference count\n");
   1164		return;
   1165	}
   1166
   1167	mutex_lock(&ref->lock);
   1168	ref->refc--;
   1169	if (ref->refc == 0) {
   1170		if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
   1171		    get_hard_smp_processor_id(smp_processor_id()))) {
   1172			mutex_unlock(&ref->lock);
    1173			pr_err("thread-imc: Unable to stop the counters for core %d\n",
    1174			       core_id);
   1175			return;
   1176		}
   1177	} else if (ref->refc < 0) {
   1178		ref->refc = 0;
   1179	}
   1180	mutex_unlock(&ref->lock);
   1181
   1182	/* Set bit 0 of LDBAR to zero, to stop posting updates to memory */
   1183	mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
   1184
   1185	/*
   1186	 * Take a snapshot and calculate the delta and update
   1187	 * the event counter values.
   1188	 */
   1189	imc_event_update(event);
   1190}
   1191
   1192/*
   1193 * Allocate a page of memory for each cpu, and load LDBAR with 0.
   1194 */
   1195static int trace_imc_mem_alloc(int cpu_id, int size)
   1196{
   1197	u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
   1198	int phys_id = cpu_to_node(cpu_id), rc = 0;
   1199	int core_id = (cpu_id / threads_per_core);
   1200
   1201	if (!local_mem) {
   1202		struct page *page;
   1203
   1204		page = alloc_pages_node(phys_id,
   1205				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
   1206				__GFP_NOWARN, get_order(size));
   1207		if (!page)
   1208			return -ENOMEM;
   1209		local_mem = page_address(page);
   1210		per_cpu(trace_imc_mem, cpu_id) = local_mem;
   1211
   1212		/* Initialise the counters for trace mode */
   1213		rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem),
   1214					    get_hard_smp_processor_id(cpu_id));
   1215		if (rc) {
   1216			pr_info("IMC:opal init failed for trace imc\n");
   1217			return rc;
   1218		}
   1219	}
   1220
   1221	/* Init the mutex, if not already */
   1222	trace_imc_refc[core_id].id = core_id;
   1223	mutex_init(&trace_imc_refc[core_id].lock);
   1224
   1225	mtspr(SPRN_LDBAR, 0);
   1226	return 0;
   1227}
   1228
   1229static int ppc_trace_imc_cpu_online(unsigned int cpu)
   1230{
   1231	return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
   1232}
   1233
   1234static int ppc_trace_imc_cpu_offline(unsigned int cpu)
   1235{
   1236	/*
   1237	 * No need to set bit 0 of LDBAR to zero, as
    1238	 * it is set to zero for imc trace-mode.
    1239	 *
    1240	 * Reduce the refc if any trace-imc event is running
   1241	 * on this cpu.
   1242	 */
   1243	mutex_lock(&imc_global_refc.lock);
   1244	if (imc_global_refc.id == IMC_DOMAIN_TRACE)
   1245		imc_global_refc.refc--;
   1246	mutex_unlock(&imc_global_refc.lock);
   1247
   1248	return 0;
   1249}
   1250
   1251static int trace_imc_cpu_init(void)
   1252{
   1253	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
   1254			  "perf/powerpc/imc_trace:online",
   1255			  ppc_trace_imc_cpu_online,
   1256			  ppc_trace_imc_cpu_offline);
   1257}
   1258
   1259static u64 get_trace_imc_event_base_addr(void)
   1260{
   1261	return (u64)per_cpu(trace_imc_mem, smp_processor_id());
   1262}
   1263
   1264/*
   1265 * Function to parse trace-imc data obtained
   1266 * and to prepare the perf sample.
   1267 */
   1268static int trace_imc_prepare_sample(struct trace_imc_data *mem,
   1269				    struct perf_sample_data *data,
   1270				    u64 *prev_tb,
   1271				    struct perf_event_header *header,
   1272				    struct perf_event *event)
   1273{
   1274	/* Sanity checks for a valid record */
   1275	if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
   1276		*prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
   1277	else
   1278		return -EINVAL;
   1279
   1280	if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
   1281			 be64_to_cpu(READ_ONCE(mem->tb2)))
   1282		return -EINVAL;
   1283
   1284	/* Prepare perf sample */
   1285	data->ip =  be64_to_cpu(READ_ONCE(mem->ip));
   1286	data->period = event->hw.last_period;
   1287
   1288	header->type = PERF_RECORD_SAMPLE;
   1289	header->size = sizeof(*header) + event->header_size;
   1290	header->misc = 0;
   1291
   1292	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
   1293		switch (IMC_TRACE_RECORD_VAL_HVPR(be64_to_cpu(READ_ONCE(mem->val)))) {
    1294		case 0: /* when MSR HV and PR not set in the trace-record */
   1295			header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
   1296			break;
   1297		case 1: /* MSR HV is 0 and PR is 1 */
   1298			header->misc |= PERF_RECORD_MISC_GUEST_USER;
   1299			break;
   1300		case 2: /* MSR HV is 1 and PR is 0 */
   1301			header->misc |= PERF_RECORD_MISC_KERNEL;
   1302			break;
   1303		case 3: /* MSR HV is 1 and PR is 1 */
   1304			header->misc |= PERF_RECORD_MISC_USER;
   1305			break;
   1306		default:
   1307			pr_info("IMC: Unable to set the flag based on MSR bits\n");
   1308			break;
   1309		}
   1310	} else {
   1311		if (is_kernel_addr(data->ip))
   1312			header->misc |= PERF_RECORD_MISC_KERNEL;
   1313		else
   1314			header->misc |= PERF_RECORD_MISC_USER;
   1315	}
   1316	perf_event_header__init_id(header, data, event);
   1317
   1318	return 0;
   1319}
   1320
   1321static void dump_trace_imc_data(struct perf_event *event)
   1322{
   1323	struct trace_imc_data *mem;
   1324	int i, ret;
   1325	u64 prev_tb = 0;
   1326
   1327	mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
   1328	for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
   1329		i++, mem++) {
   1330		struct perf_sample_data data;
   1331		struct perf_event_header header;
   1332
   1333		ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
   1334		if (ret) /* Exit, if not a valid record */
   1335			break;
   1336		else {
   1337			/* If this is a valid record, create the sample */
   1338			struct perf_output_handle handle;
   1339
   1340			if (perf_output_begin(&handle, &data, event, header.size))
   1341				return;
   1342
   1343			perf_output_sample(&handle, &header, &data, event);
   1344			perf_output_end(&handle);
   1345		}
   1346	}
   1347}
   1348
   1349static int trace_imc_event_add(struct perf_event *event, int flags)
   1350{
   1351	int core_id = smp_processor_id() / threads_per_core;
   1352	struct imc_pmu_ref *ref = NULL;
   1353	u64 local_mem, ldbar_value;
   1354
   1355	/* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
   1356	local_mem = get_trace_imc_event_base_addr();
   1357	ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
   1358
   1359	/* trace-imc reference count */
   1360	if (trace_imc_refc)
   1361		ref = &trace_imc_refc[core_id];
   1362	if (!ref) {
   1363		pr_debug("imc: Failed to get the event reference count\n");
   1364		return -EINVAL;
   1365	}
   1366
   1367	mtspr(SPRN_LDBAR, ldbar_value);
   1368	mutex_lock(&ref->lock);
   1369	if (ref->refc == 0) {
   1370		if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
   1371				get_hard_smp_processor_id(smp_processor_id()))) {
   1372			mutex_unlock(&ref->lock);
   1373			pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
   1374			return -EINVAL;
   1375		}
   1376	}
   1377	++ref->refc;
   1378	mutex_unlock(&ref->lock);
   1379	return 0;
   1380}
   1381
   1382static void trace_imc_event_read(struct perf_event *event)
   1383{
   1384	return;
   1385}
   1386
   1387static void trace_imc_event_stop(struct perf_event *event, int flags)
   1388{
   1389	u64 local_mem = get_trace_imc_event_base_addr();
   1390	dump_trace_imc_data(event);
   1391	memset((void *)local_mem, 0, sizeof(u64));
   1392}
   1393
   1394static void trace_imc_event_start(struct perf_event *event, int flags)
   1395{
   1396	return;
   1397}
   1398
   1399static void trace_imc_event_del(struct perf_event *event, int flags)
   1400{
   1401	int core_id = smp_processor_id() / threads_per_core;
   1402	struct imc_pmu_ref *ref = NULL;
   1403
   1404	if (trace_imc_refc)
   1405		ref = &trace_imc_refc[core_id];
   1406	if (!ref) {
   1407		pr_debug("imc: Failed to get event reference count\n");
   1408		return;
   1409	}
   1410
   1411	mutex_lock(&ref->lock);
   1412	ref->refc--;
   1413	if (ref->refc == 0) {
   1414		if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE,
   1415				get_hard_smp_processor_id(smp_processor_id()))) {
   1416			mutex_unlock(&ref->lock);
   1417			pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id);
   1418			return;
   1419		}
   1420	} else if (ref->refc < 0) {
   1421		ref->refc = 0;
   1422	}
   1423	mutex_unlock(&ref->lock);
   1424
   1425	trace_imc_event_stop(event, flags);
   1426}
   1427
   1428static int trace_imc_event_init(struct perf_event *event)
   1429{
   1430	if (event->attr.type != event->pmu->type)
   1431		return -ENOENT;
   1432
   1433	if (!perfmon_capable())
   1434		return -EACCES;
   1435
    1436	/* Return if this is a counting event */
   1437	if (event->attr.sample_period == 0)
   1438		return -ENOENT;
   1439
   1440	/*
   1441	 * Take the global lock, and make sure
   1442	 * no other thread is running any core/thread imc
   1443	 * events
   1444	 */
   1445	mutex_lock(&imc_global_refc.lock);
   1446	if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) {
   1447		/*
   1448		 * No core/thread imc events are running in the
   1449		 * system, so set the refc.id to trace-imc.
   1450		 */
   1451		imc_global_refc.id = IMC_DOMAIN_TRACE;
   1452		imc_global_refc.refc++;
   1453	} else {
   1454		mutex_unlock(&imc_global_refc.lock);
   1455		return -EBUSY;
   1456	}
   1457	mutex_unlock(&imc_global_refc.lock);
   1458
   1459	event->hw.idx = -1;
   1460
   1461	/*
    1462	 * There can only be a single PMU for perf_hw_context events, and it is
    1463	 * assigned to the core PMU. Hence use "perf_sw_context" for trace_imc.
   1464	 */
   1465	event->pmu->task_ctx_nr = perf_sw_context;
   1466	event->destroy = reset_global_refc;
   1467	return 0;
   1468}
   1469
   1470/* update_pmu_ops : Populate the appropriate operations for "pmu" */
   1471static int update_pmu_ops(struct imc_pmu *pmu)
   1472{
   1473	pmu->pmu.task_ctx_nr = perf_invalid_context;
   1474	pmu->pmu.add = imc_event_add;
   1475	pmu->pmu.del = imc_event_stop;
   1476	pmu->pmu.start = imc_event_start;
   1477	pmu->pmu.stop = imc_event_stop;
   1478	pmu->pmu.read = imc_event_update;
   1479	pmu->pmu.attr_groups = pmu->attr_groups;
   1480	pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
   1481	pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group;
   1482
   1483	switch (pmu->domain) {
   1484	case IMC_DOMAIN_NEST:
   1485		pmu->pmu.event_init = nest_imc_event_init;
   1486		pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group;
   1487		break;
   1488	case IMC_DOMAIN_CORE:
   1489		pmu->pmu.event_init = core_imc_event_init;
   1490		pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group;
   1491		break;
   1492	case IMC_DOMAIN_THREAD:
   1493		pmu->pmu.event_init = thread_imc_event_init;
   1494		pmu->pmu.add = thread_imc_event_add;
   1495		pmu->pmu.del = thread_imc_event_del;
   1496		pmu->pmu.start_txn = thread_imc_pmu_start_txn;
   1497		pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn;
   1498		pmu->pmu.commit_txn = thread_imc_pmu_commit_txn;
   1499		break;
   1500	case IMC_DOMAIN_TRACE:
   1501		pmu->pmu.event_init = trace_imc_event_init;
   1502		pmu->pmu.add = trace_imc_event_add;
   1503		pmu->pmu.del = trace_imc_event_del;
   1504		pmu->pmu.start = trace_imc_event_start;
   1505		pmu->pmu.stop = trace_imc_event_stop;
   1506		pmu->pmu.read = trace_imc_event_read;
   1507		pmu->attr_groups[IMC_FORMAT_ATTR] = &trace_imc_format_group;
   1508		break;
   1509	default:
   1510		break;
   1511	}
   1512
   1513	return 0;
   1514}
   1515
   1516/* init_nest_pmu_ref: Initialize the imc_pmu_ref struct for all the nodes */
   1517static int init_nest_pmu_ref(void)
   1518{
   1519	int nid, i, cpu;
   1520
   1521	nest_imc_refc = kcalloc(num_possible_nodes(), sizeof(*nest_imc_refc),
   1522								GFP_KERNEL);
   1523
   1524	if (!nest_imc_refc)
   1525		return -ENOMEM;
   1526
   1527	i = 0;
   1528	for_each_node(nid) {
   1529		/*
   1530		 * Mutex lock to avoid races while tracking the number of
   1531		 * sessions using the chip's nest pmu units.
   1532		 */
   1533		mutex_init(&nest_imc_refc[i].lock);
   1534
   1535		/*
    1536		 * Loop to init the "id" with the node_id. Variable "i" is initialized
    1537		 * to 0 and used as an index into the array. "i" will not go off the
   1538		 * end of the array since the "for_each_node" loops for "N_POSSIBLE"
   1539		 * nodes only.
   1540		 */
   1541		nest_imc_refc[i++].id = nid;
   1542	}
   1543
   1544	/*
   1545	 * Loop to init the per_cpu "local_nest_imc_refc" with the proper
    1546	 * "nest_imc_refc" index. This makes get_nest_pmu_ref() a lot simpler.
   1547	 */
   1548	for_each_possible_cpu(cpu) {
   1549		nid = cpu_to_node(cpu);
   1550		for (i = 0; i < num_possible_nodes(); i++) {
   1551			if (nest_imc_refc[i].id == nid) {
   1552				per_cpu(local_nest_imc_refc, cpu) = &nest_imc_refc[i];
   1553				break;
   1554			}
   1555		}
   1556	}
   1557	return 0;
   1558}
   1559
   1560static void cleanup_all_core_imc_memory(void)
   1561{
   1562	int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
   1563	struct imc_mem_info *ptr = core_imc_pmu->mem_info;
   1564	int size = core_imc_pmu->counter_mem_size;
   1565
   1566	/* mem_info will never be NULL */
   1567	for (i = 0; i < nr_cores; i++) {
   1568		if (ptr[i].vbase)
   1569			free_pages((u64)ptr[i].vbase, get_order(size));
   1570	}
   1571
   1572	kfree(ptr);
   1573	kfree(core_imc_refc);
   1574}
   1575
   1576static void thread_imc_ldbar_disable(void *dummy)
   1577{
   1578	/*
    1579	 * By setting the 0th bit of LDBAR to zero, we disable thread-imc
   1580	 * updates to memory.
   1581	 */
   1582	mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
   1583}
   1584
   1585void thread_imc_disable(void)
   1586{
   1587	on_each_cpu(thread_imc_ldbar_disable, NULL, 1);
   1588}
   1589
   1590static void cleanup_all_thread_imc_memory(void)
   1591{
   1592	int i, order = get_order(thread_imc_mem_size);
   1593
   1594	for_each_online_cpu(i) {
   1595		if (per_cpu(thread_imc_mem, i))
   1596			free_pages((u64)per_cpu(thread_imc_mem, i), order);
   1597
   1598	}
   1599}
   1600
   1601static void cleanup_all_trace_imc_memory(void)
   1602{
   1603	int i, order = get_order(trace_imc_mem_size);
   1604
   1605	for_each_online_cpu(i) {
   1606		if (per_cpu(trace_imc_mem, i))
   1607			free_pages((u64)per_cpu(trace_imc_mem, i), order);
   1608
   1609	}
   1610	kfree(trace_imc_refc);
   1611}
   1612
   1613/* Function to free the attr_groups which are dynamically allocated */
   1614static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
   1615{
   1616	if (pmu_ptr->attr_groups[IMC_EVENT_ATTR])
   1617		kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
   1618	kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
   1619}
   1620
   1621/*
    1622 * Common function to unregister the cpu hotplug callback and
    1623 * free the memory.
    1624 * TODO: Need to handle pmu unregistering, which will be
    1625 * done in a follow-up series.
   1626 */
   1627static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
   1628{
   1629	if (pmu_ptr->domain == IMC_DOMAIN_NEST) {
   1630		mutex_lock(&nest_init_lock);
   1631		if (nest_pmus == 1) {
   1632			cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
   1633			kfree(nest_imc_refc);
   1634			kfree(per_nest_pmu_arr);
   1635			per_nest_pmu_arr = NULL;
   1636		}
   1637
   1638		if (nest_pmus > 0)
   1639			nest_pmus--;
   1640		mutex_unlock(&nest_init_lock);
   1641	}
   1642
   1643	/* Free core_imc memory */
   1644	if (pmu_ptr->domain == IMC_DOMAIN_CORE) {
   1645		cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE);
   1646		cleanup_all_core_imc_memory();
   1647	}
   1648
   1649	/* Free thread_imc memory */
   1650	if (pmu_ptr->domain == IMC_DOMAIN_THREAD) {
   1651		cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
   1652		cleanup_all_thread_imc_memory();
   1653	}
   1654
   1655	if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
   1656		cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
   1657		cleanup_all_trace_imc_memory();
   1658	}
   1659}
   1660
   1661/*
   1662 * Function to unregister thread-imc if core-imc
   1663 * is not registered.
   1664 */
   1665void unregister_thread_imc(void)
   1666{
   1667	imc_common_cpuhp_mem_free(thread_imc_pmu);
   1668	imc_common_mem_free(thread_imc_pmu);
   1669	perf_pmu_unregister(&thread_imc_pmu->pmu);
   1670}
   1671
   1672/*
    1673 * imc_mem_init : Function to support memory allocation for the imc pmus.
   1674 */
   1675static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent,
   1676								int pmu_index)
   1677{
   1678	const char *s;
   1679	int nr_cores, cpu, res = -ENOMEM;
   1680
   1681	if (of_property_read_string(parent, "name", &s))
   1682		return -ENODEV;
   1683
   1684	switch (pmu_ptr->domain) {
   1685	case IMC_DOMAIN_NEST:
   1686		/* Update the pmu name */
   1687		pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s);
   1688		if (!pmu_ptr->pmu.name)
   1689			goto err;
   1690
   1691		/* Needed for hotplug/migration */
   1692		if (!per_nest_pmu_arr) {
   1693			per_nest_pmu_arr = kcalloc(get_max_nest_dev() + 1,
   1694						sizeof(struct imc_pmu *),
   1695						GFP_KERNEL);
   1696			if (!per_nest_pmu_arr)
   1697				goto err;
   1698		}
   1699		per_nest_pmu_arr[pmu_index] = pmu_ptr;
   1700		break;
   1701	case IMC_DOMAIN_CORE:
   1702		/* Update the pmu name */
   1703		pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
   1704		if (!pmu_ptr->pmu.name)
   1705			goto err;
   1706
   1707		nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
   1708		pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info),
   1709								GFP_KERNEL);
   1710
   1711		if (!pmu_ptr->mem_info)
   1712			goto err;
   1713
   1714		core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
   1715								GFP_KERNEL);
   1716
   1717		if (!core_imc_refc) {
   1718			kfree(pmu_ptr->mem_info);
   1719			goto err;
   1720		}
   1721
   1722		core_imc_pmu = pmu_ptr;
   1723		break;
   1724	case IMC_DOMAIN_THREAD:
   1725		/* Update the pmu name */
   1726		pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
   1727		if (!pmu_ptr->pmu.name)
   1728			goto err;
   1729
   1730		thread_imc_mem_size = pmu_ptr->counter_mem_size;
   1731		for_each_online_cpu(cpu) {
   1732			res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size);
   1733			if (res) {
   1734				cleanup_all_thread_imc_memory();
   1735				goto err;
   1736			}
   1737		}
   1738
   1739		thread_imc_pmu = pmu_ptr;
   1740		break;
   1741	case IMC_DOMAIN_TRACE:
   1742		/* Update the pmu name */
   1743		pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
   1744		if (!pmu_ptr->pmu.name)
   1745			return -ENOMEM;
   1746
   1747		nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
   1748		trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
   1749								GFP_KERNEL);
   1750		if (!trace_imc_refc)
   1751			return -ENOMEM;
   1752
   1753		trace_imc_mem_size = pmu_ptr->counter_mem_size;
   1754		for_each_online_cpu(cpu) {
   1755			res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
   1756			if (res) {
   1757				cleanup_all_trace_imc_memory();
   1758				goto err;
   1759			}
   1760		}
   1761		break;
   1762	default:
   1763		return -EINVAL;
   1764	}
   1765
   1766	return 0;
   1767err:
   1768	return res;
   1769}
   1770
   1771/*
   1772 * init_imc_pmu : Setup and register the IMC pmu device.
   1773 *
   1774 * @parent:	Device tree unit node
   1775 * @pmu_ptr:	memory allocated for this pmu
   1776 * @pmu_idx:	Count of nest pmc registered
   1777 *
   1778 * init_imc_pmu() setup pmu cpumask and registers for a cpu hotplug callback.
   1779 * Handles failure cases and accordingly frees memory.
   1780 */
   1781int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx)
   1782{
   1783	int ret;
   1784
   1785	ret = imc_mem_init(pmu_ptr, parent, pmu_idx);
   1786	if (ret)
   1787		goto err_free_mem;
   1788
   1789	switch (pmu_ptr->domain) {
   1790	case IMC_DOMAIN_NEST:
   1791		/*
    1792		 * Nest imc pmus need only one cpu per chip; we initialize the
    1793		 * cpumask for the first nest imc pmu and use the same for the
    1794		 * rest. To handle the cpuhotplug callback unregister, we track
    1795		 * the number of nest pmus in "nest_pmus".
    1796		 */
   1797		mutex_lock(&nest_init_lock);
   1798		if (nest_pmus == 0) {
   1799			ret = init_nest_pmu_ref();
   1800			if (ret) {
   1801				mutex_unlock(&nest_init_lock);
   1802				kfree(per_nest_pmu_arr);
   1803				per_nest_pmu_arr = NULL;
   1804				goto err_free_mem;
   1805			}
   1806			/* Register for cpu hotplug notification. */
   1807			ret = nest_pmu_cpumask_init();
   1808			if (ret) {
   1809				mutex_unlock(&nest_init_lock);
   1810				kfree(nest_imc_refc);
   1811				kfree(per_nest_pmu_arr);
   1812				per_nest_pmu_arr = NULL;
   1813				goto err_free_mem;
   1814			}
   1815		}
   1816		nest_pmus++;
   1817		mutex_unlock(&nest_init_lock);
   1818		break;
   1819	case IMC_DOMAIN_CORE:
   1820		ret = core_imc_pmu_cpumask_init();
   1821		if (ret) {
   1822			cleanup_all_core_imc_memory();
   1823			goto err_free_mem;
   1824		}
   1825
   1826		break;
   1827	case IMC_DOMAIN_THREAD:
   1828		ret = thread_imc_cpu_init();
   1829		if (ret) {
   1830			cleanup_all_thread_imc_memory();
   1831			goto err_free_mem;
   1832		}
   1833
   1834		break;
   1835	case IMC_DOMAIN_TRACE:
   1836		ret = trace_imc_cpu_init();
   1837		if (ret) {
   1838			cleanup_all_trace_imc_memory();
   1839			goto err_free_mem;
   1840		}
   1841
   1842		break;
   1843	default:
   1844		return  -EINVAL;	/* Unknown domain */
   1845	}
   1846
   1847	ret = update_events_in_group(parent, pmu_ptr);
   1848	if (ret)
   1849		goto err_free_cpuhp_mem;
   1850
   1851	ret = update_pmu_ops(pmu_ptr);
   1852	if (ret)
   1853		goto err_free_cpuhp_mem;
   1854
   1855	ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
   1856	if (ret)
   1857		goto err_free_cpuhp_mem;
   1858
   1859	pr_debug("%s performance monitor hardware support registered\n",
   1860							pmu_ptr->pmu.name);
   1861
   1862	return 0;
   1863
   1864err_free_cpuhp_mem:
   1865	imc_common_cpuhp_mem_free(pmu_ptr);
   1866err_free_mem:
   1867	imc_common_mem_free(pmu_ptr);
   1868	return ret;
   1869}
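
/*
 * Illustrative only (not part of the original file): a rough sketch of how a
 * platform driver might hand one device-tree IMC unit to init_imc_pmu().
 * The "size" property name, the chosen domain and the error handling are
 * placeholders; in the real kernel this is driven by the firmware-provided
 * IMC device tree in the powernv OPAL platform code.
 */
#if 0	/* example, never compiled */
static int __init example_register_one_imc(struct device_node *node, int idx)
{
	struct imc_pmu *pmu;
	u32 size;

	pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
	if (!pmu)
		return -ENOMEM;

	pmu->domain = IMC_DOMAIN_NEST;		/* placeholder domain */

	/* counter memory size, assumed to come from a "size" property */
	if (of_property_read_u32(node, "size", &size)) {
		kfree(pmu);
		return -ENODEV;
	}
	pmu->counter_mem_size = size;

	/* registers cpuhp callbacks, sysfs groups and the perf pmu itself */
	return init_imc_pmu(node, pmu, idx);
}
#endif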