cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pt.c (44620B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Intel(R) Processor Trace PMU driver for perf
      4 * Copyright (c) 2013-2014, Intel Corporation.
      5 *
      6 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
      7 * Programming Reference:
      8 * http://software.intel.com/en-us/intel-isa-extensions
      9 */
     10
     11#undef DEBUG
     12
     13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     14
     15#include <linux/types.h>
     16#include <linux/bits.h>
     17#include <linux/limits.h>
     18#include <linux/slab.h>
     19#include <linux/device.h>
     20
     21#include <asm/perf_event.h>
     22#include <asm/insn.h>
     23#include <asm/io.h>
     24#include <asm/intel_pt.h>
     25#include <asm/intel-family.h>
     26
     27#include "../perf_event.h"
     28#include "pt.h"
     29
     30static DEFINE_PER_CPU(struct pt, pt_ctx);
     31
     32static struct pt_pmu pt_pmu;
     33
     34/*
     35 * Capabilities of Intel PT hardware, such as number of address bits or
     36 * supported output schemes, are cached and exported to userspace as "caps"
     37 * attribute group of pt pmu device
     38 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
     39 * relevant bits together with intel_pt traces.
     40 *
     41 * These are necessary for both trace decoding (payloads_lip, contains address
     42 * width encoded in IP-related packets), and event configuration (bitmasks with
     43 * permitted values for certain bit fields).
     44 */
     45#define PT_CAP(_n, _l, _r, _m)						\
     46	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
     47			    .reg = _r, .mask = _m }
     48
     49static struct pt_cap_desc {
     50	const char	*name;
     51	u32		leaf;
     52	u8		reg;
     53	u32		mask;
     54} pt_caps[] = {
     55	PT_CAP(max_subleaf,		0, CPUID_EAX, 0xffffffff),
     56	PT_CAP(cr3_filtering,		0, CPUID_EBX, BIT(0)),
     57	PT_CAP(psb_cyc,			0, CPUID_EBX, BIT(1)),
     58	PT_CAP(ip_filtering,		0, CPUID_EBX, BIT(2)),
     59	PT_CAP(mtc,			0, CPUID_EBX, BIT(3)),
     60	PT_CAP(ptwrite,			0, CPUID_EBX, BIT(4)),
     61	PT_CAP(power_event_trace,	0, CPUID_EBX, BIT(5)),
     62	PT_CAP(event_trace,		0, CPUID_EBX, BIT(7)),
     63	PT_CAP(tnt_disable,		0, CPUID_EBX, BIT(8)),
     64	PT_CAP(topa_output,		0, CPUID_ECX, BIT(0)),
     65	PT_CAP(topa_multiple_entries,	0, CPUID_ECX, BIT(1)),
     66	PT_CAP(single_range_output,	0, CPUID_ECX, BIT(2)),
     67	PT_CAP(output_subsys,		0, CPUID_ECX, BIT(3)),
     68	PT_CAP(payloads_lip,		0, CPUID_ECX, BIT(31)),
     69	PT_CAP(num_address_ranges,	1, CPUID_EAX, 0x7),
     70	PT_CAP(mtc_periods,		1, CPUID_EAX, 0xffff0000),
     71	PT_CAP(cycle_thresholds,	1, CPUID_EBX, 0xffff),
     72	PT_CAP(psb_periods,		1, CPUID_EBX, 0xffff0000),
     73};
     74
     75u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
     76{
     77	struct pt_cap_desc *cd = &pt_caps[capability];
     78	u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
     79	unsigned int shift = __ffs(cd->mask);
     80
     81	return (c & cd->mask) >> shift;
     82}
     83EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
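/*
 * Worked example (illustrative, not part of the original source): for
 * PT_CAP_mtc_periods the descriptor above is { .leaf = 1, .reg = CPUID_EAX,
 * .mask = 0xffff0000 }, so __ffs() yields a shift of 16 and
 * intel_pt_validate_hw_cap(PT_CAP_mtc_periods) returns bits 31:16 of
 * CPUID.(EAX=14H,ECX=1):EAX, i.e. the bitmap of supported MTC periods.
 * Userspace can read the same value (in hex) from
 * /sys/bus/event_source/devices/intel_pt/caps/mtc_periods.
 */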
     84
     85u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
     86{
     87	return intel_pt_validate_cap(pt_pmu.caps, cap);
     88}
     89EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
     90
     91static ssize_t pt_cap_show(struct device *cdev,
     92			   struct device_attribute *attr,
     93			   char *buf)
     94{
     95	struct dev_ext_attribute *ea =
     96		container_of(attr, struct dev_ext_attribute, attr);
     97	enum pt_capabilities cap = (long)ea->var;
     98
     99	return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
    100}
    101
    102static struct attribute_group pt_cap_group __ro_after_init = {
    103	.name	= "caps",
    104};
    105
    106PMU_FORMAT_ATTR(pt,		"config:0"	);
    107PMU_FORMAT_ATTR(cyc,		"config:1"	);
    108PMU_FORMAT_ATTR(pwr_evt,	"config:4"	);
    109PMU_FORMAT_ATTR(fup_on_ptw,	"config:5"	);
    110PMU_FORMAT_ATTR(mtc,		"config:9"	);
    111PMU_FORMAT_ATTR(tsc,		"config:10"	);
    112PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
    113PMU_FORMAT_ATTR(ptw,		"config:12"	);
    114PMU_FORMAT_ATTR(branch,		"config:13"	);
    115PMU_FORMAT_ATTR(event,		"config:31"	);
    116PMU_FORMAT_ATTR(notnt,		"config:55"	);
    117PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
    118PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
    119PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);
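/*
 * Usage sketch (illustrative): the perf tool maps these format strings to
 * attr.config bits, so something like
 *
 *	perf record -e intel_pt/cyc=1,cyc_thresh=2,psb_period=3/u -- <workload>
 *
 * would set config bit 1 (cyc), bits 19-22 (cyc_thresh) and bits 24-27
 * (psb_period), subject to the capability checks in pt_event_valid() below.
 */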
    120
    121static struct attribute *pt_formats_attr[] = {
    122	&format_attr_pt.attr,
    123	&format_attr_cyc.attr,
    124	&format_attr_pwr_evt.attr,
    125	&format_attr_event.attr,
    126	&format_attr_notnt.attr,
    127	&format_attr_fup_on_ptw.attr,
    128	&format_attr_mtc.attr,
    129	&format_attr_tsc.attr,
    130	&format_attr_noretcomp.attr,
    131	&format_attr_ptw.attr,
    132	&format_attr_branch.attr,
    133	&format_attr_mtc_period.attr,
    134	&format_attr_cyc_thresh.attr,
    135	&format_attr_psb_period.attr,
    136	NULL,
    137};
    138
    139static struct attribute_group pt_format_group = {
    140	.name	= "format",
    141	.attrs	= pt_formats_attr,
    142};
    143
    144static ssize_t
    145pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
    146		    char *page)
    147{
    148	struct perf_pmu_events_attr *pmu_attr =
    149		container_of(attr, struct perf_pmu_events_attr, attr);
    150
    151	switch (pmu_attr->id) {
    152	case 0:
    153		return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
    154	case 1:
    155		return sprintf(page, "%u:%u\n",
    156			       pt_pmu.tsc_art_num,
    157			       pt_pmu.tsc_art_den);
    158	default:
    159		break;
    160	}
    161
    162	return -EINVAL;
    163}
    164
    165PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
    166	       pt_timing_attr_show);
    167PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
    168	       pt_timing_attr_show);
    169
    170static struct attribute *pt_timing_attr[] = {
    171	&timing_attr_max_nonturbo_ratio.attr.attr,
    172	&timing_attr_tsc_art_ratio.attr.attr,
    173	NULL,
    174};
    175
    176static struct attribute_group pt_timing_group = {
    177	.attrs	= pt_timing_attr,
    178};
    179
    180static const struct attribute_group *pt_attr_groups[] = {
    181	&pt_cap_group,
    182	&pt_format_group,
    183	&pt_timing_group,
    184	NULL,
    185};
    186
    187static int __init pt_pmu_hw_init(void)
    188{
    189	struct dev_ext_attribute *de_attrs;
    190	struct attribute **attrs;
    191	size_t size;
    192	u64 reg;
    193	int ret;
    194	long i;
    195
    196	rdmsrl(MSR_PLATFORM_INFO, reg);
    197	pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
    198
    199	/*
     200	 * If available, read the TSC to core crystal clock ratio;
     201	 * otherwise, a zero numerator stands for "not enumerated",
     202	 * as per the SDM.
    203	 */
    204	if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
    205		u32 eax, ebx, ecx, edx;
    206
    207		cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
    208
    209		pt_pmu.tsc_art_num = ebx;
    210		pt_pmu.tsc_art_den = eax;
    211	}
    212
    213	/* model-specific quirks */
    214	switch (boot_cpu_data.x86_model) {
    215	case INTEL_FAM6_BROADWELL:
    216	case INTEL_FAM6_BROADWELL_D:
    217	case INTEL_FAM6_BROADWELL_G:
    218	case INTEL_FAM6_BROADWELL_X:
    219		/* not setting BRANCH_EN will #GP, erratum BDM106 */
    220		pt_pmu.branch_en_always_on = true;
    221		break;
    222	default:
    223		break;
    224	}
    225
    226	if (boot_cpu_has(X86_FEATURE_VMX)) {
    227		/*
    228		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
    229		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
    230		 * post-VMXON.
    231		 */
    232		rdmsrl(MSR_IA32_VMX_MISC, reg);
    233		if (reg & BIT(14))
    234			pt_pmu.vmx = true;
    235	}
    236
    237	for (i = 0; i < PT_CPUID_LEAVES; i++) {
    238		cpuid_count(20, i,
    239			    &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
    240			    &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
    241			    &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
    242			    &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
    243	}
    244
    245	ret = -ENOMEM;
    246	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
    247	attrs = kzalloc(size, GFP_KERNEL);
    248	if (!attrs)
    249		goto fail;
    250
    251	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
    252	de_attrs = kzalloc(size, GFP_KERNEL);
    253	if (!de_attrs)
    254		goto fail;
    255
    256	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
    257		struct dev_ext_attribute *de_attr = de_attrs + i;
    258
    259		de_attr->attr.attr.name = pt_caps[i].name;
    260
    261		sysfs_attr_init(&de_attr->attr.attr);
    262
    263		de_attr->attr.attr.mode		= S_IRUGO;
    264		de_attr->attr.show		= pt_cap_show;
    265		de_attr->var			= (void *)i;
    266
    267		attrs[i] = &de_attr->attr.attr;
    268	}
    269
    270	pt_cap_group.attrs = attrs;
    271
    272	return 0;
    273
    274fail:
    275	kfree(attrs);
    276
    277	return ret;
    278}
    279
    280#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC	| \
    281			  RTIT_CTL_CYC_THRESH	| \
    282			  RTIT_CTL_PSB_FREQ)
    283
    284#define RTIT_CTL_MTC	(RTIT_CTL_MTC_EN	| \
    285			 RTIT_CTL_MTC_RANGE)
    286
    287#define RTIT_CTL_PTW	(RTIT_CTL_PTW_EN	| \
    288			 RTIT_CTL_FUP_ON_PTW)
    289
    290/*
    291 * Bit 0 (TraceEn) in the attr.config is meaningless as the
    292 * corresponding bit in the RTIT_CTL can only be controlled
    293 * by the driver; therefore, repurpose it to mean: pass
    294 * through the bit that was previously assumed to be always
    295 * on for PT, thereby allowing the user to *not* set it if
    296 * they so wish. See also pt_event_valid() and pt_config().
    297 */
    298#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
    299
    300#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN	| \
    301			RTIT_CTL_TSC_EN		| \
    302			RTIT_CTL_DISRETC	| \
    303			RTIT_CTL_BRANCH_EN	| \
    304			RTIT_CTL_CYC_PSB	| \
    305			RTIT_CTL_MTC		| \
    306			RTIT_CTL_PWR_EVT_EN	| \
    307			RTIT_CTL_EVENT_EN	| \
    308			RTIT_CTL_NOTNT		| \
    309			RTIT_CTL_FUP_ON_PTW	| \
    310			RTIT_CTL_PTW_EN)
    311
    312static bool pt_event_valid(struct perf_event *event)
    313{
    314	u64 config = event->attr.config;
    315	u64 allowed, requested;
    316
    317	if ((config & PT_CONFIG_MASK) != config)
    318		return false;
    319
    320	if (config & RTIT_CTL_CYC_PSB) {
    321		if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
    322			return false;
    323
    324		allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
    325		requested = (config & RTIT_CTL_PSB_FREQ) >>
    326			RTIT_CTL_PSB_FREQ_OFFSET;
    327		if (requested && (!(allowed & BIT(requested))))
    328			return false;
    329
    330		allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
    331		requested = (config & RTIT_CTL_CYC_THRESH) >>
    332			RTIT_CTL_CYC_THRESH_OFFSET;
    333		if (requested && (!(allowed & BIT(requested))))
    334			return false;
    335	}
    336
    337	if (config & RTIT_CTL_MTC) {
    338		/*
    339		 * In the unlikely case that CPUID lists valid mtc periods,
    340		 * but not the mtc capability, drop out here.
    341		 *
    342		 * Spec says that setting mtc period bits while mtc bit in
    343		 * CPUID is 0 will #GP, so better safe than sorry.
    344		 */
    345		if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
    346			return false;
    347
    348		allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
    349		if (!allowed)
    350			return false;
    351
    352		requested = (config & RTIT_CTL_MTC_RANGE) >>
    353			RTIT_CTL_MTC_RANGE_OFFSET;
    354
    355		if (!(allowed & BIT(requested)))
    356			return false;
    357	}
    358
    359	if (config & RTIT_CTL_PWR_EVT_EN &&
    360	    !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
    361		return false;
    362
    363	if (config & RTIT_CTL_EVENT_EN &&
    364	    !intel_pt_validate_hw_cap(PT_CAP_event_trace))
    365		return false;
    366
    367	if (config & RTIT_CTL_NOTNT &&
    368	    !intel_pt_validate_hw_cap(PT_CAP_tnt_disable))
    369		return false;
    370
    371	if (config & RTIT_CTL_PTW) {
    372		if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
    373			return false;
    374
    375		/* FUPonPTW without PTW doesn't make sense */
    376		if ((config & RTIT_CTL_FUP_ON_PTW) &&
    377		    !(config & RTIT_CTL_PTW_EN))
    378			return false;
    379	}
    380
    381	/*
    382	 * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
    383	 * clears the assumption that BranchEn must always be enabled,
    384	 * as was the case with the first implementation of PT.
    385	 * If this bit is not set, the legacy behavior is preserved
    386	 * for compatibility with the older userspace.
    387	 *
    388	 * Re-using bit 0 for this purpose is fine because it is never
    389	 * directly set by the user; previous attempts at setting it in
    390	 * the attr.config resulted in -EINVAL.
    391	 */
    392	if (config & RTIT_CTL_PASSTHROUGH) {
    393		/*
    394		 * Disallow not setting BRANCH_EN where BRANCH_EN is
    395		 * always required.
    396		 */
    397		if (pt_pmu.branch_en_always_on &&
    398		    !(config & RTIT_CTL_BRANCH_EN))
    399			return false;
    400	} else {
    401		/*
    402		 * Disallow BRANCH_EN without the PASSTHROUGH.
    403		 */
    404		if (config & RTIT_CTL_BRANCH_EN)
    405			return false;
    406	}
    407
    408	return true;
    409}
    410
    411/*
    412 * PT configuration helpers
    413 * These all are cpu affine and operate on a local PT
    414 */
    415
    416static void pt_config_start(struct perf_event *event)
    417{
    418	struct pt *pt = this_cpu_ptr(&pt_ctx);
    419	u64 ctl = event->hw.config;
    420
    421	ctl |= RTIT_CTL_TRACEEN;
    422	if (READ_ONCE(pt->vmx_on))
    423		perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
    424	else
    425		wrmsrl(MSR_IA32_RTIT_CTL, ctl);
    426
    427	WRITE_ONCE(event->hw.config, ctl);
    428}
    429
    430/* Address ranges and their corresponding msr configuration registers */
    431static const struct pt_address_range {
    432	unsigned long	msr_a;
    433	unsigned long	msr_b;
    434	unsigned int	reg_off;
    435} pt_address_ranges[] = {
    436	{
    437		.msr_a	 = MSR_IA32_RTIT_ADDR0_A,
    438		.msr_b	 = MSR_IA32_RTIT_ADDR0_B,
    439		.reg_off = RTIT_CTL_ADDR0_OFFSET,
    440	},
    441	{
    442		.msr_a	 = MSR_IA32_RTIT_ADDR1_A,
    443		.msr_b	 = MSR_IA32_RTIT_ADDR1_B,
    444		.reg_off = RTIT_CTL_ADDR1_OFFSET,
    445	},
    446	{
    447		.msr_a	 = MSR_IA32_RTIT_ADDR2_A,
    448		.msr_b	 = MSR_IA32_RTIT_ADDR2_B,
    449		.reg_off = RTIT_CTL_ADDR2_OFFSET,
    450	},
    451	{
    452		.msr_a	 = MSR_IA32_RTIT_ADDR3_A,
    453		.msr_b	 = MSR_IA32_RTIT_ADDR3_B,
    454		.reg_off = RTIT_CTL_ADDR3_OFFSET,
    455	}
    456};
    457
    458static u64 pt_config_filters(struct perf_event *event)
    459{
    460	struct pt_filters *filters = event->hw.addr_filters;
    461	struct pt *pt = this_cpu_ptr(&pt_ctx);
    462	unsigned int range = 0;
    463	u64 rtit_ctl = 0;
    464
    465	if (!filters)
    466		return 0;
    467
    468	perf_event_addr_filters_sync(event);
    469
    470	for (range = 0; range < filters->nr_filters; range++) {
    471		struct pt_filter *filter = &filters->filter[range];
    472
    473		/*
    474		 * Note, if the range has zero start/end addresses due
    475		 * to its dynamic object not being loaded yet, we just
    476		 * go ahead and program zeroed range, which will simply
    477		 * produce no data. Note^2: if executable code at 0x0
    478		 * is a concern, we can set up an "invalid" configuration
    479		 * such as msr_b < msr_a.
    480		 */
    481
    482		/* avoid redundant msr writes */
    483		if (pt->filters.filter[range].msr_a != filter->msr_a) {
    484			wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
    485			pt->filters.filter[range].msr_a = filter->msr_a;
    486		}
    487
    488		if (pt->filters.filter[range].msr_b != filter->msr_b) {
    489			wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
    490			pt->filters.filter[range].msr_b = filter->msr_b;
    491		}
    492
    493		rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
    494	}
    495
    496	return rtit_ctl;
    497}
    498
    499static void pt_config(struct perf_event *event)
    500{
    501	struct pt *pt = this_cpu_ptr(&pt_ctx);
    502	struct pt_buffer *buf = perf_get_aux(&pt->handle);
    503	u64 reg;
    504
    505	/* First round: clear STATUS, in particular the PSB byte counter. */
    506	if (!event->hw.config) {
    507		perf_event_itrace_started(event);
    508		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
    509	}
    510
    511	reg = pt_config_filters(event);
    512	reg |= RTIT_CTL_TRACEEN;
    513	if (!buf->single)
    514		reg |= RTIT_CTL_TOPA;
    515
    516	/*
    517	 * Previously, we had BRANCH_EN on by default, but now that PT has
    518	 * grown features outside of branch tracing, it is useful to allow
    519	 * the user to disable it. Setting bit 0 in the event's attr.config
    520	 * allows BRANCH_EN to pass through instead of being always on. See
    521	 * also the comment in pt_event_valid().
    522	 */
    523	if (event->attr.config & BIT(0)) {
    524		reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
    525	} else {
    526		reg |= RTIT_CTL_BRANCH_EN;
    527	}
    528
    529	if (!event->attr.exclude_kernel)
    530		reg |= RTIT_CTL_OS;
    531	if (!event->attr.exclude_user)
    532		reg |= RTIT_CTL_USR;
    533
    534	reg |= (event->attr.config & PT_CONFIG_MASK);
    535
    536	event->hw.config = reg;
    537	pt_config_start(event);
    538}
    539
    540static void pt_config_stop(struct perf_event *event)
    541{
    542	struct pt *pt = this_cpu_ptr(&pt_ctx);
    543	u64 ctl = READ_ONCE(event->hw.config);
    544
    545	/* may be already stopped by a PMI */
    546	if (!(ctl & RTIT_CTL_TRACEEN))
    547		return;
    548
    549	ctl &= ~RTIT_CTL_TRACEEN;
    550	if (!READ_ONCE(pt->vmx_on))
    551		wrmsrl(MSR_IA32_RTIT_CTL, ctl);
    552
    553	WRITE_ONCE(event->hw.config, ctl);
    554
    555	/*
    556	 * A wrmsr that disables trace generation serializes other PT
    557	 * registers and causes all data packets to be written to memory,
    558	 * but a fence is required for the data to become globally visible.
    559	 *
    560	 * The below WMB, separating data store and aux_head store matches
    561	 * the consumer's RMB that separates aux_head load and data load.
    562	 */
    563	wmb();
    564}
    565
    566/**
    567 * struct topa - ToPA metadata
    568 * @list:	linkage to struct pt_buffer's list of tables
    569 * @offset:	offset of the first entry in this table in the buffer
    570 * @size:	total size of all entries in this table
    571 * @last:	index of the last initialized entry in this table
    572 * @z_count:	how many times the first entry repeats
    573 */
    574struct topa {
    575	struct list_head	list;
    576	u64			offset;
    577	size_t			size;
    578	int			last;
    579	unsigned int		z_count;
    580};
    581
    582/*
    583 * Keep ToPA table-related metadata on the same page as the actual table,
    584 * taking up a few words from the top
    585 */
    586
    587#define TENTS_PER_PAGE	\
    588	((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))
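/*
 * Illustrative sizing (assuming 4KiB pages, 8-byte ToPA entries and a
 * 40-byte struct topa on x86_64): TENTS_PER_PAGE works out to
 * (4096 - 40) / 8 = 507 table entries per page.
 */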
    589
    590/**
    591 * struct topa_page - page-sized ToPA table with metadata at the top
    592 * @table:	actual ToPA table entries, as understood by PT hardware
    593 * @topa:	metadata
    594 */
    595struct topa_page {
    596	struct topa_entry	table[TENTS_PER_PAGE];
    597	struct topa		topa;
    598};
    599
    600static inline struct topa_page *topa_to_page(struct topa *topa)
    601{
    602	return container_of(topa, struct topa_page, topa);
    603}
    604
    605static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
    606{
    607	return (struct topa_page *)((unsigned long)te & PAGE_MASK);
    608}
    609
    610static inline phys_addr_t topa_pfn(struct topa *topa)
    611{
    612	return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
    613}
    614
    615/* make -1 stand for the last table entry */
    616#define TOPA_ENTRY(t, i)				\
    617	((i) == -1					\
    618		? &topa_to_page(t)->table[(t)->last]	\
    619		: &topa_to_page(t)->table[(i)])
    620#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
    621#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
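/*
 * Example (illustrative): a ToPA entry's size field encodes the output
 * region as 4KiB << size, so an entry with size == 3 spans
 * TOPA_ENTRY_PAGES() == 8 pages, i.e. a 32KiB region.
 */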
    622
    623static void pt_config_buffer(struct pt_buffer *buf)
    624{
    625	struct pt *pt = this_cpu_ptr(&pt_ctx);
    626	u64 reg, mask;
    627	void *base;
    628
    629	if (buf->single) {
    630		base = buf->data_pages[0];
    631		mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
    632	} else {
    633		base = topa_to_page(buf->cur)->table;
    634		mask = (u64)buf->cur_idx;
    635	}
    636
    637	reg = virt_to_phys(base);
    638	if (pt->output_base != reg) {
    639		pt->output_base = reg;
    640		wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg);
    641	}
    642
    643	reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
    644	if (pt->output_mask != reg) {
    645		pt->output_mask = reg;
    646		wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
    647	}
    648}
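/*
 * Worked example (illustrative): for a 64KiB single-range buffer,
 * nr_pages * PAGE_SIZE - 1 == 0xffff, so mask == 0x1ff and the value
 * written to MSR_IA32_RTIT_OUTPUT_MASK is 0x7f | (0x1ff << 7) == 0xffff
 * in the low 32 bits, with the current output offset in bits 63:32.
 */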
    649
    650/**
    651 * topa_alloc() - allocate page-sized ToPA table
    652 * @cpu:	CPU on which to allocate.
    653 * @gfp:	Allocation flags.
    654 *
    655 * Return:	On success, return the pointer to ToPA table page.
    656 */
    657static struct topa *topa_alloc(int cpu, gfp_t gfp)
    658{
    659	int node = cpu_to_node(cpu);
    660	struct topa_page *tp;
    661	struct page *p;
    662
    663	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
    664	if (!p)
    665		return NULL;
    666
    667	tp = page_address(p);
    668	tp->topa.last = 0;
    669
    670	/*
     671	 * In case of single-entry ToPA, always put the self-referencing END
    672	 * link as the 2nd entry in the table
    673	 */
    674	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
    675		TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
    676		TOPA_ENTRY(&tp->topa, 1)->end = 1;
    677	}
    678
    679	return &tp->topa;
    680}
    681
    682/**
    683 * topa_free() - free a page-sized ToPA table
    684 * @topa:	Table to deallocate.
    685 */
    686static void topa_free(struct topa *topa)
    687{
    688	free_page((unsigned long)topa);
    689}
    690
    691/**
    692 * topa_insert_table() - insert a ToPA table into a buffer
    693 * @buf:	 PT buffer that's being extended.
    694 * @topa:	 New topa table to be inserted.
    695 *
    696 * If it's the first table in this buffer, set up buffer's pointers
    697 * accordingly; otherwise, add a END=1 link entry to @topa to the current
    698 * "last" table and adjust the last table pointer to @topa.
    699 */
    700static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
    701{
    702	struct topa *last = buf->last;
    703
    704	list_add_tail(&topa->list, &buf->tables);
    705
    706	if (!buf->first) {
    707		buf->first = buf->last = buf->cur = topa;
    708		return;
    709	}
    710
    711	topa->offset = last->offset + last->size;
    712	buf->last = topa;
    713
    714	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
    715		return;
    716
    717	BUG_ON(last->last != TENTS_PER_PAGE - 1);
    718
    719	TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
    720	TOPA_ENTRY(last, -1)->end = 1;
    721}
    722
    723/**
    724 * topa_table_full() - check if a ToPA table is filled up
    725 * @topa:	ToPA table.
    726 */
    727static bool topa_table_full(struct topa *topa)
    728{
    729	/* single-entry ToPA is a special case */
    730	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
    731		return !!topa->last;
    732
    733	return topa->last == TENTS_PER_PAGE - 1;
    734}
    735
    736/**
    737 * topa_insert_pages() - create a list of ToPA tables
    738 * @buf:	PT buffer being initialized.
    739 * @gfp:	Allocation flags.
    740 *
    741 * This initializes a list of ToPA tables with entries from
    742 * the data_pages provided by rb_alloc_aux().
    743 *
    744 * Return:	0 on success or error code.
    745 */
    746static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
    747{
    748	struct topa *topa = buf->last;
    749	int order = 0;
    750	struct page *p;
    751
    752	p = virt_to_page(buf->data_pages[buf->nr_pages]);
    753	if (PagePrivate(p))
    754		order = page_private(p);
    755
    756	if (topa_table_full(topa)) {
    757		topa = topa_alloc(cpu, gfp);
    758		if (!topa)
    759			return -ENOMEM;
    760
    761		topa_insert_table(buf, topa);
    762	}
    763
    764	if (topa->z_count == topa->last - 1) {
    765		if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
    766			topa->z_count++;
    767	}
    768
    769	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
    770	TOPA_ENTRY(topa, -1)->size = order;
    771	if (!buf->snapshot &&
    772	    !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
    773		TOPA_ENTRY(topa, -1)->intr = 1;
    774		TOPA_ENTRY(topa, -1)->stop = 1;
    775	}
    776
    777	topa->last++;
    778	topa->size += sizes(order);
    779
    780	buf->nr_pages += 1ul << order;
    781
    782	return 0;
    783}
    784
    785/**
    786 * pt_topa_dump() - print ToPA tables and their entries
    787 * @buf:	PT buffer.
    788 */
    789static void pt_topa_dump(struct pt_buffer *buf)
    790{
    791	struct topa *topa;
    792
    793	list_for_each_entry(topa, &buf->tables, list) {
    794		struct topa_page *tp = topa_to_page(topa);
    795		int i;
    796
    797		pr_debug("# table @%p, off %llx size %zx\n", tp->table,
    798			 topa->offset, topa->size);
    799		for (i = 0; i < TENTS_PER_PAGE; i++) {
    800			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
    801				 &tp->table[i],
    802				 (unsigned long)tp->table[i].base << TOPA_SHIFT,
    803				 sizes(tp->table[i].size),
    804				 tp->table[i].end ?  'E' : ' ',
    805				 tp->table[i].intr ? 'I' : ' ',
    806				 tp->table[i].stop ? 'S' : ' ',
    807				 *(u64 *)&tp->table[i]);
    808			if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
    809			     tp->table[i].stop) ||
    810			    tp->table[i].end)
    811				break;
    812			if (!i && topa->z_count)
    813				i += topa->z_count;
    814		}
    815	}
    816}
    817
    818/**
    819 * pt_buffer_advance() - advance to the next output region
    820 * @buf:	PT buffer.
    821 *
    822 * Advance the current pointers in the buffer to the next ToPA entry.
    823 */
    824static void pt_buffer_advance(struct pt_buffer *buf)
    825{
    826	buf->output_off = 0;
    827	buf->cur_idx++;
    828
    829	if (buf->cur_idx == buf->cur->last) {
    830		if (buf->cur == buf->last)
    831			buf->cur = buf->first;
    832		else
    833			buf->cur = list_entry(buf->cur->list.next, struct topa,
    834					      list);
    835		buf->cur_idx = 0;
    836	}
    837}
    838
    839/**
    840 * pt_update_head() - calculate current offsets and sizes
    841 * @pt:		Per-cpu pt context.
    842 *
    843 * Update buffer's current write pointer position and data size.
    844 */
    845static void pt_update_head(struct pt *pt)
    846{
    847	struct pt_buffer *buf = perf_get_aux(&pt->handle);
    848	u64 topa_idx, base, old;
    849
    850	if (buf->single) {
    851		local_set(&buf->data_size, buf->output_off);
    852		return;
    853	}
    854
    855	/* offset of the first region in this table from the beginning of buf */
    856	base = buf->cur->offset + buf->output_off;
    857
    858	/* offset of the current output region within this table */
    859	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
    860		base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);
    861
    862	if (buf->snapshot) {
    863		local_set(&buf->data_size, base);
    864	} else {
    865		old = (local64_xchg(&buf->head, base) &
    866		       ((buf->nr_pages << PAGE_SHIFT) - 1));
    867		if (base < old)
    868			base += buf->nr_pages << PAGE_SHIFT;
    869
    870		local_add(base - old, &buf->data_size);
    871	}
    872}
    873
    874/**
    875 * pt_buffer_region() - obtain current output region's address
    876 * @buf:	PT buffer.
    877 */
    878static void *pt_buffer_region(struct pt_buffer *buf)
    879{
    880	return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
    881}
    882
    883/**
    884 * pt_buffer_region_size() - obtain current output region's size
    885 * @buf:	PT buffer.
    886 */
    887static size_t pt_buffer_region_size(struct pt_buffer *buf)
    888{
    889	return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
    890}
    891
    892/**
    893 * pt_handle_status() - take care of possible status conditions
    894 * @pt:		Per-cpu pt context.
    895 */
    896static void pt_handle_status(struct pt *pt)
    897{
    898	struct pt_buffer *buf = perf_get_aux(&pt->handle);
    899	int advance = 0;
    900	u64 status;
    901
    902	rdmsrl(MSR_IA32_RTIT_STATUS, status);
    903
    904	if (status & RTIT_STATUS_ERROR) {
    905		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
    906		pt_topa_dump(buf);
    907		status &= ~RTIT_STATUS_ERROR;
    908	}
    909
    910	if (status & RTIT_STATUS_STOPPED) {
    911		status &= ~RTIT_STATUS_STOPPED;
    912
    913		/*
    914		 * On systems that only do single-entry ToPA, hitting STOP
    915		 * means we are already losing data; need to let the decoder
    916		 * know.
    917		 */
    918		if (!buf->single &&
    919		    (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
    920		     buf->output_off == pt_buffer_region_size(buf))) {
    921			perf_aux_output_flag(&pt->handle,
    922			                     PERF_AUX_FLAG_TRUNCATED);
    923			advance++;
    924		}
    925	}
    926
    927	/*
     928	 * Also on single-entry ToPA implementations, the interrupt will come
    929	 * before the output reaches its output region's boundary.
    930	 */
    931	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
    932	    !buf->snapshot &&
    933	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
    934		void *head = pt_buffer_region(buf);
    935
    936		/* everything within this margin needs to be zeroed out */
    937		memset(head + buf->output_off, 0,
    938		       pt_buffer_region_size(buf) -
    939		       buf->output_off);
    940		advance++;
    941	}
    942
    943	if (advance)
    944		pt_buffer_advance(buf);
    945
    946	wrmsrl(MSR_IA32_RTIT_STATUS, status);
    947}
    948
    949/**
    950 * pt_read_offset() - translate registers into buffer pointers
    951 * @buf:	PT buffer.
    952 *
    953 * Set buffer's output pointers from MSR values.
    954 */
    955static void pt_read_offset(struct pt_buffer *buf)
    956{
    957	struct pt *pt = this_cpu_ptr(&pt_ctx);
    958	struct topa_page *tp;
    959
    960	if (!buf->single) {
    961		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
    962		tp = phys_to_virt(pt->output_base);
    963		buf->cur = &tp->topa;
    964	}
    965
    966	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
    967	/* offset within current output region */
    968	buf->output_off = pt->output_mask >> 32;
    969	/* index of current output region within this table */
    970	if (!buf->single)
    971		buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
    972}
    973
    974static struct topa_entry *
    975pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
    976{
    977	struct topa_page *tp;
    978	struct topa *topa;
    979	unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;
    980
    981	/*
    982	 * Indicates a bug in the caller.
    983	 */
    984	if (WARN_ON_ONCE(pg >= buf->nr_pages))
    985		return NULL;
    986
    987	/*
    988	 * First, find the ToPA table where @pg fits. With high
    989	 * order allocations, there shouldn't be many of these.
    990	 */
    991	list_for_each_entry(topa, &buf->tables, list) {
    992		if (topa->offset + topa->size > pg << PAGE_SHIFT)
    993			goto found;
    994	}
    995
    996	/*
    997	 * Hitting this means we have a problem in the ToPA
    998	 * allocation code.
    999	 */
   1000	WARN_ON_ONCE(1);
   1001
   1002	return NULL;
   1003
   1004found:
   1005	/*
   1006	 * Indicates a problem in the ToPA allocation code.
   1007	 */
   1008	if (WARN_ON_ONCE(topa->last == -1))
   1009		return NULL;
   1010
   1011	tp = topa_to_page(topa);
   1012	cur_pg = PFN_DOWN(topa->offset);
   1013	if (topa->z_count) {
   1014		z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
   1015		start_idx = topa->z_count + 1;
   1016	}
   1017
   1018	/*
   1019	 * Multiple entries at the beginning of the table have the same size,
   1020	 * ideally all of them; if @pg falls there, the search is done.
   1021	 */
   1022	if (pg >= cur_pg && pg < cur_pg + z_pg) {
   1023		idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
   1024		return &tp->table[idx];
   1025	}
   1026
   1027	/*
   1028	 * Otherwise, slow path: iterate through the remaining entries.
   1029	 */
   1030	for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
   1031		if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
   1032			return &tp->table[idx];
   1033
   1034		cur_pg += TOPA_ENTRY_PAGES(topa, idx);
   1035	}
   1036
   1037	/*
    1038	 * Means we couldn't find a ToPA entry in the table that matches @pg.
   1039	 */
   1040	WARN_ON_ONCE(1);
   1041
   1042	return NULL;
   1043}
   1044
   1045static struct topa_entry *
   1046pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
   1047{
   1048	unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
   1049	struct topa_page *tp;
   1050	struct topa *topa;
   1051
   1052	tp = (struct topa_page *)table;
   1053	if (tp->table != te)
   1054		return --te;
   1055
   1056	topa = &tp->topa;
   1057	if (topa == buf->first)
   1058		topa = buf->last;
   1059	else
   1060		topa = list_prev_entry(topa, list);
   1061
   1062	tp = topa_to_page(topa);
   1063
   1064	return &tp->table[topa->last - 1];
   1065}
   1066
   1067/**
   1068 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
   1069 * @buf:	PT buffer.
   1070 * @handle:	Current output handle.
   1071 *
   1072 * Place INT and STOP marks to prevent overwriting old data that the consumer
   1073 * hasn't yet collected and waking up the consumer after a certain fraction of
   1074 * the buffer has filled up. Only needed and sensible for non-snapshot counters.
   1075 *
   1076 * This obviously relies on buf::head to figure out buffer markers, so it has
   1077 * to be called after pt_buffer_reset_offsets() and before the hardware tracing
   1078 * is enabled.
   1079 */
   1080static int pt_buffer_reset_markers(struct pt_buffer *buf,
   1081				   struct perf_output_handle *handle)
   1082
   1083{
   1084	unsigned long head = local64_read(&buf->head);
   1085	unsigned long idx, npages, wakeup;
   1086
   1087	if (buf->single)
   1088		return 0;
   1089
   1090	/* can't stop in the middle of an output region */
   1091	if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
   1092		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
   1093		return -EINVAL;
   1094	}
   1095
   1096
   1097	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
   1098	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
   1099		return 0;
   1100
   1101	/* clear STOP and INT from current entry */
   1102	if (buf->stop_te) {
   1103		buf->stop_te->stop = 0;
   1104		buf->stop_te->intr = 0;
   1105	}
   1106
   1107	if (buf->intr_te)
   1108		buf->intr_te->intr = 0;
   1109
   1110	/* how many pages till the STOP marker */
   1111	npages = handle->size >> PAGE_SHIFT;
   1112
   1113	/* if it's on a page boundary, fill up one more page */
   1114	if (!offset_in_page(head + handle->size + 1))
   1115		npages++;
   1116
   1117	idx = (head >> PAGE_SHIFT) + npages;
   1118	idx &= buf->nr_pages - 1;
   1119
   1120	if (idx != buf->stop_pos) {
   1121		buf->stop_pos = idx;
   1122		buf->stop_te = pt_topa_entry_for_page(buf, idx);
   1123		buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
   1124	}
   1125
   1126	wakeup = handle->wakeup >> PAGE_SHIFT;
   1127
   1128	/* in the worst case, wake up the consumer one page before hard stop */
   1129	idx = (head >> PAGE_SHIFT) + npages - 1;
   1130	if (idx > wakeup)
   1131		idx = wakeup;
   1132
   1133	idx &= buf->nr_pages - 1;
   1134	if (idx != buf->intr_pos) {
   1135		buf->intr_pos = idx;
   1136		buf->intr_te = pt_topa_entry_for_page(buf, idx);
   1137		buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
   1138	}
   1139
   1140	buf->stop_te->stop = 1;
   1141	buf->stop_te->intr = 1;
   1142	buf->intr_te->intr = 1;
   1143
   1144	return 0;
   1145}
   1146
   1147/**
   1148 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
   1149 * @buf:	PT buffer.
   1150 * @head:	Write pointer (aux_head) from AUX buffer.
   1151 *
   1152 * Find the ToPA table and entry corresponding to given @head and set buffer's
   1153 * "current" pointers accordingly. This is done after we have obtained the
   1154 * current aux_head position from a successful call to perf_aux_output_begin()
   1155 * to make sure the hardware is writing to the right place.
   1156 *
   1157 * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
   1158 * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
   1159 * which are used to determine INT and STOP markers' locations by a subsequent
   1160 * call to pt_buffer_reset_markers().
   1161 */
   1162static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
   1163{
   1164	struct topa_page *cur_tp;
   1165	struct topa_entry *te;
   1166	int pg;
   1167
   1168	if (buf->snapshot)
   1169		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
   1170
   1171	if (!buf->single) {
   1172		pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
   1173		te = pt_topa_entry_for_page(buf, pg);
   1174
   1175		cur_tp = topa_entry_to_page(te);
   1176		buf->cur = &cur_tp->topa;
   1177		buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
   1178		buf->output_off = head & (pt_buffer_region_size(buf) - 1);
   1179	} else {
   1180		buf->output_off = head;
   1181	}
   1182
   1183	local64_set(&buf->head, head);
   1184	local_set(&buf->data_size, 0);
   1185}
   1186
   1187/**
   1188 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
   1189 * @buf:	PT buffer.
   1190 */
   1191static void pt_buffer_fini_topa(struct pt_buffer *buf)
   1192{
   1193	struct topa *topa, *iter;
   1194
   1195	if (buf->single)
   1196		return;
   1197
   1198	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
   1199		/*
   1200		 * right now, this is in free_aux() path only, so
   1201		 * no need to unlink this table from the list
   1202		 */
   1203		topa_free(topa);
   1204	}
   1205}
   1206
   1207/**
   1208 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
   1209 * @buf:	PT buffer.
   1210 * @size:	Total size of all regions within this ToPA.
   1211 * @gfp:	Allocation flags.
   1212 */
   1213static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
   1214			       unsigned long nr_pages, gfp_t gfp)
   1215{
   1216	struct topa *topa;
   1217	int err;
   1218
   1219	topa = topa_alloc(cpu, gfp);
   1220	if (!topa)
   1221		return -ENOMEM;
   1222
   1223	topa_insert_table(buf, topa);
   1224
   1225	while (buf->nr_pages < nr_pages) {
   1226		err = topa_insert_pages(buf, cpu, gfp);
   1227		if (err) {
   1228			pt_buffer_fini_topa(buf);
   1229			return -ENOMEM;
   1230		}
   1231	}
   1232
   1233	/* link last table to the first one, unless we're double buffering */
   1234	if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
   1235		TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
   1236		TOPA_ENTRY(buf->last, -1)->end = 1;
   1237	}
   1238
   1239	pt_topa_dump(buf);
   1240	return 0;
   1241}
   1242
   1243static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
   1244{
   1245	struct page *p = virt_to_page(buf->data_pages[0]);
   1246	int ret = -ENOTSUPP, order = 0;
   1247
   1248	/*
   1249	 * We can use single range output mode
   1250	 * + in snapshot mode, where we don't need interrupts;
   1251	 * + if the hardware supports it;
   1252	 * + if the entire buffer is one contiguous allocation.
   1253	 */
   1254	if (!buf->snapshot)
   1255		goto out;
   1256
   1257	if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
   1258		goto out;
   1259
   1260	if (PagePrivate(p))
   1261		order = page_private(p);
   1262
   1263	if (1 << order != nr_pages)
   1264		goto out;
   1265
   1266	buf->single = true;
   1267	buf->nr_pages = nr_pages;
   1268	ret = 0;
   1269out:
   1270	return ret;
   1271}
   1272
   1273/**
   1274 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
   1275 * @cpu:	Cpu on which to allocate, -1 means current.
   1276 * @pages:	Array of pointers to buffer pages passed from perf core.
   1277 * @nr_pages:	Number of pages in the buffer.
   1278 * @snapshot:	If this is a snapshot/overwrite counter.
   1279 *
   1280 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
   1281 * bookkeeping for an AUX buffer.
   1282 *
   1283 * Return:	Our private PT buffer structure.
   1284 */
   1285static void *
   1286pt_buffer_setup_aux(struct perf_event *event, void **pages,
   1287		    int nr_pages, bool snapshot)
   1288{
   1289	struct pt_buffer *buf;
   1290	int node, ret, cpu = event->cpu;
   1291
   1292	if (!nr_pages)
   1293		return NULL;
   1294
   1295	/*
   1296	 * Only support AUX sampling in snapshot mode, where we don't
   1297	 * generate NMIs.
   1298	 */
   1299	if (event->attr.aux_sample_size && !snapshot)
   1300		return NULL;
   1301
   1302	if (cpu == -1)
   1303		cpu = raw_smp_processor_id();
   1304	node = cpu_to_node(cpu);
   1305
   1306	buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
   1307	if (!buf)
   1308		return NULL;
   1309
   1310	buf->snapshot = snapshot;
   1311	buf->data_pages = pages;
   1312	buf->stop_pos = -1;
   1313	buf->intr_pos = -1;
   1314
   1315	INIT_LIST_HEAD(&buf->tables);
   1316
   1317	ret = pt_buffer_try_single(buf, nr_pages);
   1318	if (!ret)
   1319		return buf;
   1320
   1321	ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
   1322	if (ret) {
   1323		kfree(buf);
   1324		return NULL;
   1325	}
   1326
   1327	return buf;
   1328}
   1329
   1330/**
   1331 * pt_buffer_free_aux() - perf AUX deallocation path callback
   1332 * @data:	PT buffer.
   1333 */
   1334static void pt_buffer_free_aux(void *data)
   1335{
   1336	struct pt_buffer *buf = data;
   1337
   1338	pt_buffer_fini_topa(buf);
   1339	kfree(buf);
   1340}
   1341
   1342static int pt_addr_filters_init(struct perf_event *event)
   1343{
   1344	struct pt_filters *filters;
   1345	int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
   1346
   1347	if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
   1348		return 0;
   1349
   1350	filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
   1351	if (!filters)
   1352		return -ENOMEM;
   1353
   1354	if (event->parent)
   1355		memcpy(filters, event->parent->hw.addr_filters,
   1356		       sizeof(*filters));
   1357
   1358	event->hw.addr_filters = filters;
   1359
   1360	return 0;
   1361}
   1362
   1363static void pt_addr_filters_fini(struct perf_event *event)
   1364{
   1365	kfree(event->hw.addr_filters);
   1366	event->hw.addr_filters = NULL;
   1367}
   1368
   1369#ifdef CONFIG_X86_64
   1370/* Clamp to a canonical address greater-than-or-equal-to the address given */
   1371static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
   1372{
   1373	return __is_canonical_address(vaddr, vaddr_bits) ?
   1374	       vaddr :
   1375	       -BIT_ULL(vaddr_bits - 1);
   1376}
   1377
   1378/* Clamp to a canonical address less-than-or-equal-to the address given */
   1379static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
   1380{
   1381	return __is_canonical_address(vaddr, vaddr_bits) ?
   1382	       vaddr :
   1383	       BIT_ULL(vaddr_bits - 1) - 1;
   1384}
   1385#else
   1386#define clamp_to_ge_canonical_addr(x, y) (x)
   1387#define clamp_to_le_canonical_addr(x, y) (x)
   1388#endif
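/*
 * Example (illustrative): with 48-bit virtual addresses, the non-canonical
 * address 0x0000800000000000 is clamped up to 0xffff800000000000 by
 * clamp_to_ge_canonical_addr() and down to 0x00007fffffffffff by
 * clamp_to_le_canonical_addr().
 */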
   1389
   1390static int pt_event_addr_filters_validate(struct list_head *filters)
   1391{
   1392	struct perf_addr_filter *filter;
   1393	int range = 0;
   1394
   1395	list_for_each_entry(filter, filters, entry) {
   1396		/*
   1397		 * PT doesn't support single address triggers and
   1398		 * 'start' filters.
   1399		 */
   1400		if (!filter->size ||
   1401		    filter->action == PERF_ADDR_FILTER_ACTION_START)
   1402			return -EOPNOTSUPP;
   1403
   1404		if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
   1405			return -EOPNOTSUPP;
   1406	}
   1407
   1408	return 0;
   1409}
   1410
   1411static void pt_event_addr_filters_sync(struct perf_event *event)
   1412{
   1413	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
   1414	unsigned long msr_a, msr_b;
   1415	struct perf_addr_filter_range *fr = event->addr_filter_ranges;
   1416	struct pt_filters *filters = event->hw.addr_filters;
   1417	struct perf_addr_filter *filter;
   1418	int range = 0;
   1419
   1420	if (!filters)
   1421		return;
   1422
   1423	list_for_each_entry(filter, &head->list, entry) {
   1424		if (filter->path.dentry && !fr[range].start) {
   1425			msr_a = msr_b = 0;
   1426		} else {
   1427			unsigned long n = fr[range].size - 1;
   1428			unsigned long a = fr[range].start;
   1429			unsigned long b;
   1430
   1431			if (a > ULONG_MAX - n)
   1432				b = ULONG_MAX;
   1433			else
   1434				b = a + n;
   1435			/*
   1436			 * Apply the offset. 64-bit addresses written to the
   1437			 * MSRs must be canonical, but the range can encompass
   1438			 * non-canonical addresses. Since software cannot
   1439			 * execute at non-canonical addresses, adjusting to
   1440			 * canonical addresses does not affect the result of the
   1441			 * address filter.
   1442			 */
   1443			msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
   1444			msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
   1445			if (msr_b < msr_a)
   1446				msr_a = msr_b = 0;
   1447		}
   1448
   1449		filters->filter[range].msr_a  = msr_a;
   1450		filters->filter[range].msr_b  = msr_b;
   1451		if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
   1452			filters->filter[range].config = 1;
   1453		else
   1454			filters->filter[range].config = 2;
   1455		range++;
   1456	}
   1457
   1458	filters->nr_filters = range;
   1459}
   1460
   1461/**
   1462 * intel_pt_interrupt() - PT PMI handler
   1463 */
   1464void intel_pt_interrupt(void)
   1465{
   1466	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1467	struct pt_buffer *buf;
   1468	struct perf_event *event = pt->handle.event;
   1469
   1470	/*
   1471	 * There may be a dangling PT bit in the interrupt status register
   1472	 * after PT has been disabled by pt_event_stop(). Make sure we don't
   1473	 * do anything (particularly, re-enable) for this event here.
   1474	 */
   1475	if (!READ_ONCE(pt->handle_nmi))
   1476		return;
   1477
   1478	if (!event)
   1479		return;
   1480
   1481	pt_config_stop(event);
   1482
   1483	buf = perf_get_aux(&pt->handle);
   1484	if (!buf)
   1485		return;
   1486
   1487	pt_read_offset(buf);
   1488
   1489	pt_handle_status(pt);
   1490
   1491	pt_update_head(pt);
   1492
   1493	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
   1494
   1495	if (!event->hw.state) {
   1496		int ret;
   1497
   1498		buf = perf_aux_output_begin(&pt->handle, event);
   1499		if (!buf) {
   1500			event->hw.state = PERF_HES_STOPPED;
   1501			return;
   1502		}
   1503
   1504		pt_buffer_reset_offsets(buf, pt->handle.head);
   1505		/* snapshot counters don't use PMI, so it's safe */
   1506		ret = pt_buffer_reset_markers(buf, &pt->handle);
   1507		if (ret) {
   1508			perf_aux_output_end(&pt->handle, 0);
   1509			return;
   1510		}
   1511
   1512		pt_config_buffer(buf);
   1513		pt_config_start(event);
   1514	}
   1515}
   1516
   1517void intel_pt_handle_vmx(int on)
   1518{
   1519	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1520	struct perf_event *event;
   1521	unsigned long flags;
   1522
   1523	/* PT plays nice with VMX, do nothing */
   1524	if (pt_pmu.vmx)
   1525		return;
   1526
   1527	/*
   1528	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
   1529	 * sure to not try to set it while VMX is on. Disable
   1530	 * interrupts to avoid racing with pmu callbacks;
   1531	 * concurrent PMI should be handled fine.
   1532	 */
   1533	local_irq_save(flags);
   1534	WRITE_ONCE(pt->vmx_on, on);
   1535
   1536	/*
   1537	 * If an AUX transaction is in progress, it will contain
   1538	 * gap(s), so flag it PARTIAL to inform the user.
   1539	 */
   1540	event = pt->handle.event;
   1541	if (event)
   1542		perf_aux_output_flag(&pt->handle,
   1543		                     PERF_AUX_FLAG_PARTIAL);
   1544
   1545	/* Turn PTs back on */
   1546	if (!on && event)
   1547		wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
   1548
   1549	local_irq_restore(flags);
   1550}
   1551EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
   1552
   1553/*
   1554 * PMU callbacks
   1555 */
   1556
   1557static void pt_event_start(struct perf_event *event, int mode)
   1558{
   1559	struct hw_perf_event *hwc = &event->hw;
   1560	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1561	struct pt_buffer *buf;
   1562
   1563	buf = perf_aux_output_begin(&pt->handle, event);
   1564	if (!buf)
   1565		goto fail_stop;
   1566
   1567	pt_buffer_reset_offsets(buf, pt->handle.head);
   1568	if (!buf->snapshot) {
   1569		if (pt_buffer_reset_markers(buf, &pt->handle))
   1570			goto fail_end_stop;
   1571	}
   1572
   1573	WRITE_ONCE(pt->handle_nmi, 1);
   1574	hwc->state = 0;
   1575
   1576	pt_config_buffer(buf);
   1577	pt_config(event);
   1578
   1579	return;
   1580
   1581fail_end_stop:
   1582	perf_aux_output_end(&pt->handle, 0);
   1583fail_stop:
   1584	hwc->state = PERF_HES_STOPPED;
   1585}
   1586
   1587static void pt_event_stop(struct perf_event *event, int mode)
   1588{
   1589	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1590
   1591	/*
   1592	 * Protect against the PMI racing with disabling wrmsr,
   1593	 * see comment in intel_pt_interrupt().
   1594	 */
   1595	WRITE_ONCE(pt->handle_nmi, 0);
   1596
   1597	pt_config_stop(event);
   1598
   1599	if (event->hw.state == PERF_HES_STOPPED)
   1600		return;
   1601
   1602	event->hw.state = PERF_HES_STOPPED;
   1603
   1604	if (mode & PERF_EF_UPDATE) {
   1605		struct pt_buffer *buf = perf_get_aux(&pt->handle);
   1606
   1607		if (!buf)
   1608			return;
   1609
   1610		if (WARN_ON_ONCE(pt->handle.event != event))
   1611			return;
   1612
   1613		pt_read_offset(buf);
   1614
   1615		pt_handle_status(pt);
   1616
   1617		pt_update_head(pt);
   1618
   1619		if (buf->snapshot)
   1620			pt->handle.head =
   1621				local_xchg(&buf->data_size,
   1622					   buf->nr_pages << PAGE_SHIFT);
   1623		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
   1624	}
   1625}
   1626
   1627static long pt_event_snapshot_aux(struct perf_event *event,
   1628				  struct perf_output_handle *handle,
   1629				  unsigned long size)
   1630{
   1631	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1632	struct pt_buffer *buf = perf_get_aux(&pt->handle);
   1633	unsigned long from = 0, to;
   1634	long ret;
   1635
   1636	if (WARN_ON_ONCE(!buf))
   1637		return 0;
   1638
   1639	/*
   1640	 * Sampling is only allowed on snapshot events;
   1641	 * see pt_buffer_setup_aux().
   1642	 */
   1643	if (WARN_ON_ONCE(!buf->snapshot))
   1644		return 0;
   1645
   1646	/*
   1647	 * Here, handle_nmi tells us if the tracing is on
   1648	 */
   1649	if (READ_ONCE(pt->handle_nmi))
   1650		pt_config_stop(event);
   1651
   1652	pt_read_offset(buf);
   1653	pt_update_head(pt);
   1654
   1655	to = local_read(&buf->data_size);
   1656	if (to < size)
   1657		from = buf->nr_pages << PAGE_SHIFT;
   1658	from += to - size;
   1659
   1660	ret = perf_output_copy_aux(&pt->handle, handle, from, to);
   1661
   1662	/*
   1663	 * If the tracing was on when we turned up, restart it.
   1664	 * Compiler barrier not needed as we couldn't have been
   1665	 * preempted by anything that touches pt->handle_nmi.
   1666	 */
   1667	if (pt->handle_nmi)
   1668		pt_config_start(event);
   1669
   1670	return ret;
   1671}
   1672
   1673static void pt_event_del(struct perf_event *event, int mode)
   1674{
   1675	pt_event_stop(event, PERF_EF_UPDATE);
   1676}
   1677
   1678static int pt_event_add(struct perf_event *event, int mode)
   1679{
   1680	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1681	struct hw_perf_event *hwc = &event->hw;
   1682	int ret = -EBUSY;
   1683
   1684	if (pt->handle.event)
   1685		goto fail;
   1686
   1687	if (mode & PERF_EF_START) {
   1688		pt_event_start(event, 0);
   1689		ret = -EINVAL;
   1690		if (hwc->state == PERF_HES_STOPPED)
   1691			goto fail;
   1692	} else {
   1693		hwc->state = PERF_HES_STOPPED;
   1694	}
   1695
   1696	ret = 0;
   1697fail:
   1698
   1699	return ret;
   1700}
   1701
   1702static void pt_event_read(struct perf_event *event)
   1703{
   1704}
   1705
   1706static void pt_event_destroy(struct perf_event *event)
   1707{
   1708	pt_addr_filters_fini(event);
   1709	x86_del_exclusive(x86_lbr_exclusive_pt);
   1710}
   1711
   1712static int pt_event_init(struct perf_event *event)
   1713{
   1714	if (event->attr.type != pt_pmu.pmu.type)
   1715		return -ENOENT;
   1716
   1717	if (!pt_event_valid(event))
   1718		return -EINVAL;
   1719
   1720	if (x86_add_exclusive(x86_lbr_exclusive_pt))
   1721		return -EBUSY;
   1722
   1723	if (pt_addr_filters_init(event)) {
   1724		x86_del_exclusive(x86_lbr_exclusive_pt);
   1725		return -ENOMEM;
   1726	}
   1727
   1728	event->destroy = pt_event_destroy;
   1729
   1730	return 0;
   1731}
   1732
   1733void cpu_emergency_stop_pt(void)
   1734{
   1735	struct pt *pt = this_cpu_ptr(&pt_ctx);
   1736
   1737	if (pt->handle.event)
   1738		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
   1739}
   1740
   1741int is_intel_pt_event(struct perf_event *event)
   1742{
   1743	return event->pmu == &pt_pmu.pmu;
   1744}
   1745
   1746static __init int pt_init(void)
   1747{
   1748	int ret, cpu, prior_warn = 0;
   1749
   1750	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
   1751
   1752	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
   1753		return -ENODEV;
   1754
   1755	cpus_read_lock();
   1756	for_each_online_cpu(cpu) {
   1757		u64 ctl;
   1758
   1759		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
   1760		if (!ret && (ctl & RTIT_CTL_TRACEEN))
   1761			prior_warn++;
   1762	}
   1763	cpus_read_unlock();
   1764
   1765	if (prior_warn) {
   1766		x86_add_exclusive(x86_lbr_exclusive_pt);
   1767		pr_warn("PT is enabled at boot time, doing nothing\n");
   1768
   1769		return -EBUSY;
   1770	}
   1771
   1772	ret = pt_pmu_hw_init();
   1773	if (ret)
   1774		return ret;
   1775
   1776	if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
   1777		pr_warn("ToPA output is not supported on this CPU\n");
   1778		return -ENODEV;
   1779	}
   1780
   1781	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
   1782		pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
   1783
   1784	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
   1785	pt_pmu.pmu.attr_groups		 = pt_attr_groups;
   1786	pt_pmu.pmu.task_ctx_nr		 = perf_sw_context;
   1787	pt_pmu.pmu.event_init		 = pt_event_init;
   1788	pt_pmu.pmu.add			 = pt_event_add;
   1789	pt_pmu.pmu.del			 = pt_event_del;
   1790	pt_pmu.pmu.start		 = pt_event_start;
   1791	pt_pmu.pmu.stop			 = pt_event_stop;
   1792	pt_pmu.pmu.snapshot_aux		 = pt_event_snapshot_aux;
   1793	pt_pmu.pmu.read			 = pt_event_read;
   1794	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
   1795	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
   1796	pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
   1797	pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
   1798	pt_pmu.pmu.nr_addr_filters       =
   1799		intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
   1800
   1801	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
   1802
   1803	return ret;
   1804}
   1805arch_initcall(pt_init);