cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

s390-cpumsf.c (34779B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright IBM Corp. 2018
      4 * Auxtrace support for s390 CPU-Measurement Sampling Facility
      5 *
      6 * Author(s):  Thomas Richter <tmricht@linux.ibm.com>
      7 *
      8 * Auxiliary traces are collected during 'perf record' using rbd000 event.
      9 * Several PERF_RECORD_XXX are generated during recording:
     10 *
     11 * PERF_RECORD_AUX:
     12 *	Records that new data landed in the AUX buffer part.
     13 * PERF_RECORD_AUXTRACE:
     14 *	Defines auxtrace data. Followed by the actual data. The contents of
     15 *	the auxtrace data is dependent on the event and the CPU.
     16 *	This record is generated by perf record command. For details
     17 *	see Documentation/perf.data-file-format.txt.
     18 * PERF_RECORD_AUXTRACE_INFO:
 *	Defines a table of contents for PERF_RECORD_AUXTRACE records. This
     20 *	record is generated during 'perf record' command. Each record contains
     21 *	up to 256 entries describing offset and size of the AUXTRACE data in the
     22 *	perf.data file.
     23 * PERF_RECORD_AUXTRACE_ERROR:
     24 *	Indicates an error during AUXTRACE collection such as buffer overflow.
     25 * PERF_RECORD_FINISHED_ROUND:
     26 *	Perf events are not necessarily in time stamp order, as they can be
     27 *	collected in parallel on different CPUs. If the events should be
     28 *	processed in time order they need to be sorted first.
     29 *	Perf report guarantees that there is no reordering over a
     30 *	PERF_RECORD_FINISHED_ROUND boundary event. All perf records with a
     31 *	time stamp lower than this record are processed (and displayed) before
     32 *	the succeeding perf record are processed.
     33 *
     34 * These records are evaluated during perf report command.
     35 *
     36 * 1. PERF_RECORD_AUXTRACE_INFO is used to set up the infrastructure for
     37 * auxiliary trace data processing. See s390_cpumsf_process_auxtrace_info()
     38 * below.
     39 * Auxiliary trace data is collected per CPU. To merge the data into the report
     40 * an auxtrace_queue is created for each CPU. It is assumed that the auxtrace
     41 * data is in ascending order.
     42 *
     43 * Each queue has a double linked list of auxtrace_buffers. This list contains
     44 * the offset and size of a CPU's auxtrace data. During auxtrace processing
     45 * the data portion is mmap()'ed.
     46 *
     47 * To sort the queues in chronological order, all queue access is controlled
     48 * by the auxtrace_heap. This is basically a stack, each stack element has two
     49 * entries, the queue number and a time stamp. However the stack is sorted by
     50 * the time stamps. The highest time stamp is at the bottom the lowest
     51 * (nearest) time stamp is at the top. That sort order is maintained at all
     52 * times!
     53 *
     54 * After the auxtrace infrastructure has been setup, the auxtrace queues are
     55 * filled with data (offset/size pairs) and the auxtrace_heap is populated.
     56 *
     57 * 2. PERF_RECORD_XXX processing triggers access to the auxtrace_queues.
     58 * Each record is handled by s390_cpumsf_process_event(). The time stamp of
     59 * the perf record is compared with the time stamp located on the auxtrace_heap
     60 * top element. If that time stamp is lower than the time stamp from the
     61 * record sample, the auxtrace queues will be processed. As auxtrace queues
     62 * control many auxtrace_buffers and each buffer can be quite large, the
     63 * auxtrace buffer might be processed only partially. In this case the
     64 * position in the auxtrace_buffer of that queue is remembered and the time
     65 * stamp of the last processed entry of the auxtrace_buffer replaces the
     66 * current auxtrace_heap top.
     67 *
 * 3. Auxtrace_queues might run out of data and are fed by the
     69 * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
     70 *
     71 * Event Generation
     72 * Each sampling-data entry in the auxiliary trace data generates a perf sample.
     73 * This sample is filled
     74 * with data from the auxtrace such as PID/TID, instruction address, CPU state,
     75 * etc. This sample is processed with perf_session__deliver_synth_event() to
     76 * be included into the GUI.
     77 *
 * 4. PERF_RECORD_FINISHED_ROUND event is used to process all the remaining
 * auxiliary trace entries until the time stamp at the auxtrace_heap top
 * reaches the time stamp of this record. This is triggered by
 * ordered_event->deliver().
     81 *
     82 *
     83 * Perf event processing.
     84 * Event processing of PERF_RECORD_XXX entries relies on time stamp entries.
     85 * This is the function call sequence:
     86 *
     87 * __cmd_report()
     88 * |
     89 * perf_session__process_events()
     90 * |
     91 * __perf_session__process_events()
     92 * |
     93 * perf_session__process_event()
     94 * |  This functions splits the PERF_RECORD_XXX records.
     95 * |  - Those generated by perf record command (type number equal or higher
     96 * |    than PERF_RECORD_USER_TYPE_START) are handled by
     97 * |    perf_session__process_user_event(see below)
     98 * |  - Those generated by the kernel are handled by
     99 * |    evlist__parse_sample_timestamp()
    100 * |
    101 * evlist__parse_sample_timestamp()
    102 * |  Extract time stamp from sample data.
    103 * |
    104 * perf_session__queue_event()
    105 * |  If timestamp is positive the sample is entered into an ordered_event
    106 * |  list, sort order is the timestamp. The event processing is deferred until
    107 * |  later (see perf_session__process_user_event()).
    108 * |  Other timestamps (0 or -1) are handled immediately by
    109 * |  perf_session__deliver_event(). These are events generated at start up
    110 * |  of command perf record. They create PERF_RECORD_COMM and PERF_RECORD_MMAP*
    111 * |  records. They are needed to create a list of running processes and its
    112 * |  memory mappings and layout. They are needed at the beginning to enable
    113 * |  command perf report to create process trees and memory mappings.
    114 * |
    115 * perf_session__deliver_event()
    116 * |  Delivers a PERF_RECORD_XXX entry for handling.
    117 * |
    118 * auxtrace__process_event()
    119 * |  The timestamp of the PERF_RECORD_XXX entry is taken to correlate with
    120 * |  time stamps from the auxiliary trace buffers. This enables
    121 * |  synchronization between auxiliary trace data and the events on the
    122 * |  perf.data file.
    123 * |
    124 * machine__deliver_event()
    125 * |  Handles the PERF_RECORD_XXX event. This depends on the record type.
    126 *    It might update the process tree, update a process memory map or enter
    127 *    a sample with IP and call back chain data into GUI data pool.
    128 *
    129 *
    130 * Deferred processing determined by perf_session__process_user_event() is
    131 * finally processed when a PERF_RECORD_FINISHED_ROUND is encountered. These
    132 * are generated during command perf record.
    133 * The timestamp of PERF_RECORD_FINISHED_ROUND event is taken to process all
    134 * PERF_RECORD_XXX entries stored in the ordered_event list. This list was
    135 * built up while reading the perf.data file.
    136 * Each event is now processed by calling perf_session__deliver_event().
    137 * This enables time synchronization between the data in the perf.data file and
    138 * the data in the auxiliary trace buffers.
    139 */
    140
    141#include <endian.h>
    142#include <errno.h>
    143#include <byteswap.h>
    144#include <inttypes.h>
    145#include <linux/kernel.h>
    146#include <linux/types.h>
    147#include <linux/bitops.h>
    148#include <linux/log2.h>
    149#include <linux/zalloc.h>
    150
    151#include <sys/stat.h>
    152#include <sys/types.h>
    153
    154#include "color.h"
    155#include "evsel.h"
    156#include "evlist.h"
    157#include "machine.h"
    158#include "session.h"
    159#include "tool.h"
    160#include "debug.h"
    161#include "auxtrace.h"
    162#include "s390-cpumsf.h"
    163#include "s390-cpumsf-kernel.h"
    164#include "s390-cpumcf-kernel.h"
    165#include "config.h"
    166
/* Decoder state of the s390 CPU-Measurement Sampling Facility auxtrace
 * support. One instance is shared by all per-queue states below.
 */
struct s390_cpumsf {
	struct auxtrace		auxtrace;
	struct auxtrace_queues	queues;		/* One auxtrace_queue per CPU */
	struct auxtrace_heap	heap;		/* Queue numbers ordered by time stamp */
	struct perf_session	*session;	/* Receives the synthesized samples */
	struct machine		*machine;
	u32			auxtrace_type;
	u32			pmu_type;
	u16			machine_type;	/* Selects entry sizes on old hardware */
	bool			data_queued;
	bool			use_logfile;	/* Dump raw data into aux.* log files */
	char			*logdir;	/* Directory for log files, may be NULL */
};
    180
/* Per queue decoder state, reachable through auxtrace_queue->priv. */
struct s390_cpumsf_queue {
	struct s390_cpumsf	*sf;		/* Back pointer to shared state */
	unsigned int		queue_nr;	/* Index into sf->queues.queue_array */
	struct auxtrace_buffer	*buffer;	/* Buffer being decoded, NULL if none */
	int			cpu;		/* CPU of this queue, -1 if unknown */
	FILE			*logfile;	/* Raw sample data log (aux.smp.XX) */
	FILE			*logfile_ctr;	/* Counter set log (aux.ctr.XX) */
};
    189
    190/* Check if the raw data should be dumped to file. If this is the case and
    191 * the file to dump to has not been opened for writing, do so.
    192 *
    193 * Return 0 on success and greater zero on error so processing continues.
    194 */
    195static int s390_cpumcf_dumpctr(struct s390_cpumsf *sf,
    196			       struct perf_sample *sample)
    197{
    198	struct s390_cpumsf_queue *sfq;
    199	struct auxtrace_queue *q;
    200	int rc = 0;
    201
    202	if (!sf->use_logfile || sf->queues.nr_queues <= sample->cpu)
    203		return rc;
    204
    205	q = &sf->queues.queue_array[sample->cpu];
    206	sfq = q->priv;
    207	if (!sfq)		/* Queue not yet allocated */
    208		return rc;
    209
    210	if (!sfq->logfile_ctr) {
    211		char *name;
    212
    213		rc = (sf->logdir)
    214			? asprintf(&name, "%s/aux.ctr.%02x",
    215				 sf->logdir, sample->cpu)
    216			: asprintf(&name, "aux.ctr.%02x", sample->cpu);
    217		if (rc > 0)
    218			sfq->logfile_ctr = fopen(name, "w");
    219		if (sfq->logfile_ctr == NULL) {
    220			pr_err("Failed to open counter set log file %s, "
    221			       "continue...\n", name);
    222			rc = 1;
    223		}
    224		free(name);
    225	}
    226
    227	if (sfq->logfile_ctr) {
    228		/* See comment above for -4 */
    229		size_t n = fwrite(sample->raw_data, sample->raw_size - 4, 1,
    230				  sfq->logfile_ctr);
    231		if (n != 1) {
    232			pr_err("Failed to write counter set data\n");
    233			rc = 1;
    234		}
    235	}
    236	return rc;
    237}
    238
    239/* Display s390 CPU measurement facility basic-sampling data entry
    240 * Data written on s390 in big endian byte order and contains bit
    241 * fields across byte boundaries.
    242 */
    243static bool s390_cpumsf_basic_show(const char *color, size_t pos,
    244				   struct hws_basic_entry *basicp)
    245{
    246	struct hws_basic_entry *basic = basicp;
    247#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    248	struct hws_basic_entry local;
    249	unsigned long long word = be64toh(*(unsigned long long *)basicp);
    250
    251	memset(&local, 0, sizeof(local));
    252	local.def = be16toh(basicp->def);
    253	local.prim_asn = word & 0xffff;
    254	local.CL = word >> 30 & 0x3;
    255	local.I = word >> 32 & 0x1;
    256	local.AS = word >> 33 & 0x3;
    257	local.P = word >> 35 & 0x1;
    258	local.W = word >> 36 & 0x1;
    259	local.T = word >> 37 & 0x1;
    260	local.U = word >> 40 & 0xf;
    261	local.ia = be64toh(basicp->ia);
    262	local.gpp = be64toh(basicp->gpp);
    263	local.hpp = be64toh(basicp->hpp);
    264	basic = &local;
    265#endif
    266	if (basic->def != 1) {
    267		pr_err("Invalid AUX trace basic entry [%#08zx]\n", pos);
    268		return false;
    269	}
    270	color_fprintf(stdout, color, "    [%#08zx] Basic   Def:%04x Inst:%#04x"
    271		      " %c%c%c%c AS:%d ASN:%#04x IA:%#018llx\n"
    272		      "\t\tCL:%d HPP:%#018llx GPP:%#018llx\n",
    273		      pos, basic->def, basic->U,
    274		      basic->T ? 'T' : ' ',
    275		      basic->W ? 'W' : ' ',
    276		      basic->P ? 'P' : ' ',
    277		      basic->I ? 'I' : ' ',
    278		      basic->AS, basic->prim_asn, basic->ia, basic->CL,
    279		      basic->hpp, basic->gpp);
    280	return true;
    281}
    282
    283/* Display s390 CPU measurement facility diagnostic-sampling data entry.
    284 * Data written on s390 in big endian byte order and contains bit
    285 * fields across byte boundaries.
    286 */
    287static bool s390_cpumsf_diag_show(const char *color, size_t pos,
    288				  struct hws_diag_entry *diagp)
    289{
    290	struct hws_diag_entry *diag = diagp;
    291#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    292	struct hws_diag_entry local;
    293	unsigned long long word = be64toh(*(unsigned long long *)diagp);
    294
    295	local.def = be16toh(diagp->def);
    296	local.I = word >> 32 & 0x1;
    297	diag = &local;
    298#endif
    299	if (diag->def < S390_CPUMSF_DIAG_DEF_FIRST) {
    300		pr_err("Invalid AUX trace diagnostic entry [%#08zx]\n", pos);
    301		return false;
    302	}
    303	color_fprintf(stdout, color, "    [%#08zx] Diag    Def:%04x %c\n",
    304		      pos, diag->def, diag->I ? 'I' : ' ');
    305	return true;
    306}
    307
    308/* Return TOD timestamp contained in an trailer entry */
    309static unsigned long long trailer_timestamp(struct hws_trailer_entry *te,
    310					    int idx)
    311{
    312	/* te->t set: TOD in STCKE format, bytes 8-15
    313	 * to->t not set: TOD in STCK format, bytes 0-7
    314	 */
    315	unsigned long long ts;
    316
    317	memcpy(&ts, &te->timestamp[idx], sizeof(ts));
    318	return be64toh(ts);
    319}
    320
/* Display s390 CPU measurement facility trailer entry
 *
 * @color: color passed on to color_fprintf()
 * @pos:   offset of the trailer entry, printed as part of the output
 * @te:    trailer entry to show
 *
 * Return false (after logging an error) if the basic entry size stored in
 * the trailer differs from sizeof(struct hws_basic_entry), true once the
 * trailer has been printed.
 */
static bool s390_cpumsf_trailer_show(const char *color, size_t pos,
				     struct hws_trailer_entry *te)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* Decode the big endian entry into a host order copy and continue
	 * with that copy.
	 */
	struct hws_trailer_entry local;
	const unsigned long long flags = be64toh(te->flags);

	memset(&local, 0, sizeof(local));
	local.f = flags >> 63 & 0x1;
	local.a = flags >> 62 & 0x1;
	local.t = flags >> 61 & 0x1;
	/* NOTE(review): the size fields are byte swapped once more after the
	 * 64 bit conversion of flags; the caller in this file stores host
	 * order sizes into the entry beforehand -- confirm the result is in
	 * host order.
	 */
	local.bsdes = be16toh((flags >> 16 & 0xffff));
	local.dsdes = be16toh((flags & 0xffff));
	/* Time stamp bytes stay big endian, trailer_timestamp() converts */
	memcpy(&local.timestamp, te->timestamp, sizeof(te->timestamp));
	local.overflow = be64toh(te->overflow);
	local.clock_base = be64toh(te->progusage[0]) >> 63 & 1;
	local.progusage2 = be64toh(te->progusage2);
	te = &local;
#endif
	if (te->bsdes != sizeof(struct hws_basic_entry)) {
		pr_err("Invalid AUX trace trailer entry [%#08zx]\n", pos);
		return false;
	}
	/* NOTE(review): progusage2 is printed with %#lx and overflow with
	 * %lld -- confirm these match the member types on all hosts.
	 */
	color_fprintf(stdout, color, "    [%#08zx] Trailer %c%c%c bsdes:%d"
		      " dsdes:%d Overflow:%lld Time:%#llx\n"
		      "\t\tC:%d TOD:%#lx\n",
		      pos,
		      te->f ? 'F' : ' ',
		      te->a ? 'A' : ' ',
		      te->t ? 'T' : ' ',
		      te->bsdes, te->dsdes, te->overflow,
		      trailer_timestamp(te, te->clock_base),
		      te->clock_base, te->progusage2);
	return true;
}
    357
    358/* Test a sample data block. It must be 4KB or a multiple thereof in size and
    359 * 4KB page aligned. Each sample data page has a trailer entry at the
    360 * end which contains the sample entry data sizes.
    361 *
    362 * Return true if the sample data block passes the checks and set the
    363 * basic set entry size and diagnostic set entry size.
    364 *
    365 * Return false on failure.
    366 *
    367 * Note: Old hardware does not set the basic or diagnostic entry sizes
    368 * in the trailer entry. Use the type number instead.
    369 */
    370static bool s390_cpumsf_validate(int machine_type,
    371				 unsigned char *buf, size_t len,
    372				 unsigned short *bsdes,
    373				 unsigned short *dsdes)
    374{
    375	struct hws_basic_entry *basic = (struct hws_basic_entry *)buf;
    376	struct hws_trailer_entry *te;
    377
    378	*dsdes = *bsdes = 0;
    379	if (len & (S390_CPUMSF_PAGESZ - 1))	/* Illegal size */
    380		return false;
    381	if (be16toh(basic->def) != 1)	/* No basic set entry, must be first */
    382		return false;
    383	/* Check for trailer entry at end of SDB */
    384	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
    385					      - sizeof(*te));
    386	*bsdes = be16toh(te->bsdes);
    387	*dsdes = be16toh(te->dsdes);
    388	if (!te->bsdes && !te->dsdes) {
    389		/* Very old hardware, use CPUID */
    390		switch (machine_type) {
    391		case 2097:
    392		case 2098:
    393			*dsdes = 64;
    394			*bsdes = 32;
    395			break;
    396		case 2817:
    397		case 2818:
    398			*dsdes = 74;
    399			*bsdes = 32;
    400			break;
    401		case 2827:
    402		case 2828:
    403			*dsdes = 85;
    404			*bsdes = 32;
    405			break;
    406		case 2964:
    407		case 2965:
    408			*dsdes = 112;
    409			*bsdes = 32;
    410			break;
    411		default:
    412			/* Illegal trailer entry */
    413			return false;
    414		}
    415	}
    416	return true;
    417}
    418
    419/* Return true if there is room for another entry */
    420static bool s390_cpumsf_reached_trailer(size_t entry_sz, size_t pos)
    421{
    422	size_t payload = S390_CPUMSF_PAGESZ - sizeof(struct hws_trailer_entry);
    423
    424	if (payload - (pos & (S390_CPUMSF_PAGESZ - 1)) < entry_sz)
    425		return false;
    426	return true;
    427}
    428
    429/* Dump an auxiliary buffer. These buffers are multiple of
    430 * 4KB SDB pages.
    431 */
    432static void s390_cpumsf_dump(struct s390_cpumsf *sf,
    433			     unsigned char *buf, size_t len)
    434{
    435	const char *color = PERF_COLOR_BLUE;
    436	struct hws_basic_entry *basic;
    437	struct hws_diag_entry *diag;
    438	unsigned short bsdes, dsdes;
    439	size_t pos = 0;
    440
    441	color_fprintf(stdout, color,
    442		      ". ... s390 AUX data: size %zu bytes\n",
    443		      len);
    444
    445	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
    446				  &dsdes)) {
    447		pr_err("Invalid AUX trace data block size:%zu"
    448		       " (type:%d bsdes:%hd dsdes:%hd)\n",
    449		       len, sf->machine_type, bsdes, dsdes);
    450		return;
    451	}
    452
    453	/* s390 kernel always returns 4KB blocks fully occupied,
    454	 * no partially filled SDBs.
    455	 */
    456	while (pos < len) {
    457		/* Handle Basic entry */
    458		basic = (struct hws_basic_entry *)(buf + pos);
    459		if (s390_cpumsf_basic_show(color, pos, basic))
    460			pos += bsdes;
    461		else
    462			return;
    463
    464		/* Handle Diagnostic entry */
    465		diag = (struct hws_diag_entry *)(buf + pos);
    466		if (s390_cpumsf_diag_show(color, pos, diag))
    467			pos += dsdes;
    468		else
    469			return;
    470
    471		/* Check for trailer entry */
    472		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
    473			/* Show trailer entry */
    474			struct hws_trailer_entry te;
    475
    476			pos = (pos + S390_CPUMSF_PAGESZ)
    477			       & ~(S390_CPUMSF_PAGESZ - 1);
    478			pos -= sizeof(te);
    479			memcpy(&te, buf + pos, sizeof(te));
    480			/* Set descriptor sizes in case of old hardware
    481			 * where these values are not set.
    482			 */
    483			te.bsdes = bsdes;
    484			te.dsdes = dsdes;
    485			if (s390_cpumsf_trailer_show(color, pos, &te))
    486				pos += sizeof(te);
    487			else
    488				return;
    489		}
    490	}
    491}
    492
    493static void s390_cpumsf_dump_event(struct s390_cpumsf *sf, unsigned char *buf,
    494				   size_t len)
    495{
    496	printf(".\n");
    497	s390_cpumsf_dump(sf, buf, len);
    498}
    499
    500#define	S390_LPP_PID_MASK	0xffffffff
    501
    502static bool s390_cpumsf_make_event(size_t pos,
    503				   struct hws_basic_entry *basic,
    504				   struct s390_cpumsf_queue *sfq)
    505{
    506	struct perf_sample sample = {
    507				.ip = basic->ia,
    508				.pid = basic->hpp & S390_LPP_PID_MASK,
    509				.tid = basic->hpp & S390_LPP_PID_MASK,
    510				.cpumode = PERF_RECORD_MISC_CPUMODE_UNKNOWN,
    511				.cpu = sfq->cpu,
    512				.period = 1
    513			    };
    514	union perf_event event;
    515
    516	memset(&event, 0, sizeof(event));
    517	if (basic->CL == 1)	/* Native LPAR mode */
    518		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
    519					  : PERF_RECORD_MISC_KERNEL;
    520	else if (basic->CL == 2)	/* Guest kernel/user space */
    521		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
    522					  : PERF_RECORD_MISC_GUEST_KERNEL;
    523	else if (basic->gpp || basic->prim_asn != 0xffff)
    524		/* Use heuristics on old hardware */
    525		sample.cpumode = basic->P ? PERF_RECORD_MISC_GUEST_USER
    526					  : PERF_RECORD_MISC_GUEST_KERNEL;
    527	else
    528		sample.cpumode = basic->P ? PERF_RECORD_MISC_USER
    529					  : PERF_RECORD_MISC_KERNEL;
    530
    531	event.sample.header.type = PERF_RECORD_SAMPLE;
    532	event.sample.header.misc = sample.cpumode;
    533	event.sample.header.size = sizeof(struct perf_event_header);
    534
    535	pr_debug4("%s pos:%#zx ip:%#" PRIx64 " P:%d CL:%d pid:%d.%d cpumode:%d cpu:%d\n",
    536		 __func__, pos, sample.ip, basic->P, basic->CL, sample.pid,
    537		 sample.tid, sample.cpumode, sample.cpu);
    538	if (perf_session__deliver_synth_event(sfq->sf->session, &event,
    539					      &sample)) {
    540		pr_err("s390 Auxiliary Trace: failed to deliver event\n");
    541		return false;
    542	}
    543	return true;
    544}
    545
    546static unsigned long long get_trailer_time(const unsigned char *buf)
    547{
    548	struct hws_trailer_entry *te;
    549	unsigned long long aux_time, progusage2;
    550	bool clock_base;
    551
    552	te = (struct hws_trailer_entry *)(buf + S390_CPUMSF_PAGESZ
    553					      - sizeof(*te));
    554
    555#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    556	clock_base = be64toh(te->progusage[0]) >> 63 & 0x1;
    557	progusage2 = be64toh(te->progusage[1]);
    558#else
    559	clock_base = te->clock_base;
    560	progusage2 = te->progusage2;
    561#endif
    562	if (!clock_base)	/* TOD_CLOCK_BASE value missing */
    563		return 0;
    564
    565	/* Correct calculation to convert time stamp in trailer entry to
    566	 * nano seconds (taken from arch/s390 function tod_to_ns()).
    567	 * TOD_CLOCK_BASE is stored in trailer entry member progusage2.
    568	 */
    569	aux_time = trailer_timestamp(te, clock_base) - progusage2;
    570	aux_time = (aux_time >> 9) * 125 + (((aux_time & 0x1ff) * 125) >> 9);
    571	return aux_time;
    572}
    573
/* Process the data samples of a single queue. The first parameter is a
 * pointer to the queue, the second parameter is the time stamp. This
 * is the time stamp:
 * - of the event that triggered this processing.
 * - or the time stamp when the last processing of this queue stopped.
 *   In this case it stopped at a 4KB page boundary and record the
 *   position on where to continue processing on the next invocation
 *   (see buffer->use_data and buffer->use_size).
 *
 * When this function returns the second parameter is updated to
 * reflect the time stamp of the last processed auxiliary data entry
 * (taken from the trailer entry of that page). The caller uses this
 * returned time stamp to record the last processed entry in this
 * queue.
 *
 * The function returns:
 * 0:  Processing successful. The second parameter returns the
 *     time stamp from the trailer entry until which position
 *     processing took place. Subsequent calls resume from this
 *     position.
 * <0: An error occurred during processing. The second parameter
 *     returns the maximum time stamp.
 * >0: Done on this queue. The second parameter returns the
 *     maximum time stamp.
 */
static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
{
	struct s390_cpumsf *sf = sfq->sf;
	/* Remaining, not yet processed part of the current buffer */
	unsigned char *buf = sfq->buffer->use_data;
	size_t len = sfq->buffer->use_size;
	struct hws_basic_entry *basic;
	unsigned short bsdes, dsdes;
	size_t pos = 0;
	/* Preset to "buffer done", every exit through label out which does
	 * not set an error code reports the buffer as completely scanned.
	 */
	int err = 1;
	u64 aux_ts;

	if (!s390_cpumsf_validate(sf->machine_type, buf, len, &bsdes,
				  &dsdes)) {
		*ts = ~0ULL;
		return -1;
	}

	/* Get trailer entry time stamp and check if entries in
	 * this auxiliary page are ready for processing. If the
	 * time stamp of the first entry is too high, whole buffer
	 * can be skipped. In this case return time stamp.
	 */
	aux_ts = get_trailer_time(buf);
	if (!aux_ts) {
		/* No TOD clock base: give up on this buffer, err is still 1 */
		pr_err("[%#08" PRIx64 "] Invalid AUX trailer entry TOD clock base\n",
		       (s64)sfq->buffer->data_offset);
		aux_ts = ~0ULL;
		goto out;
	}
	if (aux_ts > *ts) {
		/* Page is newer than requested, resume here next time */
		*ts = aux_ts;
		return 0;
	}

	while (pos < len) {
		/* Handle Basic entry */
		basic = (struct hws_basic_entry *)(buf + pos);
		if (s390_cpumsf_make_event(pos, basic, sfq))
			pos += bsdes;
		else {
			err = -EBADF;
			goto out;
		}

		pos += dsdes;	/* Skip diagnostic entry */

		/* Check for trailer entry */
		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
			/* No room for another entry, advance to next page */
			pos = (pos + S390_CPUMSF_PAGESZ)
			       & ~(S390_CPUMSF_PAGESZ - 1);
			/* Check existence of next page */
			if (pos >= len)
				break;
			aux_ts = get_trailer_time(buf + pos);
			if (!aux_ts) {
				aux_ts = ~0ULL;
				goto out;
			}
			if (aux_ts > *ts) {
				/* Next page is too new, remember where to
				 * continue on the next invocation.
				 */
				*ts = aux_ts;
				sfq->buffer->use_data += pos;
				sfq->buffer->use_size -= pos;
				return 0;
			}
		}
	}
out:
	/* Nothing is left to process in this buffer */
	*ts = aux_ts;
	sfq->buffer->use_size = 0;
	sfq->buffer->use_data = NULL;
	return err;	/* Buffer completely scanned or error */
}
    671
    672/* Run the s390 auxiliary trace decoder.
    673 * Select the queue buffer to operate on, the caller already selected
    674 * the proper queue, depending on second parameter 'ts'.
    675 * This is the time stamp until which the auxiliary entries should
    676 * be processed. This value is updated by called functions and
    677 * returned to the caller.
    678 *
    679 * Resume processing in the current buffer. If there is no buffer
    680 * get a new buffer from the queue and setup start position for
    681 * processing.
    682 * When a buffer is completely processed remove it from the queue
    683 * before returning.
    684 *
    685 * This function returns
    686 * 1: When the queue is empty. Second parameter will be set to
    687 *    maximum time stamp.
    688 * 0: Normal processing done.
    689 * <0: Error during queue buffer setup. This causes the caller
    690 *     to stop processing completely.
    691 */
    692static int s390_cpumsf_run_decoder(struct s390_cpumsf_queue *sfq,
    693				   u64 *ts)
    694{
    695
    696	struct auxtrace_buffer *buffer;
    697	struct auxtrace_queue *queue;
    698	int err;
    699
    700	queue = &sfq->sf->queues.queue_array[sfq->queue_nr];
    701
    702	/* Get buffer and last position in buffer to resume
    703	 * decoding the auxiliary entries. One buffer might be large
    704	 * and decoding might stop in between. This depends on the time
    705	 * stamp of the trailer entry in each page of the auxiliary
    706	 * data and the time stamp of the event triggering the decoding.
    707	 */
    708	if (sfq->buffer == NULL) {
    709		sfq->buffer = buffer = auxtrace_buffer__next(queue,
    710							     sfq->buffer);
    711		if (!buffer) {
    712			*ts = ~0ULL;
    713			return 1;	/* Processing done on this queue */
    714		}
    715		/* Start with a new buffer on this queue */
    716		if (buffer->data) {
    717			buffer->use_size = buffer->size;
    718			buffer->use_data = buffer->data;
    719		}
    720		if (sfq->logfile) {	/* Write into log file */
    721			size_t rc = fwrite(buffer->data, buffer->size, 1,
    722					   sfq->logfile);
    723			if (rc != 1)
    724				pr_err("Failed to write auxiliary data\n");
    725		}
    726	} else
    727		buffer = sfq->buffer;
    728
    729	if (!buffer->data) {
    730		int fd = perf_data__fd(sfq->sf->session->data);
    731
    732		buffer->data = auxtrace_buffer__get_data(buffer, fd);
    733		if (!buffer->data)
    734			return -ENOMEM;
    735		buffer->use_size = buffer->size;
    736		buffer->use_data = buffer->data;
    737
    738		if (sfq->logfile) {	/* Write into log file */
    739			size_t rc = fwrite(buffer->data, buffer->size, 1,
    740					   sfq->logfile);
    741			if (rc != 1)
    742				pr_err("Failed to write auxiliary data\n");
    743		}
    744	}
    745	pr_debug4("%s queue_nr:%d buffer:%" PRId64 " offset:%#" PRIx64 " size:%#zx rest:%#zx\n",
    746		  __func__, sfq->queue_nr, buffer->buffer_nr, buffer->offset,
    747		  buffer->size, buffer->use_size);
    748	err = s390_cpumsf_samples(sfq, ts);
    749
    750	/* If non-zero, there is either an error (err < 0) or the buffer is
    751	 * completely done (err > 0). The error is unrecoverable, usually
    752	 * some descriptors could not be read successfully, so continue with
    753	 * the next buffer.
    754	 * In both cases the parameter 'ts' has been updated.
    755	 */
    756	if (err) {
    757		sfq->buffer = NULL;
    758		list_del_init(&buffer->list);
    759		auxtrace_buffer__free(buffer);
    760		if (err > 0)		/* Buffer done, no error */
    761			err = 0;
    762	}
    763	return err;
    764}
    765
    766static struct s390_cpumsf_queue *
    767s390_cpumsf_alloc_queue(struct s390_cpumsf *sf, unsigned int queue_nr)
    768{
    769	struct s390_cpumsf_queue *sfq;
    770
    771	sfq = zalloc(sizeof(struct s390_cpumsf_queue));
    772	if (sfq == NULL)
    773		return NULL;
    774
    775	sfq->sf = sf;
    776	sfq->queue_nr = queue_nr;
    777	sfq->cpu = -1;
    778	if (sf->use_logfile) {
    779		char *name;
    780		int rc;
    781
    782		rc = (sf->logdir)
    783			? asprintf(&name, "%s/aux.smp.%02x",
    784				 sf->logdir, queue_nr)
    785			: asprintf(&name, "aux.smp.%02x", queue_nr);
    786		if (rc > 0)
    787			sfq->logfile = fopen(name, "w");
    788		if (sfq->logfile == NULL) {
    789			pr_err("Failed to open auxiliary log file %s,"
    790			       "continue...\n", name);
    791			sf->use_logfile = false;
    792		}
    793		free(name);
    794	}
    795	return sfq;
    796}
    797
    798static int s390_cpumsf_setup_queue(struct s390_cpumsf *sf,
    799				   struct auxtrace_queue *queue,
    800				   unsigned int queue_nr, u64 ts)
    801{
    802	struct s390_cpumsf_queue *sfq = queue->priv;
    803
    804	if (list_empty(&queue->head))
    805		return 0;
    806
    807	if (sfq == NULL) {
    808		sfq = s390_cpumsf_alloc_queue(sf, queue_nr);
    809		if (!sfq)
    810			return -ENOMEM;
    811		queue->priv = sfq;
    812
    813		if (queue->cpu != -1)
    814			sfq->cpu = queue->cpu;
    815	}
    816	return auxtrace_heap__add(&sf->heap, queue_nr, ts);
    817}
    818
    819static int s390_cpumsf_setup_queues(struct s390_cpumsf *sf, u64 ts)
    820{
    821	unsigned int i;
    822	int ret = 0;
    823
    824	for (i = 0; i < sf->queues.nr_queues; i++) {
    825		ret = s390_cpumsf_setup_queue(sf, &sf->queues.queue_array[i],
    826					      i, ts);
    827		if (ret)
    828			break;
    829	}
    830	return ret;
    831}
    832
    833static int s390_cpumsf_update_queues(struct s390_cpumsf *sf, u64 ts)
    834{
    835	if (!sf->queues.new_data)
    836		return 0;
    837
    838	sf->queues.new_data = false;
    839	return s390_cpumsf_setup_queues(sf, ts);
    840}
    841
    842static int s390_cpumsf_process_queues(struct s390_cpumsf *sf, u64 timestamp)
    843{
    844	unsigned int queue_nr;
    845	u64 ts;
    846	int ret;
    847
    848	while (1) {
    849		struct auxtrace_queue *queue;
    850		struct s390_cpumsf_queue *sfq;
    851
    852		if (!sf->heap.heap_cnt)
    853			return 0;
    854
    855		if (sf->heap.heap_array[0].ordinal >= timestamp)
    856			return 0;
    857
    858		queue_nr = sf->heap.heap_array[0].queue_nr;
    859		queue = &sf->queues.queue_array[queue_nr];
    860		sfq = queue->priv;
    861
    862		auxtrace_heap__pop(&sf->heap);
    863		if (sf->heap.heap_cnt) {
    864			ts = sf->heap.heap_array[0].ordinal + 1;
    865			if (ts > timestamp)
    866				ts = timestamp;
    867		} else {
    868			ts = timestamp;
    869		}
    870
    871		ret = s390_cpumsf_run_decoder(sfq, &ts);
    872		if (ret < 0) {
    873			auxtrace_heap__add(&sf->heap, queue_nr, ts);
    874			return ret;
    875		}
    876		if (!ret) {
    877			ret = auxtrace_heap__add(&sf->heap, queue_nr, ts);
    878			if (ret < 0)
    879				return ret;
    880		}
    881	}
    882	return 0;
    883}
    884
    885static int s390_cpumsf_synth_error(struct s390_cpumsf *sf, int code, int cpu,
    886				   pid_t pid, pid_t tid, u64 ip, u64 timestamp)
    887{
    888	char msg[MAX_AUXTRACE_ERROR_MSG];
    889	union perf_event event;
    890	int err;
    891
    892	strncpy(msg, "Lost Auxiliary Trace Buffer", sizeof(msg) - 1);
    893	auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
    894			     code, cpu, pid, tid, ip, msg, timestamp);
    895
    896	err = perf_session__deliver_synth_event(sf->session, &event, NULL);
    897	if (err)
    898		pr_err("s390 Auxiliary Trace: failed to deliver error event,"
    899			"error %d\n", err);
    900	return err;
    901}
    902
    903static int s390_cpumsf_lost(struct s390_cpumsf *sf, struct perf_sample *sample)
    904{
    905	return s390_cpumsf_synth_error(sf, 1, sample->cpu,
    906				       sample->pid, sample->tid, 0,
    907				       sample->time);
    908}
    909
    910static int
    911s390_cpumsf_process_event(struct perf_session *session,
    912			  union perf_event *event,
    913			  struct perf_sample *sample,
    914			  struct perf_tool *tool)
    915{
    916	struct s390_cpumsf *sf = container_of(session->auxtrace,
    917					      struct s390_cpumsf,
    918					      auxtrace);
    919	u64 timestamp = sample->time;
    920	struct evsel *ev_bc000;
    921
    922	int err = 0;
    923
    924	if (dump_trace)
    925		return 0;
    926
    927	if (!tool->ordered_events) {
    928		pr_err("s390 Auxiliary Trace requires ordered events\n");
    929		return -EINVAL;
    930	}
    931
    932	if (event->header.type == PERF_RECORD_SAMPLE &&
    933	    sample->raw_size) {
    934		/* Handle event with raw data */
    935		ev_bc000 = evlist__event2evsel(session->evlist, event);
    936		if (ev_bc000 &&
    937		    ev_bc000->core.attr.config == PERF_EVENT_CPUM_CF_DIAG)
    938			err = s390_cpumcf_dumpctr(sf, sample);
    939		return err;
    940	}
    941
    942	if (event->header.type == PERF_RECORD_AUX &&
    943	    event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
    944		return s390_cpumsf_lost(sf, sample);
    945
    946	if (timestamp) {
    947		err = s390_cpumsf_update_queues(sf, timestamp);
    948		if (!err)
    949			err = s390_cpumsf_process_queues(sf, timestamp);
    950	}
    951	return err;
    952}
    953
    954struct s390_cpumsf_synth {
    955	struct perf_tool cpumsf_tool;
    956	struct perf_session *session;
    957};
    958
    959static int
    960s390_cpumsf_process_auxtrace_event(struct perf_session *session,
    961				   union perf_event *event __maybe_unused,
    962				   struct perf_tool *tool __maybe_unused)
    963{
    964	struct s390_cpumsf *sf = container_of(session->auxtrace,
    965					      struct s390_cpumsf,
    966					      auxtrace);
    967
    968	int fd = perf_data__fd(session->data);
    969	struct auxtrace_buffer *buffer;
    970	off_t data_offset;
    971	int err;
    972
    973	if (sf->data_queued)
    974		return 0;
    975
    976	if (perf_data__is_pipe(session->data)) {
    977		data_offset = 0;
    978	} else {
    979		data_offset = lseek(fd, 0, SEEK_CUR);
    980		if (data_offset == -1)
    981			return -errno;
    982	}
    983
    984	err = auxtrace_queues__add_event(&sf->queues, session, event,
    985					 data_offset, &buffer);
    986	if (err)
    987		return err;
    988
    989	/* Dump here after copying piped trace out of the pipe */
    990	if (dump_trace) {
    991		if (auxtrace_buffer__get_data(buffer, fd)) {
    992			s390_cpumsf_dump_event(sf, buffer->data,
    993					       buffer->size);
    994			auxtrace_buffer__put_data(buffer);
    995		}
    996	}
    997	return 0;
    998}
    999
   1000static void s390_cpumsf_free_events(struct perf_session *session __maybe_unused)
   1001{
   1002}
   1003
   1004static int s390_cpumsf_flush(struct perf_session *session __maybe_unused,
   1005			     struct perf_tool *tool __maybe_unused)
   1006{
   1007	return 0;
   1008}
   1009
   1010static void s390_cpumsf_free_queues(struct perf_session *session)
   1011{
   1012	struct s390_cpumsf *sf = container_of(session->auxtrace,
   1013					      struct s390_cpumsf,
   1014					      auxtrace);
   1015	struct auxtrace_queues *queues = &sf->queues;
   1016	unsigned int i;
   1017
   1018	for (i = 0; i < queues->nr_queues; i++) {
   1019		struct s390_cpumsf_queue *sfq = (struct s390_cpumsf_queue *)
   1020						queues->queue_array[i].priv;
   1021
   1022		if (sfq != NULL) {
   1023			if (sfq->logfile) {
   1024				fclose(sfq->logfile);
   1025				sfq->logfile = NULL;
   1026			}
   1027			if (sfq->logfile_ctr) {
   1028				fclose(sfq->logfile_ctr);
   1029				sfq->logfile_ctr = NULL;
   1030			}
   1031		}
   1032		zfree(&queues->queue_array[i].priv);
   1033	}
   1034	auxtrace_queues__free(queues);
   1035}
   1036
   1037static void s390_cpumsf_free(struct perf_session *session)
   1038{
   1039	struct s390_cpumsf *sf = container_of(session->auxtrace,
   1040					      struct s390_cpumsf,
   1041					      auxtrace);
   1042
   1043	auxtrace_heap__free(&sf->heap);
   1044	s390_cpumsf_free_queues(session);
   1045	session->auxtrace = NULL;
   1046	zfree(&sf->logdir);
   1047	free(sf);
   1048}
   1049
   1050static bool
   1051s390_cpumsf_evsel_is_auxtrace(struct perf_session *session __maybe_unused,
   1052			      struct evsel *evsel)
   1053{
   1054	return evsel->core.attr.type == PERF_TYPE_RAW &&
   1055	       evsel->core.attr.config == PERF_EVENT_CPUM_SF_DIAG;
   1056}
   1057
   1058static int s390_cpumsf_get_type(const char *cpuid)
   1059{
   1060	int ret, family = 0;
   1061
   1062	ret = sscanf(cpuid, "%*[^,],%u", &family);
   1063	return (ret == 1) ? family : 0;
   1064}
   1065
   1066/* Check itrace options set on perf report command.
   1067 * Return true, if none are set or all options specified can be
   1068 * handled on s390 (currently only option 'd' for logging.
   1069 * Return false otherwise.
   1070 */
   1071static bool check_auxtrace_itrace(struct itrace_synth_opts *itops)
   1072{
   1073	bool ison = false;
   1074
   1075	if (!itops || !itops->set)
   1076		return true;
   1077	ison = itops->inject || itops->instructions || itops->branches ||
   1078		itops->transactions || itops->ptwrites ||
   1079		itops->pwr_events || itops->errors ||
   1080		itops->dont_decode || itops->calls || itops->returns ||
   1081		itops->callchain || itops->thread_stack ||
   1082		itops->last_branch || itops->add_callchain ||
   1083		itops->add_last_branch;
   1084	if (!ison)
   1085		return true;
   1086	pr_err("Unsupported --itrace options specified\n");
   1087	return false;
   1088}
   1089
   1090/* Check for AUXTRACE dump directory if it is needed.
   1091 * On failure print an error message but continue.
   1092 * Return 0 on wrong keyword in config file and 1 otherwise.
   1093 */
   1094static int s390_cpumsf__config(const char *var, const char *value, void *cb)
   1095{
   1096	struct s390_cpumsf *sf = cb;
   1097	struct stat stbuf;
   1098	int rc;
   1099
   1100	if (strcmp(var, "auxtrace.dumpdir"))
   1101		return 0;
   1102	sf->logdir = strdup(value);
   1103	if (sf->logdir == NULL) {
   1104		pr_err("Failed to find auxtrace log directory %s,"
   1105		       " continue with current directory...\n", value);
   1106		return 1;
   1107	}
   1108	rc = stat(sf->logdir, &stbuf);
   1109	if (rc == -1 || !S_ISDIR(stbuf.st_mode)) {
   1110		pr_err("Missing auxtrace log directory %s,"
   1111		       " continue with current directory...\n", value);
   1112		zfree(&sf->logdir);
   1113	}
   1114	return 1;
   1115}
   1116
   1117int s390_cpumsf_process_auxtrace_info(union perf_event *event,
   1118				      struct perf_session *session)
   1119{
   1120	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
   1121	struct s390_cpumsf *sf;
   1122	int err;
   1123
   1124	if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info))
   1125		return -EINVAL;
   1126
   1127	sf = zalloc(sizeof(struct s390_cpumsf));
   1128	if (sf == NULL)
   1129		return -ENOMEM;
   1130
   1131	if (!check_auxtrace_itrace(session->itrace_synth_opts)) {
   1132		err = -EINVAL;
   1133		goto err_free;
   1134	}
   1135	sf->use_logfile = session->itrace_synth_opts->log;
   1136	if (sf->use_logfile)
   1137		perf_config(s390_cpumsf__config, sf);
   1138
   1139	err = auxtrace_queues__init(&sf->queues);
   1140	if (err)
   1141		goto err_free;
   1142
   1143	sf->session = session;
   1144	sf->machine = &session->machines.host; /* No kvm support */
   1145	sf->auxtrace_type = auxtrace_info->type;
   1146	sf->pmu_type = PERF_TYPE_RAW;
   1147	sf->machine_type = s390_cpumsf_get_type(session->evlist->env->cpuid);
   1148
   1149	sf->auxtrace.process_event = s390_cpumsf_process_event;
   1150	sf->auxtrace.process_auxtrace_event = s390_cpumsf_process_auxtrace_event;
   1151	sf->auxtrace.flush_events = s390_cpumsf_flush;
   1152	sf->auxtrace.free_events = s390_cpumsf_free_events;
   1153	sf->auxtrace.free = s390_cpumsf_free;
   1154	sf->auxtrace.evsel_is_auxtrace = s390_cpumsf_evsel_is_auxtrace;
   1155	session->auxtrace = &sf->auxtrace;
   1156
   1157	if (dump_trace)
   1158		return 0;
   1159
   1160	err = auxtrace_queues__process_index(&sf->queues, session);
   1161	if (err)
   1162		goto err_free_queues;
   1163
   1164	if (sf->queues.populated)
   1165		sf->data_queued = true;
   1166
   1167	return 0;
   1168
   1169err_free_queues:
   1170	auxtrace_queues__free(&sf->queues);
   1171	session->auxtrace = NULL;
   1172err_free:
   1173	zfree(&sf->logdir);
   1174	free(sf);
   1175	return err;
   1176}