cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

perf.c (22409B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *  Parisc performance counters
      4 *  Copyright (C) 2001 Randolph Chung <tausq@debian.org>
      5 *
      6 *  This code is derived, with permission, from HP/UX sources.
      7 */
      8
      9/*
     10 *  Edited comment from original sources:
     11 *
     12 *  This driver programs the PCX-U/PCX-W performance counters
     13 *  on the PA-RISC 2.0 chips.  The driver keeps all images now
     14 *  internally to the kernel to hopefully eliminate the possibility
     15 *  of a bad image halting the CPU.  Also, there are different
     16 *  images for the PCX-W and later chips vs the PCX-U chips.
     17 *
     18 *  Only 1 process is allowed to access the driver at any time,
     19 *  so the only protection that is needed is at open and close.
     20 *  A variable "perf_enabled" is used to hold the state of the
     21 *  driver.  The spinlock "perf_lock" is used to protect the
     22 *  modification of the state during open/close operations so
     23 *  multiple processes don't get into the driver simultaneously.
     24 *
     25 *  This driver accesses the processor directly vs going through
     26 *  the PDC INTRIGUE calls.  This is done to eliminate bugs introduced
     27 *  in various PDC revisions.  The code is much more maintainable
     28 *  and reliable this way vs having to debug on every version of PDC
     29 *  on every box.
     30 */
     31
     32#include <linux/capability.h>
     33#include <linux/init.h>
     34#include <linux/proc_fs.h>
     35#include <linux/miscdevice.h>
     36#include <linux/spinlock.h>
     37
     38#include <linux/uaccess.h>
     39#include <asm/perf.h>
     40#include <asm/parisc-device.h>
     41#include <asm/processor.h>
     42#include <asm/runway.h>
     43#include <asm/io.h>		/* for __raw_read() */
     44
     45#include "perf_images.h"
     46
     47#define MAX_RDR_WORDS	24
     48#define PERF_VERSION	2	/* derived from hpux's PI v2 interface */
     49
     50/* definition of RDR regs */
     51struct rdr_tbl_ent {
     52	uint16_t	width;
     53	uint8_t		num_words;
     54	uint8_t		write_control;
     55};
     56
     57static int perf_processor_interface __read_mostly = UNKNOWN_INTF;
     58static int perf_enabled __read_mostly;
     59static DEFINE_SPINLOCK(perf_lock);
     60struct parisc_device *cpu_device __read_mostly;
     61
     62/* RDRs to write for PCX-W */
     63static const int perf_rdrs_W[] =
     64	{ 0, 1, 4, 5, 6, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, -1 };
     65
     66/* RDRs to write for PCX-U */
     67static const int perf_rdrs_U[] =
     68	{ 0, 1, 4, 5, 6, 7, 16, 17, 18, 20, 21, 22, 23, 24, 25, -1 };
     69
     70/* RDR register descriptions for PCX-W */
     71static const struct rdr_tbl_ent perf_rdr_tbl_W[] = {
     72	{ 19,	1,	8 },   /* RDR 0 */
     73	{ 16,	1,	16 },  /* RDR 1 */
     74	{ 72,	2,	0 },   /* RDR 2 */
     75	{ 81,	2,	0 },   /* RDR 3 */
     76	{ 328,	6,	0 },   /* RDR 4 */
     77	{ 160,	3,	0 },   /* RDR 5 */
     78	{ 336,	6,	0 },   /* RDR 6 */
     79	{ 164,	3,	0 },   /* RDR 7 */
     80	{ 0,	0,	0 },   /* RDR 8 */
     81	{ 35,	1,	0 },   /* RDR 9 */
     82	{ 6,	1,	0 },   /* RDR 10 */
     83	{ 18,	1,	0 },   /* RDR 11 */
     84	{ 13,	1,	0 },   /* RDR 12 */
     85	{ 8,	1,	0 },   /* RDR 13 */
     86	{ 8,	1,	0 },   /* RDR 14 */
     87	{ 8,	1,	0 },   /* RDR 15 */
     88	{ 1530,	24,	0 },   /* RDR 16 */
     89	{ 16,	1,	0 },   /* RDR 17 */
     90	{ 4,	1,	0 },   /* RDR 18 */
     91	{ 0,	0,	0 },   /* RDR 19 */
     92	{ 152,	3,	24 },  /* RDR 20 */
     93	{ 152,	3,	24 },  /* RDR 21 */
     94	{ 233,	4,	48 },  /* RDR 22 */
     95	{ 233,	4,	48 },  /* RDR 23 */
     96	{ 71,	2,	0 },   /* RDR 24 */
     97	{ 71,	2,	0 },   /* RDR 25 */
     98	{ 11,	1,	0 },   /* RDR 26 */
     99	{ 18,	1,	0 },   /* RDR 27 */
    100	{ 128,	2,	0 },   /* RDR 28 */
    101	{ 0,	0,	0 },   /* RDR 29 */
    102	{ 16,	1,	0 },   /* RDR 30 */
    103	{ 16,	1,	0 },   /* RDR 31 */
    104};
    105
    106/* RDR register descriptions for PCX-U */
    107static const struct rdr_tbl_ent perf_rdr_tbl_U[] = {
    108	{ 19,	1,	8 },              /* RDR 0 */
    109	{ 32,	1,	16 },             /* RDR 1 */
    110	{ 20,	1,	0 },              /* RDR 2 */
    111	{ 0,	0,	0 },              /* RDR 3 */
    112	{ 344,	6,	0 },              /* RDR 4 */
    113	{ 176,	3,	0 },              /* RDR 5 */
    114	{ 336,	6,	0 },              /* RDR 6 */
    115	{ 0,	0,	0 },              /* RDR 7 */
    116	{ 0,	0,	0 },              /* RDR 8 */
    117	{ 0,	0,	0 },              /* RDR 9 */
    118	{ 28,	1,	0 },              /* RDR 10 */
    119	{ 33,	1,	0 },              /* RDR 11 */
    120	{ 0,	0,	0 },              /* RDR 12 */
    121	{ 230,	4,	0 },              /* RDR 13 */
    122	{ 32,	1,	0 },              /* RDR 14 */
    123	{ 128,	2,	0 },              /* RDR 15 */
    124	{ 1494,	24,	0 },              /* RDR 16 */
    125	{ 18,	1,	0 },              /* RDR 17 */
    126	{ 4,	1,	0 },              /* RDR 18 */
    127	{ 0,	0,	0 },              /* RDR 19 */
    128	{ 158,	3,	24 },             /* RDR 20 */
    129	{ 158,	3,	24 },             /* RDR 21 */
    130	{ 194,	4,	48 },             /* RDR 22 */
    131	{ 194,	4,	48 },             /* RDR 23 */
    132	{ 71,	2,	0 },              /* RDR 24 */
    133	{ 71,	2,	0 },              /* RDR 25 */
    134	{ 28,	1,	0 },              /* RDR 26 */
    135	{ 33,	1,	0 },              /* RDR 27 */
    136	{ 88,	2,	0 },              /* RDR 28 */
    137	{ 32,	1,	0 },              /* RDR 29 */
    138	{ 24,	1,	0 },              /* RDR 30 */
    139	{ 16,	1,	0 },              /* RDR 31 */
    140};
    141
    142/*
    143 * A non-zero write_control in the above tables is a byte offset into
    144 * this array.
    145 */
    146static const uint64_t perf_bitmasks[] = {
    147	0x0000000000000000ul,     /* first dbl word must be zero */
    148	0xfdffe00000000000ul,     /* RDR0 bitmask */
    149	0x003f000000000000ul,     /* RDR1 bitmask */
    150	0x00fffffffffffffful,     /* RDR20-RDR21 bitmask (152 bits) */
    151	0xfffffffffffffffful,
    152	0xfffffffc00000000ul,
    153	0xfffffffffffffffful,     /* RDR22-RDR23 bitmask (233 bits) */
    154	0xfffffffffffffffful,
    155	0xfffffffffffffffcul,
    156	0xff00000000000000ul
    157};
    158
    159/*
    160 * Write control bitmasks for Pa-8700 processor given
    161 * some things have changed slightly.
    162 */
    163static const uint64_t perf_bitmasks_piranha[] = {
    164	0x0000000000000000ul,     /* first dbl word must be zero */
    165	0xfdffe00000000000ul,     /* RDR0 bitmask */
    166	0x003f000000000000ul,     /* RDR1 bitmask */
    167	0x00fffffffffffffful,     /* RDR20-RDR21 bitmask (158 bits) */
    168	0xfffffffffffffffful,
    169	0xfffffffc00000000ul,
    170	0xfffffffffffffffful,     /* RDR22-RDR23 bitmask (210 bits) */
    171	0xfffffffffffffffful,
    172	0xfffffffffffffffful,
    173	0xfffc000000000000ul
    174};
    175
    176static const uint64_t *bitmask_array;   /* array of bitmasks to use */
    177
    178/******************************************************************************
    179 * Function Prototypes
    180 *****************************************************************************/
    181static int perf_config(uint32_t *image_ptr);
    182static int perf_release(struct inode *inode, struct file *file);
    183static int perf_open(struct inode *inode, struct file *file);
    184static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t *ppos);
    185static ssize_t perf_write(struct file *file, const char __user *buf,
    186	size_t count, loff_t *ppos);
    187static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
    188static void perf_start_counters(void);
    189static int perf_stop_counters(uint32_t *raddr);
    190static const struct rdr_tbl_ent * perf_rdr_get_entry(uint32_t rdr_num);
    191static int perf_rdr_read_ubuf(uint32_t	rdr_num, uint64_t *buffer);
    192static int perf_rdr_clear(uint32_t rdr_num);
    193static int perf_write_image(uint64_t *memaddr);
    194static void perf_rdr_write(uint32_t rdr_num, uint64_t *buffer);
    195
    196/* External Assembly Routines */
    197extern uint64_t perf_rdr_shift_in_W (uint32_t rdr_num, uint16_t width);
    198extern uint64_t perf_rdr_shift_in_U (uint32_t rdr_num, uint16_t width);
    199extern void perf_rdr_shift_out_W (uint32_t rdr_num, uint64_t buffer);
    200extern void perf_rdr_shift_out_U (uint32_t rdr_num, uint64_t buffer);
    201extern void perf_intrigue_enable_perf_counters (void);
    202extern void perf_intrigue_disable_perf_counters (void);
    203
    204/******************************************************************************
    205 * Function Definitions
    206 *****************************************************************************/
    207
    208
    209/*
    210 * configure:
    211 *
    212 * Configure the cpu with a given data image.  First turn off the counters,
    213 * then download the image, then turn the counters back on.
    214 */
    215static int perf_config(uint32_t *image_ptr)
    216{
    217	long error;
    218	uint32_t raddr[4];
    219
    220	/* Stop the counters*/
    221	error = perf_stop_counters(raddr);
    222	if (error != 0) {
    223		printk("perf_config: perf_stop_counters = %ld\n", error);
    224		return -EINVAL;
    225	}
    226
    227printk("Preparing to write image\n");
    228	/* Write the image to the chip */
    229	error = perf_write_image((uint64_t *)image_ptr);
    230	if (error != 0) {
    231		printk("perf_config: DOWNLOAD = %ld\n", error);
    232		return -EINVAL;
    233	}
    234
    235printk("Preparing to start counters\n");
    236
    237	/* Start the counters */
    238	perf_start_counters();
    239
    240	return sizeof(uint32_t);
    241}
    242
    243/*
    244 * Open the device and initialize all of its memory.  The device is only
    245 * opened once, but can be "queried" by multiple processes that know its
    246 * file descriptor.
    247 */
    248static int perf_open(struct inode *inode, struct file *file)
    249{
    250	spin_lock(&perf_lock);
    251	if (perf_enabled) {
    252		spin_unlock(&perf_lock);
    253		return -EBUSY;
    254	}
    255	perf_enabled = 1;
    256 	spin_unlock(&perf_lock);
    257
    258	return 0;
    259}
    260
    261/*
    262 * Close the device.
    263 */
    264static int perf_release(struct inode *inode, struct file *file)
    265{
    266	spin_lock(&perf_lock);
    267	perf_enabled = 0;
    268	spin_unlock(&perf_lock);
    269
    270	return 0;
    271}
    272
    273/*
    274 * Read does nothing for this driver
    275 */
    276static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t *ppos)
    277{
    278	return 0;
    279}
    280
    281/*
    282 * write:
    283 *
    284 * This routine downloads the image to the chip.  It must be
    285 * called on the processor that the download should happen
    286 * on.
    287 */
    288static ssize_t perf_write(struct file *file, const char __user *buf,
    289	size_t count, loff_t *ppos)
    290{
    291	size_t image_size;
    292	uint32_t image_type;
    293	uint32_t interface_type;
    294	uint32_t test;
    295
    296	if (perf_processor_interface == ONYX_INTF)
    297		image_size = PCXU_IMAGE_SIZE;
    298	else if (perf_processor_interface == CUDA_INTF)
    299		image_size = PCXW_IMAGE_SIZE;
    300	else
    301		return -EFAULT;
    302
    303	if (!perfmon_capable())
    304		return -EACCES;
    305
    306	if (count != sizeof(uint32_t))
    307		return -EIO;
    308
    309	if (copy_from_user(&image_type, buf, sizeof(uint32_t)))
    310		return -EFAULT;
    311
    312	/* Get the interface type and test type */
    313   	interface_type = (image_type >> 16) & 0xffff;
    314	test           = (image_type & 0xffff);
    315
    316	/* Make sure everything makes sense */
    317
    318	/* First check the machine type is correct for
    319	   the requested image */
    320	if (((perf_processor_interface == CUDA_INTF) &&
    321			(interface_type != CUDA_INTF)) ||
    322		((perf_processor_interface == ONYX_INTF) &&
    323			(interface_type != ONYX_INTF)))
    324		return -EINVAL;
    325
    326	/* Next check to make sure the requested image
    327	   is valid */
    328	if (((interface_type == CUDA_INTF) &&
    329		       (test >= MAX_CUDA_IMAGES)) ||
    330	    ((interface_type == ONYX_INTF) &&
    331		       (test >= MAX_ONYX_IMAGES)))
    332		return -EINVAL;
    333
    334	/* Copy the image into the processor */
    335	if (interface_type == CUDA_INTF)
    336		return perf_config(cuda_images[test]);
    337	else
    338		return perf_config(onyx_images[test]);
    339
    340	return count;
    341}
    342
    343/*
    344 * Patch the images that need to know the IVA addresses.
    345 */
    346static void perf_patch_images(void)
    347{
    348#if 0 /* FIXME!! */
    349/*
    350 * NOTE:  this routine is VERY specific to the current TLB image.
    351 * If the image is changed, this routine might also need to be changed.
    352 */
    353	extern void $i_itlb_miss_2_0();
    354	extern void $i_dtlb_miss_2_0();
    355	extern void PA2_0_iva();
    356
    357	/*
    358	 * We can only use the lower 32-bits, the upper 32-bits should be 0
    359	 * anyway given this is in the kernel
    360	 */
    361	uint32_t itlb_addr  = (uint32_t)&($i_itlb_miss_2_0);
    362	uint32_t dtlb_addr  = (uint32_t)&($i_dtlb_miss_2_0);
    363	uint32_t IVAaddress = (uint32_t)&PA2_0_iva;
    364
    365	if (perf_processor_interface == ONYX_INTF) {
    366		/* clear last 2 bytes */
    367		onyx_images[TLBMISS][15] &= 0xffffff00;
    368		/* set 2 bytes */
    369		onyx_images[TLBMISS][15] |= (0x000000ff&((dtlb_addr) >> 24));
    370		onyx_images[TLBMISS][16] = (dtlb_addr << 8)&0xffffff00;
    371		onyx_images[TLBMISS][17] = itlb_addr;
    372
    373		/* clear last 2 bytes */
    374		onyx_images[TLBHANDMISS][15] &= 0xffffff00;
    375		/* set 2 bytes */
    376		onyx_images[TLBHANDMISS][15] |= (0x000000ff&((dtlb_addr) >> 24));
    377		onyx_images[TLBHANDMISS][16] = (dtlb_addr << 8)&0xffffff00;
    378		onyx_images[TLBHANDMISS][17] = itlb_addr;
    379
    380		/* clear last 2 bytes */
    381		onyx_images[BIG_CPI][15] &= 0xffffff00;
    382		/* set 2 bytes */
    383		onyx_images[BIG_CPI][15] |= (0x000000ff&((dtlb_addr) >> 24));
    384		onyx_images[BIG_CPI][16] = (dtlb_addr << 8)&0xffffff00;
    385		onyx_images[BIG_CPI][17] = itlb_addr;
    386
    387	    onyx_images[PANIC][15] &= 0xffffff00;  /* clear last 2 bytes */
    388	 	onyx_images[PANIC][15] |= (0x000000ff&((IVAaddress) >> 24)); /* set 2 bytes */
    389		onyx_images[PANIC][16] = (IVAaddress << 8)&0xffffff00;
    390
    391
    392	} else if (perf_processor_interface == CUDA_INTF) {
    393		/* Cuda interface */
    394		cuda_images[TLBMISS][16] =
    395			(cuda_images[TLBMISS][16]&0xffff0000) |
    396			((dtlb_addr >> 8)&0x0000ffff);
    397		cuda_images[TLBMISS][17] =
    398			((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff);
    399		cuda_images[TLBMISS][18] = (itlb_addr << 16)&0xffff0000;
    400
    401		cuda_images[TLBHANDMISS][16] =
    402			(cuda_images[TLBHANDMISS][16]&0xffff0000) |
    403			((dtlb_addr >> 8)&0x0000ffff);
    404		cuda_images[TLBHANDMISS][17] =
    405			((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff);
    406		cuda_images[TLBHANDMISS][18] = (itlb_addr << 16)&0xffff0000;
    407
    408		cuda_images[BIG_CPI][16] =
    409			(cuda_images[BIG_CPI][16]&0xffff0000) |
    410			((dtlb_addr >> 8)&0x0000ffff);
    411		cuda_images[BIG_CPI][17] =
    412			((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff);
    413		cuda_images[BIG_CPI][18] = (itlb_addr << 16)&0xffff0000;
    414	} else {
    415		/* Unknown type */
    416	}
    417#endif
    418}
    419
    420
    421/*
    422 * ioctl routine
    423 * All routines effect the processor that they are executed on.  Thus you
    424 * must be running on the processor that you wish to change.
    425 */
    426
    427static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
    428{
    429	long error_start;
    430	uint32_t raddr[4];
    431	int error = 0;
    432
    433	switch (cmd) {
    434
    435	    case PA_PERF_ON:
    436			/* Start the counters */
    437			perf_start_counters();
    438			break;
    439
    440	    case PA_PERF_OFF:
    441			error_start = perf_stop_counters(raddr);
    442			if (error_start != 0) {
    443				printk(KERN_ERR "perf_off: perf_stop_counters = %ld\n", error_start);
    444				error = -EFAULT;
    445				break;
    446			}
    447
    448			/* copy out the Counters */
    449			if (copy_to_user((void __user *)arg, raddr,
    450					sizeof (raddr)) != 0) {
    451				error =  -EFAULT;
    452				break;
    453			}
    454			break;
    455
    456	    case PA_PERF_VERSION:
    457  	  		/* Return the version # */
    458			error = put_user(PERF_VERSION, (int *)arg);
    459			break;
    460
    461	    default:
    462  	 		error = -ENOTTY;
    463	}
    464
    465	return error;
    466}
    467
    468static const struct file_operations perf_fops = {
    469	.llseek = no_llseek,
    470	.read = perf_read,
    471	.write = perf_write,
    472	.unlocked_ioctl = perf_ioctl,
    473	.compat_ioctl = perf_ioctl,
    474	.open = perf_open,
    475	.release = perf_release
    476};
    477
    478static struct miscdevice perf_dev = {
    479	MISC_DYNAMIC_MINOR,
    480	PA_PERF_DEV,
    481	&perf_fops
    482};
    483
    484/*
    485 * Initialize the module
    486 */
    487static int __init perf_init(void)
    488{
    489	int ret;
    490
    491	/* Determine correct processor interface to use */
    492	bitmask_array = perf_bitmasks;
    493
    494	if (boot_cpu_data.cpu_type == pcxu ||
    495	    boot_cpu_data.cpu_type == pcxu_) {
    496		perf_processor_interface = ONYX_INTF;
    497	} else if (boot_cpu_data.cpu_type == pcxw ||
    498		 boot_cpu_data.cpu_type == pcxw_ ||
    499		 boot_cpu_data.cpu_type == pcxw2 ||
    500		 boot_cpu_data.cpu_type == mako ||
    501		 boot_cpu_data.cpu_type == mako2) {
    502		perf_processor_interface = CUDA_INTF;
    503		if (boot_cpu_data.cpu_type == pcxw2 ||
    504		    boot_cpu_data.cpu_type == mako ||
    505		    boot_cpu_data.cpu_type == mako2)
    506			bitmask_array = perf_bitmasks_piranha;
    507	} else {
    508		perf_processor_interface = UNKNOWN_INTF;
    509		printk("Performance monitoring counters not supported on this processor\n");
    510		return -ENODEV;
    511	}
    512
    513	ret = misc_register(&perf_dev);
    514	if (ret) {
    515		printk(KERN_ERR "Performance monitoring counters: "
    516			"cannot register misc device.\n");
    517		return ret;
    518	}
    519
    520	/* Patch the images to match the system */
    521    	perf_patch_images();
    522
    523	/* TODO: this only lets us access the first cpu.. what to do for SMP? */
    524	cpu_device = per_cpu(cpu_data, 0).dev;
    525	printk("Performance monitoring counters enabled for %s\n",
    526		per_cpu(cpu_data, 0).dev->name);
    527
    528	return 0;
    529}
    530device_initcall(perf_init);
    531
    532/*
    533 * perf_start_counters(void)
    534 *
    535 * Start the counters.
    536 */
    537static void perf_start_counters(void)
    538{
    539	/* Enable performance monitor counters */
    540	perf_intrigue_enable_perf_counters();
    541}
    542
    543/*
    544 * perf_stop_counters
    545 *
    546 * Stop the performance counters and save counts
    547 * in a per_processor array.
    548 */
    549static int perf_stop_counters(uint32_t *raddr)
    550{
    551	uint64_t userbuf[MAX_RDR_WORDS];
    552
    553	/* Disable performance counters */
    554	perf_intrigue_disable_perf_counters();
    555
    556	if (perf_processor_interface == ONYX_INTF) {
    557		uint64_t tmp64;
    558		/*
    559		 * Read the counters
    560		 */
    561		if (!perf_rdr_read_ubuf(16, userbuf))
    562			return -13;
    563
    564		/* Counter0 is bits 1398 to 1429 */
    565		tmp64 =  (userbuf[21] << 22) & 0x00000000ffc00000;
    566		tmp64 |= (userbuf[22] >> 42) & 0x00000000003fffff;
    567		/* OR sticky0 (bit 1430) to counter0 bit 32 */
    568		tmp64 |= (userbuf[22] >> 10) & 0x0000000080000000;
    569		raddr[0] = (uint32_t)tmp64;
    570
    571		/* Counter1 is bits 1431 to 1462 */
    572		tmp64 =  (userbuf[22] >> 9) & 0x00000000ffffffff;
    573		/* OR sticky1 (bit 1463) to counter1 bit 32 */
    574		tmp64 |= (userbuf[22] << 23) & 0x0000000080000000;
    575		raddr[1] = (uint32_t)tmp64;
    576
    577		/* Counter2 is bits 1464 to 1495 */
    578		tmp64 =  (userbuf[22] << 24) & 0x00000000ff000000;
    579		tmp64 |= (userbuf[23] >> 40) & 0x0000000000ffffff;
    580		/* OR sticky2 (bit 1496) to counter2 bit 32 */
    581		tmp64 |= (userbuf[23] >> 8) & 0x0000000080000000;
    582		raddr[2] = (uint32_t)tmp64;
    583
    584		/* Counter3 is bits 1497 to 1528 */
    585		tmp64 =  (userbuf[23] >> 7) & 0x00000000ffffffff;
    586		/* OR sticky3 (bit 1529) to counter3 bit 32 */
    587		tmp64 |= (userbuf[23] << 25) & 0x0000000080000000;
    588		raddr[3] = (uint32_t)tmp64;
    589
    590		/*
    591		 * Zero out the counters
    592		 */
    593
    594		/*
    595		 * The counters and sticky-bits comprise the last 132 bits
    596		 * (1398 - 1529) of RDR16 on a U chip.  We'll zero these
    597		 * out the easy way: zero out last 10 bits of dword 21,
    598		 * all of dword 22 and 58 bits (plus 6 don't care bits) of
    599		 * dword 23.
    600		 */
    601		userbuf[21] &= 0xfffffffffffffc00ul;	/* 0 to last 10 bits */
    602		userbuf[22] = 0;
    603		userbuf[23] = 0;
    604
    605		/*
    606		 * Write back the zeroed bytes + the image given
    607		 * the read was destructive.
    608		 */
    609		perf_rdr_write(16, userbuf);
    610	} else {
    611
    612		/*
    613		 * Read RDR-15 which contains the counters and sticky bits
    614		 */
    615		if (!perf_rdr_read_ubuf(15, userbuf)) {
    616			return -13;
    617		}
    618
    619		/*
    620		 * Clear out the counters
    621		 */
    622		perf_rdr_clear(15);
    623
    624		/*
    625		 * Copy the counters 
    626		 */
    627		raddr[0] = (uint32_t)((userbuf[0] >> 32) & 0x00000000ffffffffUL);
    628		raddr[1] = (uint32_t)(userbuf[0] & 0x00000000ffffffffUL);
    629		raddr[2] = (uint32_t)((userbuf[1] >> 32) & 0x00000000ffffffffUL);
    630		raddr[3] = (uint32_t)(userbuf[1] & 0x00000000ffffffffUL);
    631	}
    632
    633	return 0;
    634}
    635
    636/*
    637 * perf_rdr_get_entry
    638 *
    639 * Retrieve a pointer to the description of what this
    640 * RDR contains.
    641 */
    642static const struct rdr_tbl_ent * perf_rdr_get_entry(uint32_t rdr_num)
    643{
    644	if (perf_processor_interface == ONYX_INTF) {
    645		return &perf_rdr_tbl_U[rdr_num];
    646	} else {
    647		return &perf_rdr_tbl_W[rdr_num];
    648	}
    649}
    650
    651/*
    652 * perf_rdr_read_ubuf
    653 *
    654 * Read the RDR value into the buffer specified.
    655 */
    656static int perf_rdr_read_ubuf(uint32_t	rdr_num, uint64_t *buffer)
    657{
    658	uint64_t	data, data_mask = 0;
    659	uint32_t	width, xbits, i;
    660	const struct rdr_tbl_ent *tentry;
    661
    662	tentry = perf_rdr_get_entry(rdr_num);
    663	if ((width = tentry->width) == 0)
    664		return 0;
    665
    666	/* Clear out buffer */
    667	i = tentry->num_words;
    668	while (i--) {
    669		buffer[i] = 0;
    670	}
    671
    672	/* Check for bits an even number of 64 */
    673	if ((xbits = width & 0x03f) != 0) {
    674		data_mask = 1;
    675		data_mask <<= (64 - xbits);
    676		data_mask--;
    677	}
    678
    679	/* Grab all of the data */
    680	i = tentry->num_words;
    681	while (i--) {
    682
    683		if (perf_processor_interface == ONYX_INTF) {
    684			data = perf_rdr_shift_in_U(rdr_num, width);
    685		} else {
    686			data = perf_rdr_shift_in_W(rdr_num, width);
    687		}
    688		if (xbits) {
    689			buffer[i] |= (data << (64 - xbits));
    690			if (i) {
    691				buffer[i-1] |= ((data >> xbits) & data_mask);
    692			}
    693		} else {
    694			buffer[i] = data;
    695		}
    696	}
    697
    698	return 1;
    699}
    700
    701/*
    702 * perf_rdr_clear
    703 *
    704 * Zero out the given RDR register
    705 */
    706static int perf_rdr_clear(uint32_t	rdr_num)
    707{
    708	const struct rdr_tbl_ent *tentry;
    709	int32_t		i;
    710
    711	tentry = perf_rdr_get_entry(rdr_num);
    712
    713	if (tentry->width == 0) {
    714		return -1;
    715	}
    716
    717	i = tentry->num_words;
    718	while (i--) {
    719		if (perf_processor_interface == ONYX_INTF) {
    720			perf_rdr_shift_out_U(rdr_num, 0UL);
    721		} else {
    722			perf_rdr_shift_out_W(rdr_num, 0UL);
    723		}
    724	}
    725
    726	return 0;
    727}
    728
    729
    730/*
    731 * perf_write_image
    732 *
    733 * Write the given image out to the processor
    734 */
    735static int perf_write_image(uint64_t *memaddr)
    736{
    737	uint64_t buffer[MAX_RDR_WORDS];
    738	uint64_t *bptr;
    739	uint32_t dwords;
    740	const uint32_t *intrigue_rdr;
    741	const uint64_t *intrigue_bitmask;
    742	uint64_t tmp64;
    743	void __iomem *runway;
    744	const struct rdr_tbl_ent *tentry;
    745	int i;
    746
    747	/* Clear out counters */
    748	if (perf_processor_interface == ONYX_INTF) {
    749
    750		perf_rdr_clear(16);
    751
    752		/* Toggle performance monitor */
    753		perf_intrigue_enable_perf_counters();
    754		perf_intrigue_disable_perf_counters();
    755
    756		intrigue_rdr = perf_rdrs_U;
    757	} else {
    758		perf_rdr_clear(15);
    759		intrigue_rdr = perf_rdrs_W;
    760	}
    761
    762	/* Write all RDRs */
    763	while (*intrigue_rdr != -1) {
    764		tentry = perf_rdr_get_entry(*intrigue_rdr);
    765		perf_rdr_read_ubuf(*intrigue_rdr, buffer);
    766		bptr   = &buffer[0];
    767		dwords = tentry->num_words;
    768		if (tentry->write_control) {
    769			intrigue_bitmask = &bitmask_array[tentry->write_control >> 3];
    770			while (dwords--) {
    771				tmp64 = *intrigue_bitmask & *memaddr++;
    772				tmp64 |= (~(*intrigue_bitmask++)) & *bptr;
    773				*bptr++ = tmp64;
    774			}
    775		} else {
    776			while (dwords--) {
    777				*bptr++ = *memaddr++;
    778			}
    779		}
    780
    781		perf_rdr_write(*intrigue_rdr, buffer);
    782		intrigue_rdr++;
    783	}
    784
    785	/*
    786	 * Now copy out the Runway stuff which is not in RDRs
    787	 */
    788
    789	if (cpu_device == NULL)
    790	{
    791		printk(KERN_ERR "write_image: cpu_device not yet initialized!\n");
    792		return -1;
    793	}
    794
    795	runway = ioremap(cpu_device->hpa.start, 4096);
    796	if (!runway) {
    797		pr_err("perf_write_image: ioremap failed!\n");
    798		return -ENOMEM;
    799	}
    800
    801	/* Merge intrigue bits into Runway STATUS 0 */
    802	tmp64 = __raw_readq(runway + RUNWAY_STATUS) & 0xffecfffffffffffful;
    803	__raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul),
    804		     runway + RUNWAY_STATUS);
    805
    806	/* Write RUNWAY DEBUG registers */
    807	for (i = 0; i < 8; i++) {
    808		__raw_writeq(*memaddr++, runway + RUNWAY_DEBUG);
    809	}
    810
    811	return 0;
    812}
    813
    814/*
    815 * perf_rdr_write
    816 *
    817 * Write the given RDR register with the contents
    818 * of the given buffer.
    819 */
    820static void perf_rdr_write(uint32_t rdr_num, uint64_t *buffer)
    821{
    822	const struct rdr_tbl_ent *tentry;
    823	int32_t		i;
    824
    825printk("perf_rdr_write\n");
    826	tentry = perf_rdr_get_entry(rdr_num);
    827	if (tentry->width == 0) { return; }
    828
    829	i = tentry->num_words;
    830	while (i--) {
    831		if (perf_processor_interface == ONYX_INTF) {
    832			perf_rdr_shift_out_U(rdr_num, buffer[i]);
    833		} else {
    834			perf_rdr_shift_out_W(rdr_num, buffer[i]);
    835		}
    836	}
    837printk("perf_rdr_write done\n");
    838}