cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

intel_rapl_common.c (42605B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Common code for Intel Running Average Power Limit (RAPL) support.
      4 * Copyright (c) 2019, Intel Corporation.
      5 */
      6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      7
      8#include <linux/kernel.h>
      9#include <linux/module.h>
     10#include <linux/list.h>
     11#include <linux/types.h>
     12#include <linux/device.h>
     13#include <linux/slab.h>
     14#include <linux/log2.h>
     15#include <linux/bitmap.h>
     16#include <linux/delay.h>
     17#include <linux/sysfs.h>
     18#include <linux/cpu.h>
     19#include <linux/powercap.h>
     20#include <linux/suspend.h>
     21#include <linux/intel_rapl.h>
     22#include <linux/processor.h>
     23#include <linux/platform_device.h>
     24
     25#include <asm/iosf_mbi.h>
     26#include <asm/cpu_device_id.h>
     27#include <asm/intel-family.h>
     28
     29/* bitmasks for RAPL MSRs, used by primitive access functions */
     30#define ENERGY_STATUS_MASK      0xffffffff
     31
     32#define POWER_LIMIT1_MASK       0x7FFF
     33#define POWER_LIMIT1_ENABLE     BIT(15)
     34#define POWER_LIMIT1_CLAMP      BIT(16)
     35
     36#define POWER_LIMIT2_MASK       (0x7FFFULL<<32)
     37#define POWER_LIMIT2_ENABLE     BIT_ULL(47)
     38#define POWER_LIMIT2_CLAMP      BIT_ULL(48)
     39#define POWER_HIGH_LOCK         BIT_ULL(63)
     40#define POWER_LOW_LOCK          BIT(31)
     41
     42#define POWER_LIMIT4_MASK		0x1FFF
     43
     44#define TIME_WINDOW1_MASK       (0x7FULL<<17)
     45#define TIME_WINDOW2_MASK       (0x7FULL<<49)
     46
     47#define POWER_UNIT_OFFSET	0
     48#define POWER_UNIT_MASK		0x0F
     49
     50#define ENERGY_UNIT_OFFSET	0x08
     51#define ENERGY_UNIT_MASK	0x1F00
     52
     53#define TIME_UNIT_OFFSET	0x10
     54#define TIME_UNIT_MASK		0xF0000
     55
     56#define POWER_INFO_MAX_MASK     (0x7fffULL<<32)
     57#define POWER_INFO_MIN_MASK     (0x7fffULL<<16)
     58#define POWER_INFO_MAX_TIME_WIN_MASK     (0x3fULL<<48)
     59#define POWER_INFO_THERMAL_SPEC_MASK     0x7fff
     60
     61#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff
     62#define PP_POLICY_MASK         0x1F
     63
     64/*
     65 * SPR has different layout for Psys Domain PowerLimit registers.
     66 * There are 17 bits of PL1 and PL2 instead of 15 bits.
     67 * The Enable bits and TimeWindow bits are also shifted as a result.
     68 */
     69#define PSYS_POWER_LIMIT1_MASK       0x1FFFF
     70#define PSYS_POWER_LIMIT1_ENABLE     BIT(17)
     71
     72#define PSYS_POWER_LIMIT2_MASK       (0x1FFFFULL<<32)
     73#define PSYS_POWER_LIMIT2_ENABLE     BIT_ULL(49)
     74
     75#define PSYS_TIME_WINDOW1_MASK       (0x7FULL<<19)
     76#define PSYS_TIME_WINDOW2_MASK       (0x7FULL<<51)
     77
     78/* Non HW constants */
     79#define RAPL_PRIMITIVE_DERIVED       BIT(1)	/* not from raw data */
     80#define RAPL_PRIMITIVE_DUMMY         BIT(2)
     81
     82#define TIME_WINDOW_MAX_MSEC 40000
     83#define TIME_WINDOW_MIN_MSEC 250
     84#define ENERGY_UNIT_SCALE    1000	/* scale from driver unit to powercap unit */
     85enum unit_type {
     86	ARBITRARY_UNIT,		/* no translation */
     87	POWER_UNIT,
     88	ENERGY_UNIT,
     89	TIME_UNIT,
     90};
     91
     92/* per domain data, some are optional */
     93#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
     94
     95#define	DOMAIN_STATE_INACTIVE           BIT(0)
     96#define	DOMAIN_STATE_POWER_LIMIT_SET    BIT(1)
     97#define DOMAIN_STATE_BIOS_LOCKED        BIT(2)
     98
     99static const char pl1_name[] = "long_term";
    100static const char pl2_name[] = "short_term";
    101static const char pl4_name[] = "peak_power";
    102
    103#define power_zone_to_rapl_domain(_zone) \
    104	container_of(_zone, struct rapl_domain, power_zone)
    105
    106struct rapl_defaults {
    107	u8 floor_freq_reg_addr;
    108	int (*check_unit)(struct rapl_package *rp, int cpu);
    109	void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
    110	u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
    111				    bool to_raw);
    112	unsigned int dram_domain_energy_unit;
    113	unsigned int psys_domain_energy_unit;
    114	bool spr_psys_bits;
    115};
    116static struct rapl_defaults *rapl_defaults;
    117
    118/* Sideband MBI registers */
    119#define IOSF_CPU_POWER_BUDGET_CTL_BYT (0x2)
    120#define IOSF_CPU_POWER_BUDGET_CTL_TNG (0xdf)
    121
    122#define PACKAGE_PLN_INT_SAVED   BIT(0)
    123#define MAX_PRIM_NAME (32)
    124
    125/* per domain data. used to describe individual knobs such that access function
    126 * can be consolidated into one instead of many inline functions.
    127 */
    128struct rapl_primitive_info {
    129	const char *name;
    130	u64 mask;
    131	int shift;
    132	enum rapl_domain_reg_id id;
    133	enum unit_type unit;
    134	u32 flag;
    135};
    136
    137#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) {	\
    138		.name = #p,			\
    139		.mask = m,			\
    140		.shift = s,			\
    141		.id = i,			\
    142		.unit = u,			\
    143		.flag = f			\
    144	}
    145
    146static void rapl_init_domains(struct rapl_package *rp);
    147static int rapl_read_data_raw(struct rapl_domain *rd,
    148			      enum rapl_primitives prim,
    149			      bool xlate, u64 *data);
    150static int rapl_write_data_raw(struct rapl_domain *rd,
    151			       enum rapl_primitives prim,
    152			       unsigned long long value);
    153static u64 rapl_unit_xlate(struct rapl_domain *rd,
    154			   enum unit_type type, u64 value, int to_raw);
    155static void package_power_limit_irq_save(struct rapl_package *rp);
    156
    157static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */
    158
    159static const char *const rapl_domain_names[] = {
    160	"package",
    161	"core",
    162	"uncore",
    163	"dram",
    164	"psys",
    165};
    166
    167static int get_energy_counter(struct powercap_zone *power_zone,
    168			      u64 *energy_raw)
    169{
    170	struct rapl_domain *rd;
    171	u64 energy_now;
    172
    173	/* prevent CPU hotplug, make sure the RAPL domain does not go
    174	 * away while reading the counter.
    175	 */
    176	cpus_read_lock();
    177	rd = power_zone_to_rapl_domain(power_zone);
    178
    179	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) {
    180		*energy_raw = energy_now;
    181		cpus_read_unlock();
    182
    183		return 0;
    184	}
    185	cpus_read_unlock();
    186
    187	return -EIO;
    188}
    189
    190static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
    191{
    192	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);
    193
    194	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
    195	return 0;
    196}
    197
    198static int release_zone(struct powercap_zone *power_zone)
    199{
    200	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
    201	struct rapl_package *rp = rd->rp;
    202
    203	/* package zone is the last zone of a package, we can free
    204	 * memory here since all children has been unregistered.
    205	 */
    206	if (rd->id == RAPL_DOMAIN_PACKAGE) {
    207		kfree(rd);
    208		rp->domains = NULL;
    209	}
    210
    211	return 0;
    212
    213}
    214
    215static int find_nr_power_limit(struct rapl_domain *rd)
    216{
    217	int i, nr_pl = 0;
    218
    219	for (i = 0; i < NR_POWER_LIMITS; i++) {
    220		if (rd->rpl[i].name)
    221			nr_pl++;
    222	}
    223
    224	return nr_pl;
    225}
    226
    227static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
    228{
    229	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
    230
    231	if (rd->state & DOMAIN_STATE_BIOS_LOCKED)
    232		return -EACCES;
    233
    234	cpus_read_lock();
    235	rapl_write_data_raw(rd, PL1_ENABLE, mode);
    236	if (rapl_defaults->set_floor_freq)
    237		rapl_defaults->set_floor_freq(rd, mode);
    238	cpus_read_unlock();
    239
    240	return 0;
    241}
    242
    243static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
    244{
    245	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
    246	u64 val;
    247
    248	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
    249		*mode = false;
    250		return 0;
    251	}
    252	cpus_read_lock();
    253	if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) {
    254		cpus_read_unlock();
    255		return -EIO;
    256	}
    257	*mode = val;
    258	cpus_read_unlock();
    259
    260	return 0;
    261}
    262
    263/* per RAPL domain ops, in the order of rapl_domain_type */
    264static const struct powercap_zone_ops zone_ops[] = {
    265	/* RAPL_DOMAIN_PACKAGE */
    266	{
    267	 .get_energy_uj = get_energy_counter,
    268	 .get_max_energy_range_uj = get_max_energy_counter,
    269	 .release = release_zone,
    270	 .set_enable = set_domain_enable,
    271	 .get_enable = get_domain_enable,
    272	 },
    273	/* RAPL_DOMAIN_PP0 */
    274	{
    275	 .get_energy_uj = get_energy_counter,
    276	 .get_max_energy_range_uj = get_max_energy_counter,
    277	 .release = release_zone,
    278	 .set_enable = set_domain_enable,
    279	 .get_enable = get_domain_enable,
    280	 },
    281	/* RAPL_DOMAIN_PP1 */
    282	{
    283	 .get_energy_uj = get_energy_counter,
    284	 .get_max_energy_range_uj = get_max_energy_counter,
    285	 .release = release_zone,
    286	 .set_enable = set_domain_enable,
    287	 .get_enable = get_domain_enable,
    288	 },
    289	/* RAPL_DOMAIN_DRAM */
    290	{
    291	 .get_energy_uj = get_energy_counter,
    292	 .get_max_energy_range_uj = get_max_energy_counter,
    293	 .release = release_zone,
    294	 .set_enable = set_domain_enable,
    295	 .get_enable = get_domain_enable,
    296	 },
    297	/* RAPL_DOMAIN_PLATFORM */
    298	{
    299	 .get_energy_uj = get_energy_counter,
    300	 .get_max_energy_range_uj = get_max_energy_counter,
    301	 .release = release_zone,
    302	 .set_enable = set_domain_enable,
    303	 .get_enable = get_domain_enable,
    304	 },
    305};
    306
    307/*
    308 * Constraint index used by powercap can be different than power limit (PL)
    309 * index in that some  PLs maybe missing due to non-existent MSRs. So we
    310 * need to convert here by finding the valid PLs only (name populated).
    311 */
    312static int contraint_to_pl(struct rapl_domain *rd, int cid)
    313{
    314	int i, j;
    315
    316	for (i = 0, j = 0; i < NR_POWER_LIMITS; i++) {
    317		if ((rd->rpl[i].name) && j++ == cid) {
    318			pr_debug("%s: index %d\n", __func__, i);
    319			return i;
    320		}
    321	}
    322	pr_err("Cannot find matching power limit for constraint %d\n", cid);
    323
    324	return -EINVAL;
    325}
    326
    327static int set_power_limit(struct powercap_zone *power_zone, int cid,
    328			   u64 power_limit)
    329{
    330	struct rapl_domain *rd;
    331	struct rapl_package *rp;
    332	int ret = 0;
    333	int id;
    334
    335	cpus_read_lock();
    336	rd = power_zone_to_rapl_domain(power_zone);
    337	id = contraint_to_pl(rd, cid);
    338	if (id < 0) {
    339		ret = id;
    340		goto set_exit;
    341	}
    342
    343	rp = rd->rp;
    344
    345	if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
    346		dev_warn(&power_zone->dev,
    347			 "%s locked by BIOS, monitoring only\n", rd->name);
    348		ret = -EACCES;
    349		goto set_exit;
    350	}
    351
    352	switch (rd->rpl[id].prim_id) {
    353	case PL1_ENABLE:
    354		rapl_write_data_raw(rd, POWER_LIMIT1, power_limit);
    355		break;
    356	case PL2_ENABLE:
    357		rapl_write_data_raw(rd, POWER_LIMIT2, power_limit);
    358		break;
    359	case PL4_ENABLE:
    360		rapl_write_data_raw(rd, POWER_LIMIT4, power_limit);
    361		break;
    362	default:
    363		ret = -EINVAL;
    364	}
    365	if (!ret)
    366		package_power_limit_irq_save(rp);
    367set_exit:
    368	cpus_read_unlock();
    369	return ret;
    370}
    371
    372static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
    373				   u64 *data)
    374{
    375	struct rapl_domain *rd;
    376	u64 val;
    377	int prim;
    378	int ret = 0;
    379	int id;
    380
    381	cpus_read_lock();
    382	rd = power_zone_to_rapl_domain(power_zone);
    383	id = contraint_to_pl(rd, cid);
    384	if (id < 0) {
    385		ret = id;
    386		goto get_exit;
    387	}
    388
    389	switch (rd->rpl[id].prim_id) {
    390	case PL1_ENABLE:
    391		prim = POWER_LIMIT1;
    392		break;
    393	case PL2_ENABLE:
    394		prim = POWER_LIMIT2;
    395		break;
    396	case PL4_ENABLE:
    397		prim = POWER_LIMIT4;
    398		break;
    399	default:
    400		cpus_read_unlock();
    401		return -EINVAL;
    402	}
    403	if (rapl_read_data_raw(rd, prim, true, &val))
    404		ret = -EIO;
    405	else
    406		*data = val;
    407
    408get_exit:
    409	cpus_read_unlock();
    410
    411	return ret;
    412}
    413
    414static int set_time_window(struct powercap_zone *power_zone, int cid,
    415			   u64 window)
    416{
    417	struct rapl_domain *rd;
    418	int ret = 0;
    419	int id;
    420
    421	cpus_read_lock();
    422	rd = power_zone_to_rapl_domain(power_zone);
    423	id = contraint_to_pl(rd, cid);
    424	if (id < 0) {
    425		ret = id;
    426		goto set_time_exit;
    427	}
    428
    429	switch (rd->rpl[id].prim_id) {
    430	case PL1_ENABLE:
    431		rapl_write_data_raw(rd, TIME_WINDOW1, window);
    432		break;
    433	case PL2_ENABLE:
    434		rapl_write_data_raw(rd, TIME_WINDOW2, window);
    435		break;
    436	default:
    437		ret = -EINVAL;
    438	}
    439
    440set_time_exit:
    441	cpus_read_unlock();
    442	return ret;
    443}
    444
    445static int get_time_window(struct powercap_zone *power_zone, int cid,
    446			   u64 *data)
    447{
    448	struct rapl_domain *rd;
    449	u64 val;
    450	int ret = 0;
    451	int id;
    452
    453	cpus_read_lock();
    454	rd = power_zone_to_rapl_domain(power_zone);
    455	id = contraint_to_pl(rd, cid);
    456	if (id < 0) {
    457		ret = id;
    458		goto get_time_exit;
    459	}
    460
    461	switch (rd->rpl[id].prim_id) {
    462	case PL1_ENABLE:
    463		ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val);
    464		break;
    465	case PL2_ENABLE:
    466		ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val);
    467		break;
    468	case PL4_ENABLE:
    469		/*
    470		 * Time window parameter is not applicable for PL4 entry
    471		 * so assigining '0' as default value.
    472		 */
    473		val = 0;
    474		break;
    475	default:
    476		cpus_read_unlock();
    477		return -EINVAL;
    478	}
    479	if (!ret)
    480		*data = val;
    481
    482get_time_exit:
    483	cpus_read_unlock();
    484
    485	return ret;
    486}
    487
    488static const char *get_constraint_name(struct powercap_zone *power_zone,
    489				       int cid)
    490{
    491	struct rapl_domain *rd;
    492	int id;
    493
    494	rd = power_zone_to_rapl_domain(power_zone);
    495	id = contraint_to_pl(rd, cid);
    496	if (id >= 0)
    497		return rd->rpl[id].name;
    498
    499	return NULL;
    500}
    501
    502static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
    503{
    504	struct rapl_domain *rd;
    505	u64 val;
    506	int prim;
    507	int ret = 0;
    508
    509	cpus_read_lock();
    510	rd = power_zone_to_rapl_domain(power_zone);
    511	switch (rd->rpl[id].prim_id) {
    512	case PL1_ENABLE:
    513		prim = THERMAL_SPEC_POWER;
    514		break;
    515	case PL2_ENABLE:
    516		prim = MAX_POWER;
    517		break;
    518	case PL4_ENABLE:
    519		prim = MAX_POWER;
    520		break;
    521	default:
    522		cpus_read_unlock();
    523		return -EINVAL;
    524	}
    525	if (rapl_read_data_raw(rd, prim, true, &val))
    526		ret = -EIO;
    527	else
    528		*data = val;
    529
    530	/* As a generalization rule, PL4 would be around two times PL2. */
    531	if (rd->rpl[id].prim_id == PL4_ENABLE)
    532		*data = *data * 2;
    533
    534	cpus_read_unlock();
    535
    536	return ret;
    537}
    538
    539static const struct powercap_zone_constraint_ops constraint_ops = {
    540	.set_power_limit_uw = set_power_limit,
    541	.get_power_limit_uw = get_current_power_limit,
    542	.set_time_window_us = set_time_window,
    543	.get_time_window_us = get_time_window,
    544	.get_max_power_uw = get_max_power,
    545	.get_name = get_constraint_name,
    546};
    547
    548/* called after domain detection and package level data are set */
    549static void rapl_init_domains(struct rapl_package *rp)
    550{
    551	enum rapl_domain_type i;
    552	enum rapl_domain_reg_id j;
    553	struct rapl_domain *rd = rp->domains;
    554
    555	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
    556		unsigned int mask = rp->domain_map & (1 << i);
    557
    558		if (!mask)
    559			continue;
    560
    561		rd->rp = rp;
    562
    563		if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) {
    564			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d",
    565				topology_physical_package_id(rp->lead_cpu));
    566		} else
    567			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s",
    568				rapl_domain_names[i]);
    569
    570		rd->id = i;
    571		rd->rpl[0].prim_id = PL1_ENABLE;
    572		rd->rpl[0].name = pl1_name;
    573
    574		/*
    575		 * The PL2 power domain is applicable for limits two
    576		 * and limits three
    577		 */
    578		if (rp->priv->limits[i] >= 2) {
    579			rd->rpl[1].prim_id = PL2_ENABLE;
    580			rd->rpl[1].name = pl2_name;
    581		}
    582
    583		/* Enable PL4 domain if the total power limits are three */
    584		if (rp->priv->limits[i] == 3) {
    585			rd->rpl[2].prim_id = PL4_ENABLE;
    586			rd->rpl[2].name = pl4_name;
    587		}
    588
    589		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
    590			rd->regs[j] = rp->priv->regs[i][j];
    591
    592		switch (i) {
    593		case RAPL_DOMAIN_DRAM:
    594			rd->domain_energy_unit =
    595			    rapl_defaults->dram_domain_energy_unit;
    596			if (rd->domain_energy_unit)
    597				pr_info("DRAM domain energy unit %dpj\n",
    598					rd->domain_energy_unit);
    599			break;
    600		case RAPL_DOMAIN_PLATFORM:
    601			rd->domain_energy_unit =
    602			    rapl_defaults->psys_domain_energy_unit;
    603			if (rd->domain_energy_unit)
    604				pr_info("Platform domain energy unit %dpj\n",
    605					rd->domain_energy_unit);
    606			break;
    607		default:
    608			break;
    609		}
    610		rd++;
    611	}
    612}
    613
    614static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
    615			   u64 value, int to_raw)
    616{
    617	u64 units = 1;
    618	struct rapl_package *rp = rd->rp;
    619	u64 scale = 1;
    620
    621	switch (type) {
    622	case POWER_UNIT:
    623		units = rp->power_unit;
    624		break;
    625	case ENERGY_UNIT:
    626		scale = ENERGY_UNIT_SCALE;
    627		/* per domain unit takes precedence */
    628		if (rd->domain_energy_unit)
    629			units = rd->domain_energy_unit;
    630		else
    631			units = rp->energy_unit;
    632		break;
    633	case TIME_UNIT:
    634		return rapl_defaults->compute_time_window(rp, value, to_raw);
    635	case ARBITRARY_UNIT:
    636	default:
    637		return value;
    638	}
    639
    640	if (to_raw)
    641		return div64_u64(value, units) * scale;
    642
    643	value *= units;
    644
    645	return div64_u64(value, scale);
    646}
    647
    648/* in the order of enum rapl_primitives */
    649static struct rapl_primitive_info rpi[] = {
    650	/* name, mask, shift, msr index, unit divisor */
    651	PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
    652			    RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
    653	PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
    654			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    655	PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
    656			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    657	PRIMITIVE_INFO_INIT(POWER_LIMIT4, POWER_LIMIT4_MASK, 0,
    658				RAPL_DOMAIN_REG_PL4, POWER_UNIT, 0),
    659	PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
    660			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    661	PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
    662			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    663	PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
    664			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    665	PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
    666			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    667	PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
    668			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    669	PRIMITIVE_INFO_INIT(PL4_ENABLE, POWER_LIMIT4_MASK, 0,
    670				RAPL_DOMAIN_REG_PL4, ARBITRARY_UNIT, 0),
    671	PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
    672			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    673	PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
    674			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    675	PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
    676			    0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
    677	PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
    678			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
    679	PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
    680			    RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
    681	PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
    682			    RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
    683	PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
    684			    RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
    685	PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
    686			    RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
    687	PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT1, PSYS_POWER_LIMIT1_MASK, 0,
    688			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    689	PRIMITIVE_INFO_INIT(PSYS_POWER_LIMIT2, PSYS_POWER_LIMIT2_MASK, 32,
    690			    RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
    691	PRIMITIVE_INFO_INIT(PSYS_PL1_ENABLE, PSYS_POWER_LIMIT1_ENABLE, 17,
    692			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    693	PRIMITIVE_INFO_INIT(PSYS_PL2_ENABLE, PSYS_POWER_LIMIT2_ENABLE, 49,
    694			    RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
    695	PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW1, PSYS_TIME_WINDOW1_MASK, 19,
    696			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    697	PRIMITIVE_INFO_INIT(PSYS_TIME_WINDOW2, PSYS_TIME_WINDOW2_MASK, 51,
    698			    RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
    699	/* non-hardware */
    700	PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
    701			    RAPL_PRIMITIVE_DERIVED),
    702	{NULL, 0, 0, 0},
    703};
    704
    705static enum rapl_primitives
    706prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
    707{
    708	if (!rapl_defaults->spr_psys_bits)
    709		return prim;
    710
    711	if (rd->id != RAPL_DOMAIN_PLATFORM)
    712		return prim;
    713
    714	switch (prim) {
    715	case POWER_LIMIT1:
    716		return PSYS_POWER_LIMIT1;
    717	case POWER_LIMIT2:
    718		return PSYS_POWER_LIMIT2;
    719	case PL1_ENABLE:
    720		return PSYS_PL1_ENABLE;
    721	case PL2_ENABLE:
    722		return PSYS_PL2_ENABLE;
    723	case TIME_WINDOW1:
    724		return PSYS_TIME_WINDOW1;
    725	case TIME_WINDOW2:
    726		return PSYS_TIME_WINDOW2;
    727	default:
    728		return prim;
    729	}
    730}
    731
    732/* Read primitive data based on its related struct rapl_primitive_info.
    733 * if xlate flag is set, return translated data based on data units, i.e.
    734 * time, energy, and power.
    735 * RAPL MSRs are non-architectual and are laid out not consistently across
    736 * domains. Here we use primitive info to allow writing consolidated access
    737 * functions.
    738 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
    739 * is pre-assigned based on RAPL unit MSRs read at init time.
    740 * 63-------------------------- 31--------------------------- 0
    741 * |                           xxxxx (mask)                   |
    742 * |                                |<- shift ----------------|
    743 * 63-------------------------- 31--------------------------- 0
    744 */
    745static int rapl_read_data_raw(struct rapl_domain *rd,
    746			      enum rapl_primitives prim, bool xlate, u64 *data)
    747{
    748	u64 value;
    749	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
    750	struct rapl_primitive_info *rp = &rpi[prim_fixed];
    751	struct reg_action ra;
    752	int cpu;
    753
    754	if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
    755		return -EINVAL;
    756
    757	ra.reg = rd->regs[rp->id];
    758	if (!ra.reg)
    759		return -EINVAL;
    760
    761	cpu = rd->rp->lead_cpu;
    762
    763	/* domain with 2 limits has different bit */
    764	if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
    765		rp->mask = POWER_HIGH_LOCK;
    766		rp->shift = 63;
    767	}
    768	/* non-hardware data are collected by the polling thread */
    769	if (rp->flag & RAPL_PRIMITIVE_DERIVED) {
    770		*data = rd->rdd.primitives[prim];
    771		return 0;
    772	}
    773
    774	ra.mask = rp->mask;
    775
    776	if (rd->rp->priv->read_raw(cpu, &ra)) {
    777		pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
    778		return -EIO;
    779	}
    780
    781	value = ra.value >> rp->shift;
    782
    783	if (xlate)
    784		*data = rapl_unit_xlate(rd, rp->unit, value, 0);
    785	else
    786		*data = value;
    787
    788	return 0;
    789}
    790
    791/* Similar use of primitive info in the read counterpart */
    792static int rapl_write_data_raw(struct rapl_domain *rd,
    793			       enum rapl_primitives prim,
    794			       unsigned long long value)
    795{
    796	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
    797	struct rapl_primitive_info *rp = &rpi[prim_fixed];
    798	int cpu;
    799	u64 bits;
    800	struct reg_action ra;
    801	int ret;
    802
    803	cpu = rd->rp->lead_cpu;
    804	bits = rapl_unit_xlate(rd, rp->unit, value, 1);
    805	bits <<= rp->shift;
    806	bits &= rp->mask;
    807
    808	memset(&ra, 0, sizeof(ra));
    809
    810	ra.reg = rd->regs[rp->id];
    811	ra.mask = rp->mask;
    812	ra.value = bits;
    813
    814	ret = rd->rp->priv->write_raw(cpu, &ra);
    815
    816	return ret;
    817}
    818
    819/*
    820 * Raw RAPL data stored in MSRs are in certain scales. We need to
    821 * convert them into standard units based on the units reported in
    822 * the RAPL unit MSRs. This is specific to CPUs as the method to
    823 * calculate units differ on different CPUs.
    824 * We convert the units to below format based on CPUs.
    825 * i.e.
    826 * energy unit: picoJoules  : Represented in picoJoules by default
    827 * power unit : microWatts  : Represented in milliWatts by default
    828 * time unit  : microseconds: Represented in seconds by default
    829 */
    830static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
    831{
    832	struct reg_action ra;
    833	u32 value;
    834
    835	ra.reg = rp->priv->reg_unit;
    836	ra.mask = ~0;
    837	if (rp->priv->read_raw(cpu, &ra)) {
    838		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
    839		       rp->priv->reg_unit, cpu);
    840		return -ENODEV;
    841	}
    842
    843	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
    844	rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
    845
    846	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
    847	rp->power_unit = 1000000 / (1 << value);
    848
    849	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
    850	rp->time_unit = 1000000 / (1 << value);
    851
    852	pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
    853		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
    854
    855	return 0;
    856}
    857
    858static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
    859{
    860	struct reg_action ra;
    861	u32 value;
    862
    863	ra.reg = rp->priv->reg_unit;
    864	ra.mask = ~0;
    865	if (rp->priv->read_raw(cpu, &ra)) {
    866		pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
    867		       rp->priv->reg_unit, cpu);
    868		return -ENODEV;
    869	}
    870
    871	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
    872	rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
    873
    874	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
    875	rp->power_unit = (1 << value) * 1000;
    876
    877	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
    878	rp->time_unit = 1000000 / (1 << value);
    879
    880	pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
    881		 rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
    882
    883	return 0;
    884}
    885
    886static void power_limit_irq_save_cpu(void *info)
    887{
    888	u32 l, h = 0;
    889	struct rapl_package *rp = (struct rapl_package *)info;
    890
    891	/* save the state of PLN irq mask bit before disabling it */
    892	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
    893	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
    894		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
    895		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
    896	}
    897	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
    898	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
    899}
    900
    901/* REVISIT:
    902 * When package power limit is set artificially low by RAPL, LVT
    903 * thermal interrupt for package power limit should be ignored
    904 * since we are not really exceeding the real limit. The intention
    905 * is to avoid excessive interrupts while we are trying to save power.
    906 * A useful feature might be routing the package_power_limit interrupt
    907 * to userspace via eventfd. once we have a usecase, this is simple
    908 * to do by adding an atomic notifier.
    909 */
    910
    911static void package_power_limit_irq_save(struct rapl_package *rp)
    912{
    913	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
    914		return;
    915
    916	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
    917}
    918
    919/*
    920 * Restore per package power limit interrupt enable state. Called from cpu
    921 * hotplug code on package removal.
    922 */
    923static void package_power_limit_irq_restore(struct rapl_package *rp)
    924{
    925	u32 l, h;
    926
    927	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
    928		return;
    929
    930	/* irq enable state not saved, nothing to restore */
    931	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
    932		return;
    933
    934	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
    935
    936	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
    937		l |= PACKAGE_THERM_INT_PLN_ENABLE;
    938	else
    939		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
    940
    941	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
    942}
    943
    944static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
    945{
    946	int nr_powerlimit = find_nr_power_limit(rd);
    947
    948	/* always enable clamp such that p-state can go below OS requested
    949	 * range. power capping priority over guranteed frequency.
    950	 */
    951	rapl_write_data_raw(rd, PL1_CLAMP, mode);
    952
    953	/* some domains have pl2 */
    954	if (nr_powerlimit > 1) {
    955		rapl_write_data_raw(rd, PL2_ENABLE, mode);
    956		rapl_write_data_raw(rd, PL2_CLAMP, mode);
    957	}
    958}
    959
    960static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
    961{
    962	static u32 power_ctrl_orig_val;
    963	u32 mdata;
    964
    965	if (!rapl_defaults->floor_freq_reg_addr) {
    966		pr_err("Invalid floor frequency config register\n");
    967		return;
    968	}
    969
    970	if (!power_ctrl_orig_val)
    971		iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_CR_READ,
    972			      rapl_defaults->floor_freq_reg_addr,
    973			      &power_ctrl_orig_val);
    974	mdata = power_ctrl_orig_val;
    975	if (enable) {
    976		mdata &= ~(0x7f << 8);
    977		mdata |= 1 << 8;
    978	}
    979	iosf_mbi_write(BT_MBI_UNIT_PMC, MBI_CR_WRITE,
    980		       rapl_defaults->floor_freq_reg_addr, mdata);
    981}
    982
    983static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
    984					 bool to_raw)
    985{
    986	u64 f, y;		/* fraction and exp. used for time unit */
    987
    988	/*
    989	 * Special processing based on 2^Y*(1+F/4), refer
    990	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
    991	 */
    992	if (!to_raw) {
    993		f = (value & 0x60) >> 5;
    994		y = value & 0x1f;
    995		value = (1 << y) * (4 + f) * rp->time_unit / 4;
    996	} else {
    997		do_div(value, rp->time_unit);
    998		y = ilog2(value);
    999		f = div64_u64(4 * (value - (1 << y)), 1 << y);
   1000		value = (y & 0x1f) | ((f & 0x3) << 5);
   1001	}
   1002	return value;
   1003}
   1004
   1005static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
   1006					 bool to_raw)
   1007{
   1008	/*
   1009	 * Atom time unit encoding is straight forward val * time_unit,
   1010	 * where time_unit is default to 1 sec. Never 0.
   1011	 */
   1012	if (!to_raw)
   1013		return (value) ? value * rp->time_unit : rp->time_unit;
   1014
   1015	value = div64_u64(value, rp->time_unit);
   1016
   1017	return value;
   1018}
   1019
   1020static const struct rapl_defaults rapl_defaults_core = {
   1021	.floor_freq_reg_addr = 0,
   1022	.check_unit = rapl_check_unit_core,
   1023	.set_floor_freq = set_floor_freq_default,
   1024	.compute_time_window = rapl_compute_time_window_core,
   1025};
   1026
   1027static const struct rapl_defaults rapl_defaults_hsw_server = {
   1028	.check_unit = rapl_check_unit_core,
   1029	.set_floor_freq = set_floor_freq_default,
   1030	.compute_time_window = rapl_compute_time_window_core,
   1031	.dram_domain_energy_unit = 15300,
   1032};
   1033
   1034static const struct rapl_defaults rapl_defaults_spr_server = {
   1035	.check_unit = rapl_check_unit_core,
   1036	.set_floor_freq = set_floor_freq_default,
   1037	.compute_time_window = rapl_compute_time_window_core,
   1038	.dram_domain_energy_unit = 15300,
   1039	.psys_domain_energy_unit = 1000000000,
   1040	.spr_psys_bits = true,
   1041};
   1042
   1043static const struct rapl_defaults rapl_defaults_byt = {
   1044	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT,
   1045	.check_unit = rapl_check_unit_atom,
   1046	.set_floor_freq = set_floor_freq_atom,
   1047	.compute_time_window = rapl_compute_time_window_atom,
   1048};
   1049
   1050static const struct rapl_defaults rapl_defaults_tng = {
   1051	.floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_TNG,
   1052	.check_unit = rapl_check_unit_atom,
   1053	.set_floor_freq = set_floor_freq_atom,
   1054	.compute_time_window = rapl_compute_time_window_atom,
   1055};
   1056
   1057static const struct rapl_defaults rapl_defaults_ann = {
   1058	.floor_freq_reg_addr = 0,
   1059	.check_unit = rapl_check_unit_atom,
   1060	.set_floor_freq = NULL,
   1061	.compute_time_window = rapl_compute_time_window_atom,
   1062};
   1063
   1064static const struct rapl_defaults rapl_defaults_cht = {
   1065	.floor_freq_reg_addr = 0,
   1066	.check_unit = rapl_check_unit_atom,
   1067	.set_floor_freq = NULL,
   1068	.compute_time_window = rapl_compute_time_window_atom,
   1069};
   1070
   1071static const struct rapl_defaults rapl_defaults_amd = {
   1072	.check_unit = rapl_check_unit_core,
   1073};
   1074
   1075static const struct x86_cpu_id rapl_ids[] __initconst = {
   1076	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,		&rapl_defaults_core),
   1077	X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,	&rapl_defaults_core),
   1078
   1079	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,		&rapl_defaults_core),
   1080	X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,		&rapl_defaults_core),
   1081
   1082	X86_MATCH_INTEL_FAM6_MODEL(HASWELL,		&rapl_defaults_core),
   1083	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,		&rapl_defaults_core),
   1084	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,		&rapl_defaults_core),
   1085	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,		&rapl_defaults_hsw_server),
   1086
   1087	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,		&rapl_defaults_core),
   1088	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,		&rapl_defaults_core),
   1089	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,		&rapl_defaults_core),
   1090	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,		&rapl_defaults_hsw_server),
   1091
   1092	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,		&rapl_defaults_core),
   1093	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,		&rapl_defaults_core),
   1094	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&rapl_defaults_hsw_server),
   1095	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&rapl_defaults_core),
   1096	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&rapl_defaults_core),
   1097	X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,	&rapl_defaults_core),
   1098	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&rapl_defaults_core),
   1099	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&rapl_defaults_core),
   1100	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&rapl_defaults_core),
   1101	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&rapl_defaults_hsw_server),
   1102	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&rapl_defaults_hsw_server),
   1103	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&rapl_defaults_core),
   1104	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&rapl_defaults_core),
   1105	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&rapl_defaults_core),
   1106	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&rapl_defaults_core),
   1107	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rapl_defaults_core),
   1108	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&rapl_defaults_core),
   1109	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&rapl_defaults_core),
   1110	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,		&rapl_defaults_core),
   1111	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&rapl_defaults_core),
   1112	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&rapl_defaults_spr_server),
   1113	X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,		&rapl_defaults_core),
   1114
   1115	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT,	&rapl_defaults_byt),
   1116	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT,	&rapl_defaults_cht),
   1117	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&rapl_defaults_tng),
   1118	X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID,	&rapl_defaults_ann),
   1119	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	&rapl_defaults_core),
   1120	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,	&rapl_defaults_core),
   1121	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,	&rapl_defaults_core),
   1122	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,	&rapl_defaults_core),
   1123	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&rapl_defaults_core),
   1124	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,	&rapl_defaults_core),
   1125
   1126	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&rapl_defaults_hsw_server),
   1127	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&rapl_defaults_hsw_server),
   1128
   1129	X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
   1130	X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
   1131	X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
   1132	{}
   1133};
   1134MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
   1135
   1136/* Read once for all raw primitive data for domains */
   1137static void rapl_update_domain_data(struct rapl_package *rp)
   1138{
   1139	int dmn, prim;
   1140	u64 val;
   1141
   1142	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
   1143		pr_debug("update %s domain %s data\n", rp->name,
   1144			 rp->domains[dmn].name);
   1145		/* exclude non-raw primitives */
   1146		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
   1147			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
   1148						rpi[prim].unit, &val))
   1149				rp->domains[dmn].rdd.primitives[prim] = val;
   1150		}
   1151	}
   1152
   1153}
   1154
   1155static int rapl_package_register_powercap(struct rapl_package *rp)
   1156{
   1157	struct rapl_domain *rd;
   1158	struct powercap_zone *power_zone = NULL;
   1159	int nr_pl, ret;
   1160
   1161	/* Update the domain data of the new package */
   1162	rapl_update_domain_data(rp);
   1163
   1164	/* first we register package domain as the parent zone */
   1165	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
   1166		if (rd->id == RAPL_DOMAIN_PACKAGE) {
   1167			nr_pl = find_nr_power_limit(rd);
   1168			pr_debug("register package domain %s\n", rp->name);
   1169			power_zone = powercap_register_zone(&rd->power_zone,
   1170					    rp->priv->control_type, rp->name,
   1171					    NULL, &zone_ops[rd->id], nr_pl,
   1172					    &constraint_ops);
   1173			if (IS_ERR(power_zone)) {
   1174				pr_debug("failed to register power zone %s\n",
   1175					 rp->name);
   1176				return PTR_ERR(power_zone);
   1177			}
   1178			/* track parent zone in per package/socket data */
   1179			rp->power_zone = power_zone;
   1180			/* done, only one package domain per socket */
   1181			break;
   1182		}
   1183	}
   1184	if (!power_zone) {
   1185		pr_err("no package domain found, unknown topology!\n");
   1186		return -ENODEV;
   1187	}
   1188	/* now register domains as children of the socket/package */
   1189	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
   1190		struct powercap_zone *parent = rp->power_zone;
   1191
   1192		if (rd->id == RAPL_DOMAIN_PACKAGE)
   1193			continue;
   1194		if (rd->id == RAPL_DOMAIN_PLATFORM)
   1195			parent = NULL;
   1196		/* number of power limits per domain varies */
   1197		nr_pl = find_nr_power_limit(rd);
   1198		power_zone = powercap_register_zone(&rd->power_zone,
   1199						    rp->priv->control_type,
   1200						    rd->name, parent,
   1201						    &zone_ops[rd->id], nr_pl,
   1202						    &constraint_ops);
   1203
   1204		if (IS_ERR(power_zone)) {
   1205			pr_debug("failed to register power_zone, %s:%s\n",
   1206				 rp->name, rd->name);
   1207			ret = PTR_ERR(power_zone);
   1208			goto err_cleanup;
   1209		}
   1210	}
   1211	return 0;
   1212
   1213err_cleanup:
   1214	/*
   1215	 * Clean up previously initialized domains within the package if we
   1216	 * failed after the first domain setup.
   1217	 */
   1218	while (--rd >= rp->domains) {
   1219		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
   1220		powercap_unregister_zone(rp->priv->control_type,
   1221					 &rd->power_zone);
   1222	}
   1223
   1224	return ret;
   1225}
   1226
   1227static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
   1228{
   1229	struct reg_action ra;
   1230
   1231	switch (domain) {
   1232	case RAPL_DOMAIN_PACKAGE:
   1233	case RAPL_DOMAIN_PP0:
   1234	case RAPL_DOMAIN_PP1:
   1235	case RAPL_DOMAIN_DRAM:
   1236	case RAPL_DOMAIN_PLATFORM:
   1237		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
   1238		break;
   1239	default:
   1240		pr_err("invalid domain id %d\n", domain);
   1241		return -EINVAL;
   1242	}
   1243	/* make sure domain counters are available and contains non-zero
   1244	 * values, otherwise skip it.
   1245	 */
   1246
   1247	ra.mask = ENERGY_STATUS_MASK;
   1248	if (rp->priv->read_raw(cpu, &ra) || !ra.value)
   1249		return -ENODEV;
   1250
   1251	return 0;
   1252}
   1253
   1254/*
   1255 * Check if power limits are available. Two cases when they are not available:
   1256 * 1. Locked by BIOS, in this case we still provide read-only access so that
   1257 *    users can see what limit is set by the BIOS.
   1258 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
   1259 *    exist at all. In this case, we do not show the constraints in powercap.
   1260 *
   1261 * Called after domains are detected and initialized.
   1262 */
   1263static void rapl_detect_powerlimit(struct rapl_domain *rd)
   1264{
   1265	u64 val64;
   1266	int i;
   1267
   1268	/* check if the domain is locked by BIOS, ignore if MSR doesn't exist */
   1269	if (!rapl_read_data_raw(rd, FW_LOCK, false, &val64)) {
   1270		if (val64) {
   1271			pr_info("RAPL %s domain %s locked by BIOS\n",
   1272				rd->rp->name, rd->name);
   1273			rd->state |= DOMAIN_STATE_BIOS_LOCKED;
   1274		}
   1275	}
   1276	/* check if power limit MSR exists, otherwise domain is monitoring only */
   1277	for (i = 0; i < NR_POWER_LIMITS; i++) {
   1278		int prim = rd->rpl[i].prim_id;
   1279
   1280		if (rapl_read_data_raw(rd, prim, false, &val64))
   1281			rd->rpl[i].name = NULL;
   1282	}
   1283}
   1284
   1285/* Detect active and valid domains for the given CPU, caller must
   1286 * ensure the CPU belongs to the targeted package and CPU hotlug is disabled.
   1287 */
   1288static int rapl_detect_domains(struct rapl_package *rp, int cpu)
   1289{
   1290	struct rapl_domain *rd;
   1291	int i;
   1292
   1293	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
   1294		/* use physical package id to read counters */
   1295		if (!rapl_check_domain(cpu, i, rp)) {
   1296			rp->domain_map |= 1 << i;
   1297			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
   1298		}
   1299	}
   1300	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
   1301	if (!rp->nr_domains) {
   1302		pr_debug("no valid rapl domains found in %s\n", rp->name);
   1303		return -ENODEV;
   1304	}
   1305	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
   1306
   1307	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
   1308			      GFP_KERNEL);
   1309	if (!rp->domains)
   1310		return -ENOMEM;
   1311
   1312	rapl_init_domains(rp);
   1313
   1314	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
   1315		rapl_detect_powerlimit(rd);
   1316
   1317	return 0;
   1318}
   1319
   1320/* called from CPU hotplug notifier, hotplug lock held */
   1321void rapl_remove_package(struct rapl_package *rp)
   1322{
   1323	struct rapl_domain *rd, *rd_package = NULL;
   1324
   1325	package_power_limit_irq_restore(rp);
   1326
   1327	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
   1328		rapl_write_data_raw(rd, PL1_ENABLE, 0);
   1329		rapl_write_data_raw(rd, PL1_CLAMP, 0);
   1330		if (find_nr_power_limit(rd) > 1) {
   1331			rapl_write_data_raw(rd, PL2_ENABLE, 0);
   1332			rapl_write_data_raw(rd, PL2_CLAMP, 0);
   1333			rapl_write_data_raw(rd, PL4_ENABLE, 0);
   1334		}
   1335		if (rd->id == RAPL_DOMAIN_PACKAGE) {
   1336			rd_package = rd;
   1337			continue;
   1338		}
   1339		pr_debug("remove package, undo power limit on %s: %s\n",
   1340			 rp->name, rd->name);
   1341		powercap_unregister_zone(rp->priv->control_type,
   1342					 &rd->power_zone);
   1343	}
   1344	/* do parent zone last */
   1345	powercap_unregister_zone(rp->priv->control_type,
   1346				 &rd_package->power_zone);
   1347	list_del(&rp->plist);
   1348	kfree(rp);
   1349}
   1350EXPORT_SYMBOL_GPL(rapl_remove_package);
   1351
   1352/* caller to ensure CPU hotplug lock is held */
   1353struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
   1354{
   1355	int id = topology_logical_die_id(cpu);
   1356	struct rapl_package *rp;
   1357
   1358	list_for_each_entry(rp, &rapl_packages, plist) {
   1359		if (rp->id == id
   1360		    && rp->priv->control_type == priv->control_type)
   1361			return rp;
   1362	}
   1363
   1364	return NULL;
   1365}
   1366EXPORT_SYMBOL_GPL(rapl_find_package_domain);
   1367
   1368/* called from CPU hotplug notifier, hotplug lock held */
   1369struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
   1370{
   1371	int id = topology_logical_die_id(cpu);
   1372	struct rapl_package *rp;
   1373	int ret;
   1374
   1375	if (!rapl_defaults)
   1376		return ERR_PTR(-ENODEV);
   1377
   1378	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
   1379	if (!rp)
   1380		return ERR_PTR(-ENOMEM);
   1381
   1382	/* add the new package to the list */
   1383	rp->id = id;
   1384	rp->lead_cpu = cpu;
   1385	rp->priv = priv;
   1386
   1387	if (topology_max_die_per_package() > 1)
   1388		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
   1389			 "package-%d-die-%d",
   1390			 topology_physical_package_id(cpu), topology_die_id(cpu));
   1391	else
   1392		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
   1393			 topology_physical_package_id(cpu));
   1394
   1395	/* check if the package contains valid domains */
   1396	if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
   1397		ret = -ENODEV;
   1398		goto err_free_package;
   1399	}
   1400	ret = rapl_package_register_powercap(rp);
   1401	if (!ret) {
   1402		INIT_LIST_HEAD(&rp->plist);
   1403		list_add(&rp->plist, &rapl_packages);
   1404		return rp;
   1405	}
   1406
   1407err_free_package:
   1408	kfree(rp->domains);
   1409	kfree(rp);
   1410	return ERR_PTR(ret);
   1411}
   1412EXPORT_SYMBOL_GPL(rapl_add_package);
   1413
   1414static void power_limit_state_save(void)
   1415{
   1416	struct rapl_package *rp;
   1417	struct rapl_domain *rd;
   1418	int nr_pl, ret, i;
   1419
   1420	cpus_read_lock();
   1421	list_for_each_entry(rp, &rapl_packages, plist) {
   1422		if (!rp->power_zone)
   1423			continue;
   1424		rd = power_zone_to_rapl_domain(rp->power_zone);
   1425		nr_pl = find_nr_power_limit(rd);
   1426		for (i = 0; i < nr_pl; i++) {
   1427			switch (rd->rpl[i].prim_id) {
   1428			case PL1_ENABLE:
   1429				ret = rapl_read_data_raw(rd,
   1430						 POWER_LIMIT1, true,
   1431						 &rd->rpl[i].last_power_limit);
   1432				if (ret)
   1433					rd->rpl[i].last_power_limit = 0;
   1434				break;
   1435			case PL2_ENABLE:
   1436				ret = rapl_read_data_raw(rd,
   1437						 POWER_LIMIT2, true,
   1438						 &rd->rpl[i].last_power_limit);
   1439				if (ret)
   1440					rd->rpl[i].last_power_limit = 0;
   1441				break;
   1442			case PL4_ENABLE:
   1443				ret = rapl_read_data_raw(rd,
   1444						 POWER_LIMIT4, true,
   1445						 &rd->rpl[i].last_power_limit);
   1446				if (ret)
   1447					rd->rpl[i].last_power_limit = 0;
   1448				break;
   1449			}
   1450		}
   1451	}
   1452	cpus_read_unlock();
   1453}
   1454
   1455static void power_limit_state_restore(void)
   1456{
   1457	struct rapl_package *rp;
   1458	struct rapl_domain *rd;
   1459	int nr_pl, i;
   1460
   1461	cpus_read_lock();
   1462	list_for_each_entry(rp, &rapl_packages, plist) {
   1463		if (!rp->power_zone)
   1464			continue;
   1465		rd = power_zone_to_rapl_domain(rp->power_zone);
   1466		nr_pl = find_nr_power_limit(rd);
   1467		for (i = 0; i < nr_pl; i++) {
   1468			switch (rd->rpl[i].prim_id) {
   1469			case PL1_ENABLE:
   1470				if (rd->rpl[i].last_power_limit)
   1471					rapl_write_data_raw(rd, POWER_LIMIT1,
   1472					    rd->rpl[i].last_power_limit);
   1473				break;
   1474			case PL2_ENABLE:
   1475				if (rd->rpl[i].last_power_limit)
   1476					rapl_write_data_raw(rd, POWER_LIMIT2,
   1477					    rd->rpl[i].last_power_limit);
   1478				break;
   1479			case PL4_ENABLE:
   1480				if (rd->rpl[i].last_power_limit)
   1481					rapl_write_data_raw(rd, POWER_LIMIT4,
   1482					    rd->rpl[i].last_power_limit);
   1483				break;
   1484			}
   1485		}
   1486	}
   1487	cpus_read_unlock();
   1488}
   1489
   1490static int rapl_pm_callback(struct notifier_block *nb,
   1491			    unsigned long mode, void *_unused)
   1492{
   1493	switch (mode) {
   1494	case PM_SUSPEND_PREPARE:
   1495		power_limit_state_save();
   1496		break;
   1497	case PM_POST_SUSPEND:
   1498		power_limit_state_restore();
   1499		break;
   1500	}
   1501	return NOTIFY_OK;
   1502}
   1503
   1504static struct notifier_block rapl_pm_notifier = {
   1505	.notifier_call = rapl_pm_callback,
   1506};
   1507
   1508static struct platform_device *rapl_msr_platdev;
   1509
   1510static int __init rapl_init(void)
   1511{
   1512	const struct x86_cpu_id *id;
   1513	int ret;
   1514
   1515	id = x86_match_cpu(rapl_ids);
   1516	if (!id) {
   1517		pr_err("driver does not support CPU family %d model %d\n",
   1518		       boot_cpu_data.x86, boot_cpu_data.x86_model);
   1519
   1520		return -ENODEV;
   1521	}
   1522
   1523	rapl_defaults = (struct rapl_defaults *)id->driver_data;
   1524
   1525	ret = register_pm_notifier(&rapl_pm_notifier);
   1526	if (ret)
   1527		return ret;
   1528
   1529	rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
   1530	if (!rapl_msr_platdev) {
   1531		ret = -ENOMEM;
   1532		goto end;
   1533	}
   1534
   1535	ret = platform_device_add(rapl_msr_platdev);
   1536	if (ret)
   1537		platform_device_put(rapl_msr_platdev);
   1538
   1539end:
   1540	if (ret)
   1541		unregister_pm_notifier(&rapl_pm_notifier);
   1542
   1543	return ret;
   1544}
   1545
   1546static void __exit rapl_exit(void)
   1547{
   1548	platform_device_unregister(rapl_msr_platdev);
   1549	unregister_pm_notifier(&rapl_pm_notifier);
   1550}
   1551
   1552fs_initcall(rapl_init);
   1553module_exit(rapl_exit);
   1554
   1555MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
   1556MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
   1557MODULE_LICENSE("GPL v2");