cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

x86_pkg_temp_thermal.c (14197B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * x86_pkg_temp_thermal driver
      4 * Copyright (c) 2013, Intel Corporation.
      5 */
      6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      7
      8#include <linux/module.h>
      9#include <linux/init.h>
     10#include <linux/err.h>
     11#include <linux/param.h>
     12#include <linux/device.h>
     13#include <linux/platform_device.h>
     14#include <linux/cpu.h>
     15#include <linux/smp.h>
     16#include <linux/slab.h>
     17#include <linux/pm.h>
     18#include <linux/thermal.h>
     19#include <linux/debugfs.h>
     20
     21#include <asm/cpu_device_id.h>
     22
     23#include "thermal_interrupt.h"
     24
     25/*
     26* Rate control delay: Idea is to introduce denounce effect
     27* This should be long enough to avoid reduce events, when
     28* threshold is set to a temperature, which is constantly
     29* violated, but at the short enough to take any action.
     30* The action can be remove threshold or change it to next
     31* interesting setting. Based on experiments, in around
     32* every 5 seconds under load will give us a significant
     33* temperature change.
     34*/
     35#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
     36static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
     37module_param(notify_delay_ms, int, 0644);
     38MODULE_PARM_DESC(notify_delay_ms,
     39	"User space notification delay in milli seconds.");
     40
     41/* Number of trip points in thermal zone. Currently it can't
     42* be more than 2. MSR can allow setting and getting notifications
     43* for only 2 thresholds. This define enforces this, if there
     44* is some wrong values returned by cpuid for number of thresholds.
     45*/
     46#define MAX_NUMBER_OF_TRIPS	2
     47
     48struct zone_device {
     49	int				cpu;
     50	bool				work_scheduled;
     51	u32				tj_max;
     52	u32				msr_pkg_therm_low;
     53	u32				msr_pkg_therm_high;
     54	struct delayed_work		work;
     55	struct thermal_zone_device	*tzone;
     56	struct cpumask			cpumask;
     57};
     58
     59static struct thermal_zone_params pkg_temp_tz_params = {
     60	.no_hwmon	= true,
     61};
     62
     63/* Keep track of how many zone pointers we allocated in init() */
     64static int max_id __read_mostly;
     65/* Array of zone pointers */
     66static struct zone_device **zones;
     67/* Serializes interrupt notification, work and hotplug */
     68static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
     69/* Protects zone operation in the work function against hotplug removal */
     70static DEFINE_MUTEX(thermal_zone_mutex);
     71
     72/* The dynamically assigned cpu hotplug state for module_exit() */
     73static enum cpuhp_state pkg_thermal_hp_state __read_mostly;
     74
     75/* Debug counters to show using debugfs */
     76static struct dentry *debugfs;
     77static unsigned int pkg_interrupt_cnt;
     78static unsigned int pkg_work_cnt;
     79
     80static void pkg_temp_debugfs_init(void)
     81{
     82	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
     83
     84	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
     85			   &pkg_interrupt_cnt);
     86	debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
     87			   &pkg_work_cnt);
     88}
     89
     90/*
     91 * Protection:
     92 *
     93 * - cpu hotplug: Read serialized by cpu hotplug lock
     94 *		  Write must hold pkg_temp_lock
     95 *
     96 * - Other callsites: Must hold pkg_temp_lock
     97 */
     98static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
     99{
    100	int id = topology_logical_die_id(cpu);
    101
    102	if (id >= 0 && id < max_id)
    103		return zones[id];
    104	return NULL;
    105}
    106
    107/*
    108* tj-max is is interesting because threshold is set relative to this
    109* temperature.
    110*/
    111static int get_tj_max(int cpu, u32 *tj_max)
    112{
    113	u32 eax, edx, val;
    114	int err;
    115
    116	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
    117	if (err)
    118		return err;
    119
    120	val = (eax >> 16) & 0xff;
    121	*tj_max = val * 1000;
    122
    123	return val ? 0 : -EINVAL;
    124}
    125
    126static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
    127{
    128	struct zone_device *zonedev = tzd->devdata;
    129	u32 eax, edx;
    130
    131	rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
    132			&eax, &edx);
    133	if (eax & 0x80000000) {
    134		*temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
    135		pr_debug("sys_get_curr_temp %d\n", *temp);
    136		return 0;
    137	}
    138	return -EINVAL;
    139}
    140
    141static int sys_get_trip_temp(struct thermal_zone_device *tzd,
    142			     int trip, int *temp)
    143{
    144	struct zone_device *zonedev = tzd->devdata;
    145	unsigned long thres_reg_value;
    146	u32 mask, shift, eax, edx;
    147	int ret;
    148
    149	if (trip >= MAX_NUMBER_OF_TRIPS)
    150		return -EINVAL;
    151
    152	if (trip) {
    153		mask = THERM_MASK_THRESHOLD1;
    154		shift = THERM_SHIFT_THRESHOLD1;
    155	} else {
    156		mask = THERM_MASK_THRESHOLD0;
    157		shift = THERM_SHIFT_THRESHOLD0;
    158	}
    159
    160	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
    161			   &eax, &edx);
    162	if (ret < 0)
    163		return ret;
    164
    165	thres_reg_value = (eax & mask) >> shift;
    166	if (thres_reg_value)
    167		*temp = zonedev->tj_max - thres_reg_value * 1000;
    168	else
    169		*temp = THERMAL_TEMP_INVALID;
    170	pr_debug("sys_get_trip_temp %d\n", *temp);
    171
    172	return 0;
    173}
    174
    175static int
    176sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
    177{
    178	struct zone_device *zonedev = tzd->devdata;
    179	u32 l, h, mask, shift, intr;
    180	int ret;
    181
    182	if (trip >= MAX_NUMBER_OF_TRIPS || temp >= zonedev->tj_max)
    183		return -EINVAL;
    184
    185	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
    186			   &l, &h);
    187	if (ret < 0)
    188		return ret;
    189
    190	if (trip) {
    191		mask = THERM_MASK_THRESHOLD1;
    192		shift = THERM_SHIFT_THRESHOLD1;
    193		intr = THERM_INT_THRESHOLD1_ENABLE;
    194	} else {
    195		mask = THERM_MASK_THRESHOLD0;
    196		shift = THERM_SHIFT_THRESHOLD0;
    197		intr = THERM_INT_THRESHOLD0_ENABLE;
    198	}
    199	l &= ~mask;
    200	/*
    201	* When users space sets a trip temperature == 0, which is indication
    202	* that, it is no longer interested in receiving notifications.
    203	*/
    204	if (!temp) {
    205		l &= ~intr;
    206	} else {
    207		l |= (zonedev->tj_max - temp)/1000 << shift;
    208		l |= intr;
    209	}
    210
    211	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
    212			l, h);
    213}
    214
    215static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
    216			     enum thermal_trip_type *type)
    217{
    218	*type = THERMAL_TRIP_PASSIVE;
    219	return 0;
    220}
    221
    222/* Thermal zone callback registry */
    223static struct thermal_zone_device_ops tzone_ops = {
    224	.get_temp = sys_get_curr_temp,
    225	.get_trip_temp = sys_get_trip_temp,
    226	.get_trip_type = sys_get_trip_type,
    227	.set_trip_temp = sys_set_trip_temp,
    228};
    229
    230static bool pkg_thermal_rate_control(void)
    231{
    232	return true;
    233}
    234
    235/* Enable threshold interrupt on local package/cpu */
    236static inline void enable_pkg_thres_interrupt(void)
    237{
    238	u8 thres_0, thres_1;
    239	u32 l, h;
    240
    241	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
    242	/* only enable/disable if it had valid threshold value */
    243	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
    244	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
    245	if (thres_0)
    246		l |= THERM_INT_THRESHOLD0_ENABLE;
    247	if (thres_1)
    248		l |= THERM_INT_THRESHOLD1_ENABLE;
    249	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
    250}
    251
    252/* Disable threshold interrupt on local package/cpu */
    253static inline void disable_pkg_thres_interrupt(void)
    254{
    255	u32 l, h;
    256
    257	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
    258
    259	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
    260	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
    261}
    262
    263static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
    264{
    265	struct thermal_zone_device *tzone = NULL;
    266	int cpu = smp_processor_id();
    267	struct zone_device *zonedev;
    268	u64 msr_val, wr_val;
    269
    270	mutex_lock(&thermal_zone_mutex);
    271	raw_spin_lock_irq(&pkg_temp_lock);
    272	++pkg_work_cnt;
    273
    274	zonedev = pkg_temp_thermal_get_dev(cpu);
    275	if (!zonedev) {
    276		raw_spin_unlock_irq(&pkg_temp_lock);
    277		mutex_unlock(&thermal_zone_mutex);
    278		return;
    279	}
    280	zonedev->work_scheduled = false;
    281
    282	rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
    283	wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
    284	if (wr_val != msr_val) {
    285		wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
    286		tzone = zonedev->tzone;
    287	}
    288
    289	enable_pkg_thres_interrupt();
    290	raw_spin_unlock_irq(&pkg_temp_lock);
    291
    292	/*
    293	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
    294	 * concurrent removal in the cpu offline callback.
    295	 */
    296	if (tzone)
    297		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);
    298
    299	mutex_unlock(&thermal_zone_mutex);
    300}
    301
    302static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
    303{
    304	unsigned long ms = msecs_to_jiffies(notify_delay_ms);
    305
    306	schedule_delayed_work_on(cpu, work, ms);
    307}
    308
    309static int pkg_thermal_notify(u64 msr_val)
    310{
    311	int cpu = smp_processor_id();
    312	struct zone_device *zonedev;
    313	unsigned long flags;
    314
    315	raw_spin_lock_irqsave(&pkg_temp_lock, flags);
    316	++pkg_interrupt_cnt;
    317
    318	disable_pkg_thres_interrupt();
    319
    320	/* Work is per package, so scheduling it once is enough. */
    321	zonedev = pkg_temp_thermal_get_dev(cpu);
    322	if (zonedev && !zonedev->work_scheduled) {
    323		zonedev->work_scheduled = true;
    324		pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
    325	}
    326
    327	raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
    328	return 0;
    329}
    330
    331static int pkg_temp_thermal_device_add(unsigned int cpu)
    332{
    333	int id = topology_logical_die_id(cpu);
    334	u32 tj_max, eax, ebx, ecx, edx;
    335	struct zone_device *zonedev;
    336	int thres_count, err;
    337
    338	if (id >= max_id)
    339		return -ENOMEM;
    340
    341	cpuid(6, &eax, &ebx, &ecx, &edx);
    342	thres_count = ebx & 0x07;
    343	if (!thres_count)
    344		return -ENODEV;
    345
    346	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
    347
    348	err = get_tj_max(cpu, &tj_max);
    349	if (err)
    350		return err;
    351
    352	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
    353	if (!zonedev)
    354		return -ENOMEM;
    355
    356	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
    357	zonedev->cpu = cpu;
    358	zonedev->tj_max = tj_max;
    359	zonedev->tzone = thermal_zone_device_register("x86_pkg_temp",
    360			thres_count,
    361			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
    362			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
    363	if (IS_ERR(zonedev->tzone)) {
    364		err = PTR_ERR(zonedev->tzone);
    365		kfree(zonedev);
    366		return err;
    367	}
    368	err = thermal_zone_device_enable(zonedev->tzone);
    369	if (err) {
    370		thermal_zone_device_unregister(zonedev->tzone);
    371		kfree(zonedev);
    372		return err;
    373	}
    374	/* Store MSR value for package thermal interrupt, to restore at exit */
    375	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
    376	      zonedev->msr_pkg_therm_high);
    377
    378	cpumask_set_cpu(cpu, &zonedev->cpumask);
    379	raw_spin_lock_irq(&pkg_temp_lock);
    380	zones[id] = zonedev;
    381	raw_spin_unlock_irq(&pkg_temp_lock);
    382	return 0;
    383}
    384
    385static int pkg_thermal_cpu_offline(unsigned int cpu)
    386{
    387	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
    388	bool lastcpu, was_target;
    389	int target;
    390
    391	if (!zonedev)
    392		return 0;
    393
    394	target = cpumask_any_but(&zonedev->cpumask, cpu);
    395	cpumask_clear_cpu(cpu, &zonedev->cpumask);
    396	lastcpu = target >= nr_cpu_ids;
    397	/*
    398	 * Remove the sysfs files, if this is the last cpu in the package
    399	 * before doing further cleanups.
    400	 */
    401	if (lastcpu) {
    402		struct thermal_zone_device *tzone = zonedev->tzone;
    403
    404		/*
    405		 * We must protect against a work function calling
    406		 * thermal_zone_update, after/while unregister. We null out
    407		 * the pointer under the zone mutex, so the worker function
    408		 * won't try to call.
    409		 */
    410		mutex_lock(&thermal_zone_mutex);
    411		zonedev->tzone = NULL;
    412		mutex_unlock(&thermal_zone_mutex);
    413
    414		thermal_zone_device_unregister(tzone);
    415	}
    416
    417	/* Protect against work and interrupts */
    418	raw_spin_lock_irq(&pkg_temp_lock);
    419
    420	/*
    421	 * Check whether this cpu was the current target and store the new
    422	 * one. When we drop the lock, then the interrupt notify function
    423	 * will see the new target.
    424	 */
    425	was_target = zonedev->cpu == cpu;
    426	zonedev->cpu = target;
    427
    428	/*
    429	 * If this is the last CPU in the package remove the package
    430	 * reference from the array and restore the interrupt MSR. When we
    431	 * drop the lock neither the interrupt notify function nor the
    432	 * worker will see the package anymore.
    433	 */
    434	if (lastcpu) {
    435		zones[topology_logical_die_id(cpu)] = NULL;
    436		/* After this point nothing touches the MSR anymore. */
    437		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
    438		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
    439	}
    440
    441	/*
    442	 * Check whether there is work scheduled and whether the work is
    443	 * targeted at the outgoing CPU.
    444	 */
    445	if (zonedev->work_scheduled && was_target) {
    446		/*
    447		 * To cancel the work we need to drop the lock, otherwise
    448		 * we might deadlock if the work needs to be flushed.
    449		 */
    450		raw_spin_unlock_irq(&pkg_temp_lock);
    451		cancel_delayed_work_sync(&zonedev->work);
    452		raw_spin_lock_irq(&pkg_temp_lock);
    453		/*
    454		 * If this is not the last cpu in the package and the work
    455		 * did not run after we dropped the lock above, then we
    456		 * need to reschedule the work, otherwise the interrupt
    457		 * stays disabled forever.
    458		 */
    459		if (!lastcpu && zonedev->work_scheduled)
    460			pkg_thermal_schedule_work(target, &zonedev->work);
    461	}
    462
    463	raw_spin_unlock_irq(&pkg_temp_lock);
    464
    465	/* Final cleanup if this is the last cpu */
    466	if (lastcpu)
    467		kfree(zonedev);
    468	return 0;
    469}
    470
    471static int pkg_thermal_cpu_online(unsigned int cpu)
    472{
    473	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
    474	struct cpuinfo_x86 *c = &cpu_data(cpu);
    475
    476	/* Paranoia check */
    477	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
    478		return -ENODEV;
    479
    480	/* If the package exists, nothing to do */
    481	if (zonedev) {
    482		cpumask_set_cpu(cpu, &zonedev->cpumask);
    483		return 0;
    484	}
    485	return pkg_temp_thermal_device_add(cpu);
    486}
    487
    488static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
    489	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
    490	{}
    491};
    492MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
    493
    494static int __init pkg_temp_thermal_init(void)
    495{
    496	int ret;
    497
    498	if (!x86_match_cpu(pkg_temp_thermal_ids))
    499		return -ENODEV;
    500
    501	max_id = topology_max_packages() * topology_max_die_per_package();
    502	zones = kcalloc(max_id, sizeof(struct zone_device *),
    503			   GFP_KERNEL);
    504	if (!zones)
    505		return -ENOMEM;
    506
    507	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
    508				pkg_thermal_cpu_online,	pkg_thermal_cpu_offline);
    509	if (ret < 0)
    510		goto err;
    511
    512	/* Store the state for module exit */
    513	pkg_thermal_hp_state = ret;
    514
    515	platform_thermal_package_notify = pkg_thermal_notify;
    516	platform_thermal_package_rate_control = pkg_thermal_rate_control;
    517
    518	 /* Don't care if it fails */
    519	pkg_temp_debugfs_init();
    520	return 0;
    521
    522err:
    523	kfree(zones);
    524	return ret;
    525}
    526module_init(pkg_temp_thermal_init)
    527
    528static void __exit pkg_temp_thermal_exit(void)
    529{
    530	platform_thermal_package_notify = NULL;
    531	platform_thermal_package_rate_control = NULL;
    532
    533	cpuhp_remove_state(pkg_thermal_hp_state);
    534	debugfs_remove_recursive(debugfs);
    535	kfree(zones);
    536}
    537module_exit(pkg_temp_thermal_exit)
    538
    539MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
    540MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
    541MODULE_LICENSE("GPL v2");