cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

intel_powerclamp.c (20063B)


// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts that need no ack,
 *              clamping down a cpu in non-irq context does not reduce the
 *              irq rate. For the majority of cases, clamping down a cpu does
 *              help reduce irqs as well; we should be able to differentiate
 *              the two cases and give a quantitative solution for the irqs
 *              we can control, perhaps based on get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other hw blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
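/*
 * Illustrative arithmetic (assuming HZ=1000): the 6-jiffy default
 * corresponds to 6 ms of forced idle per injection; powerclamp_init()
 * below converts it with jiffies_to_msecs() when no duration module
 * parameter is given.
 */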

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters; defaults to
				  * the BSP, but the BSP can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread workers
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

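	/*
	 * Note: the value is clamped into [6, 25] and applied even when
	 * -EINVAL is returned above; the error only reports that the input
	 * was outside the recommended range.
	 */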
	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:
	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

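/*
 * Since the parameter above is registered with mode 0644, it can also be
 * tuned at runtime through sysfs, e.g. (hypothetical shell usage):
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 */
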
struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that
				    * gets incremented each time a clamping
				    * period completes without extra wakeups.
				    * Once the counter reaches a given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:
	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls the idle ratio within this window. A larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. Defaults to 2.");

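/*
 * Scan CPUID leaf 5 (MONITOR/MWAIT) for the deepest supported C-state and
 * sub-state, then build the MWAIT hint ((cstate << MWAIT_SUBSTATE_SIZE) |
 * substate). Illustrative example (actual support varies by CPU): if EDX
 * advertises two sub-states for C6 as the deepest state, the hint becomes
 * (6 << MWAIT_SUBSTATE_SIZE) | 1 = 0x61.
 */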
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

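/*
 * A minimal sketch of the calibration lookup below, with assumed numbers:
 * for a target ratio of 20, if ratios 19, 20 and 21 all reached
 * CONFIDENCE_OK with steady_comp values 2, 3 and 4, the middle branch
 * returns (3 + 2 + 4) / 3 = 3, so 23% idle is injected to land at 20%.
 */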
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Skip the adjustment if the confidence level has already been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period; in that case the data cannot be trusted for
	 * compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

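/*
 * The package C-state residency MSRs count at TSC frequency, so the
 * achieved idle ratio over a window is 100 * delta(residency) / delta(TSC).
 * With assumed numbers: a residency delta of 30M ticks against a TSC delta
 * of 100M ticks yields current_ratio = 30 (percent).
 */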
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure the user-selected ratio does not take effect until
	 * the next round. adjust target_ratio if the user has changed
	 * the target, so that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems may have different abilities to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

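	/*
	 * Illustrative arithmetic with assumed numbers: duration_jiffies = 6
	 * and a compensated ratio of 25 give interval = 6 * 100 / 25 = 24,
	 * i.e. one 6-jiffy idle injection every 24 jiffies, roughly 25% idle.
	 */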
	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only the elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor the actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all work items that get queued after this point
	 * see clamping disabled. The counterpart barrier is not needed
	 * because there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work might still get queued here because the
	 * handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the
	 * kthread will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	cpus_read_lock();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	cpus_read_unlock();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
		pr_debug("clamping worker for cpu %d alive, destroy\n", i);
		stop_power_clamp_worker(i);
	}
}

static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else { /* adjust currently running */
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

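/*
 * The generic thermal layer exposes registered cooling devices through
 * sysfs, so idle injection can be driven from userspace. Hypothetical
 * shell usage (the device index varies per system):
 *   echo 30 > /sys/class/thermal/cooling_deviceX/cur_state   # ~30% idle
 *   echo 0  > /sys/class/thermal/cooling_deviceX/cur_state   # stop clamping
 */
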
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/*
	 * The goal of idle time alignment is to achieve a package c-state;
	 * bail out if no package c-state residency counter is available.
	 */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;

	cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, may be adjusted at runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	bitmap_free(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	bitmap_free(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");