cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

intel_pstate.c (92634B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * intel_pstate.c: Native P state management for Intel processors
      4 *
      5 * (C) Copyright 2012 Intel Corporation
      6 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
      7 */
      8
      9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     10
     11#include <linux/kernel.h>
     12#include <linux/kernel_stat.h>
     13#include <linux/module.h>
     14#include <linux/ktime.h>
     15#include <linux/hrtimer.h>
     16#include <linux/tick.h>
     17#include <linux/slab.h>
     18#include <linux/sched/cpufreq.h>
     19#include <linux/list.h>
     20#include <linux/cpu.h>
     21#include <linux/cpufreq.h>
     22#include <linux/sysfs.h>
     23#include <linux/types.h>
     24#include <linux/fs.h>
     25#include <linux/acpi.h>
     26#include <linux/vmalloc.h>
     27#include <linux/pm_qos.h>
     28#include <trace/events/power.h>
     29
     30#include <asm/div64.h>
     31#include <asm/msr.h>
     32#include <asm/cpu_device_id.h>
     33#include <asm/cpufeature.h>
     34#include <asm/intel-family.h>
     35#include "../drivers/thermal/intel/thermal_interrupt.h"
     36
     37#define INTEL_PSTATE_SAMPLING_INTERVAL	(10 * NSEC_PER_MSEC)
     38
     39#define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
     40#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP	5000
     41#define INTEL_CPUFREQ_TRANSITION_DELAY		500
     42
     43#ifdef CONFIG_ACPI
     44#include <acpi/processor.h>
     45#include <acpi/cppc_acpi.h>
     46#endif
     47
     48#define FRAC_BITS 8
     49#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
     50#define fp_toint(X) ((X) >> FRAC_BITS)
     51
     52#define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3))
     53
     54#define EXT_BITS 6
     55#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
     56#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
     57#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
     58
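/*
 * Worked example of the fixed-point arithmetic used by the macros above
 * and the helpers below: with FRAC_BITS == 8, values are stored in 24.8
 * binary fixed point, so
 *
 *	int_tofp(3)                      == 3 << 8 == 768
 *	fp_toint(768)                    == 3
 *	mul_fp(int_tofp(3), int_tofp(2)) == int_tofp(6) == 1536
 *	ceiling_fp(int_tofp(3) + 1)      == 4
 *
 * The "ext" variants add EXT_BITS == 6 extra fractional bits
 * (EXT_FRAC_BITS == 14) for higher-precision intermediate ratios such as
 * APERF/MPERF.
 */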
     59static inline int32_t mul_fp(int32_t x, int32_t y)
     60{
     61	return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
     62}
     63
     64static inline int32_t div_fp(s64 x, s64 y)
     65{
     66	return div64_s64((int64_t)x << FRAC_BITS, y);
     67}
     68
     69static inline int ceiling_fp(int32_t x)
     70{
     71	int mask, ret;
     72
     73	ret = fp_toint(x);
     74	mask = (1 << FRAC_BITS) - 1;
     75	if (x & mask)
     76		ret += 1;
     77	return ret;
     78}
     79
     80static inline u64 mul_ext_fp(u64 x, u64 y)
     81{
     82	return (x * y) >> EXT_FRAC_BITS;
     83}
     84
     85static inline u64 div_ext_fp(u64 x, u64 y)
     86{
     87	return div64_u64(x << EXT_FRAC_BITS, y);
     88}
     89
     90/**
     91 * struct sample -	Store performance sample
     92 * @core_avg_perf:	Ratio of APERF/MPERF which is the actual average
     93 *			performance during last sample period
     94 * @busy_scaled:	Scaled busy value which is used to calculate next
     95 *			P state. This can be different than core_avg_perf
     96 *			to account for cpu idle period
     97 * @aperf:		Difference of actual performance frequency clock count
     98 *			read from APERF MSR between last and current sample
     99 * @mperf:		Difference of maximum performance frequency clock count
    100 *			read from MPERF MSR between last and current sample
    101 * @tsc:		Difference of time stamp counter between last and
    102 *			current sample
    103 * @time:		Current time from scheduler
    104 *
    105 * This structure is used in the cpudata structure to store performance sample
    106 * data for choosing next P State.
    107 */
    108struct sample {
    109	int32_t core_avg_perf;
    110	int32_t busy_scaled;
    111	u64 aperf;
    112	u64 mperf;
    113	u64 tsc;
    114	u64 time;
    115};
    116
    117/**
    118 * struct pstate_data - Store P state data
    119 * @current_pstate:	Current requested P state
    120 * @min_pstate:		Min P state possible for this platform
    121 * @max_pstate:		Max P state possible for this platform
    122 * @max_pstate_physical: Physical Max P state for a processor.
    123 *			This can be higher than max_pstate, which may
    124 *			be limited by platform thermal design power limits
    125 * @perf_ctl_scaling:	PERF_CTL P-state to frequency scaling factor
    126 * @scaling:		Scaling factor between performance and frequency
    127 * @turbo_pstate:	Max Turbo P state possible for this platform
    128 * @min_freq:		@min_pstate frequency in cpufreq units
    129 * @max_freq:		@max_pstate frequency in cpufreq units
    130 * @turbo_freq:		@turbo_pstate frequency in cpufreq units
    131 *
    132 * Stores the per cpu model P state limits and current P state.
    133 */
    134struct pstate_data {
    135	int	current_pstate;
    136	int	min_pstate;
    137	int	max_pstate;
    138	int	max_pstate_physical;
    139	int	perf_ctl_scaling;
    140	int	scaling;
    141	int	turbo_pstate;
    142	unsigned int min_freq;
    143	unsigned int max_freq;
    144	unsigned int turbo_freq;
    145};
    146
    147/**
    148 * struct vid_data -	Stores voltage information data
    149 * @min:		VID data for this platform corresponding to
    150 *			the lowest P state
    151 * @max:		VID data corresponding to the highest P State.
    152 * @turbo:		VID data for turbo P state
    153 * @ratio:		Ratio of (vid max - vid min) /
    154 *			(max P state - Min P State)
    155 *
    156 * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling)
    157 * This data is used in Atom platforms, where in addition to target P state,
    158 * the voltage data needs to be specified to select next P State.
    159 */
    160struct vid_data {
    161	int min;
    162	int max;
    163	int turbo;
    164	int32_t ratio;
    165};
    166
    167/**
    168 * struct global_params - Global parameters, mostly tunable via sysfs.
    169 * @no_turbo:		Whether or not to use turbo P-states.
    170 * @turbo_disabled:	Whether or not turbo P-states are available at all,
    171 *			based on the MSR_IA32_MISC_ENABLE value and whether or
    172 *			not the maximum reported turbo P-state is different from
    173 *			the maximum reported non-turbo one.
    174 * @turbo_disabled_mf:	The @turbo_disabled value reflected by cpuinfo.max_freq.
    175 * @min_perf_pct:	Minimum capacity limit in percent of the maximum turbo
    176 *			P-state capacity.
    177 * @max_perf_pct:	Maximum capacity limit in percent of the maximum turbo
    178 *			P-state capacity.
    179 */
    180struct global_params {
    181	bool no_turbo;
    182	bool turbo_disabled;
    183	bool turbo_disabled_mf;
    184	int max_perf_pct;
    185	int min_perf_pct;
    186};
    187
    188/**
    189 * struct cpudata -	Per CPU instance data storage
    190 * @cpu:		CPU number for this instance data
    191 * @policy:		CPUFreq policy value
    192 * @update_util:	CPUFreq utility callback information
    193 * @update_util_set:	CPUFreq utility callback is set
    194 * @iowait_boost:	iowait-related boost fraction
    195 * @last_update:	Time of the last update.
    196 * @pstate:		Stores P state limits for this CPU
    197 * @vid:		Stores VID limits for this CPU
    198 * @last_sample_time:	Last Sample time
    199 * @aperf_mperf_shift:	APERF vs MPERF counting frequency difference
    200 * @prev_aperf:		Last APERF value read from APERF MSR
    201 * @prev_mperf:		Last MPERF value read from MPERF MSR
    202 * @prev_tsc:		Last timestamp counter (TSC) value
    203 * @prev_cummulative_iowait: IO Wait time difference from last and
    204 *			current sample
    205 * @sample:		Storage for storing last Sample data
    206 * @min_perf_ratio:	Minimum capacity in terms of PERF or HWP ratios
    207 * @max_perf_ratio:	Maximum capacity in terms of PERF or HWP ratios
    208 * @acpi_perf_data:	Stores ACPI perf information read from _PSS
    209 * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
    210 * @epp_powersave:	Last saved HWP energy performance preference
    211 *			(EPP) or energy performance bias (EPB),
    212 *			when policy switched to performance
    213 * @epp_policy:		Last saved policy used to set EPP/EPB
    214 * @epp_default:	Power on default HWP energy performance
    215 *			preference/bias
    216 * @epp_cached:		Cached HWP energy-performance preference value
    217 * @hwp_req_cached:	Cached value of the last HWP Request MSR
    218 * @hwp_cap_cached:	Cached value of the last HWP Capabilities MSR
    219 * @last_io_update:	Last time when IO wake flag was set
    220 * @sched_flags:	Store scheduler flags for possible cross CPU update
    221 * @hwp_boost_min:	Last HWP boosted min performance
    222 * @suspended:		Whether or not the driver has been suspended.
    223 * @hwp_notify_work:	workqueue for HWP notifications.
    224 *
    225 * This structure stores per CPU instance data for all CPUs.
    226 */
    227struct cpudata {
    228	int cpu;
    229
    230	unsigned int policy;
    231	struct update_util_data update_util;
    232	bool   update_util_set;
    233
    234	struct pstate_data pstate;
    235	struct vid_data vid;
    236
    237	u64	last_update;
    238	u64	last_sample_time;
    239	u64	aperf_mperf_shift;
    240	u64	prev_aperf;
    241	u64	prev_mperf;
    242	u64	prev_tsc;
    243	u64	prev_cummulative_iowait;
    244	struct sample sample;
    245	int32_t	min_perf_ratio;
    246	int32_t	max_perf_ratio;
    247#ifdef CONFIG_ACPI
    248	struct acpi_processor_performance acpi_perf_data;
    249	bool valid_pss_table;
    250#endif
    251	unsigned int iowait_boost;
    252	s16 epp_powersave;
    253	s16 epp_policy;
    254	s16 epp_default;
    255	s16 epp_cached;
    256	u64 hwp_req_cached;
    257	u64 hwp_cap_cached;
    258	u64 last_io_update;
    259	unsigned int sched_flags;
    260	u32 hwp_boost_min;
    261	bool suspended;
    262	struct delayed_work hwp_notify_work;
    263};
    264
    265static struct cpudata **all_cpu_data;
    266
    267/**
    268 * struct pstate_funcs - Per CPU model specific callbacks
    269 * @get_max:		Callback to get maximum non turbo effective P state
    270 * @get_max_physical:	Callback to get maximum non turbo physical P state
    271 * @get_min:		Callback to get minimum P state
    272 * @get_turbo:		Callback to get turbo P state
    273 * @get_scaling:	Callback to get frequency scaling factor
    274 * @get_cpu_scaling:	Get frequency scaling factor for a given cpu
    275 * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference
    276 * @get_val:		Callback to convert P state to actual MSR write value
    277 * @get_vid:		Callback to get VID data for Atom platforms
    278 *
    279 * Core and Atom CPU models have different ways to get P State limits. This
    280 * structure is used to store those callbacks.
    281 */
    282struct pstate_funcs {
    283	int (*get_max)(void);
    284	int (*get_max_physical)(void);
    285	int (*get_min)(void);
    286	int (*get_turbo)(void);
    287	int (*get_scaling)(void);
    288	int (*get_cpu_scaling)(int cpu);
    289	int (*get_aperf_mperf_shift)(void);
    290	u64 (*get_val)(struct cpudata*, int pstate);
    291	void (*get_vid)(struct cpudata *);
    292};
    293
    294static struct pstate_funcs pstate_funcs __read_mostly;
    295
    296static int hwp_active __read_mostly;
    297static int hwp_mode_bdw __read_mostly;
    298static bool per_cpu_limits __read_mostly;
    299static bool hwp_boost __read_mostly;
    300
    301static struct cpufreq_driver *intel_pstate_driver __read_mostly;
    302
    303#ifdef CONFIG_ACPI
    304static bool acpi_ppc;
    305#endif
    306
    307static struct global_params global;
    308
    309static DEFINE_MUTEX(intel_pstate_driver_lock);
    310static DEFINE_MUTEX(intel_pstate_limits_lock);
    311
    312#ifdef CONFIG_ACPI
    313
    314static bool intel_pstate_acpi_pm_profile_server(void)
    315{
    316	if (acpi_gbl_FADT.preferred_profile == PM_ENTERPRISE_SERVER ||
    317	    acpi_gbl_FADT.preferred_profile == PM_PERFORMANCE_SERVER)
    318		return true;
    319
    320	return false;
    321}
    322
    323static bool intel_pstate_get_ppc_enable_status(void)
    324{
    325	if (intel_pstate_acpi_pm_profile_server())
    326		return true;
    327
    328	return acpi_ppc;
    329}
    330
    331#ifdef CONFIG_ACPI_CPPC_LIB
    332
    333/* The work item is needed to avoid CPU hotplug locking issues */
    334static void intel_pstate_sched_itmt_work_fn(struct work_struct *work)
    335{
    336	sched_set_itmt_support();
    337}
    338
    339static DECLARE_WORK(sched_itmt_work, intel_pstate_sched_itmt_work_fn);
    340
    341#define CPPC_MAX_PERF	U8_MAX
    342
    343static void intel_pstate_set_itmt_prio(int cpu)
    344{
    345	struct cppc_perf_caps cppc_perf;
    346	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
    347	int ret;
    348
    349	ret = cppc_get_perf_caps(cpu, &cppc_perf);
    350	if (ret)
    351		return;
    352
    353	/*
    354	 * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff.
    355	 * In that case CPPC.highest_perf can't be used to enable ITMT, so look at
    356	 * MSR_HWP_CAPABILITIES bits [8:0] instead to decide.
    357	 */
    358	if (cppc_perf.highest_perf == CPPC_MAX_PERF)
    359		cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached));
    360
    361	/*
    362	 * The priorities can be set regardless of whether or not
    363	 * sched_set_itmt_support(true) has been called and it is valid to
    364	 * update them at any time after it has been called.
    365	 */
    366	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
    367
    368	if (max_highest_perf <= min_highest_perf) {
    369		if (cppc_perf.highest_perf > max_highest_perf)
    370			max_highest_perf = cppc_perf.highest_perf;
    371
    372		if (cppc_perf.highest_perf < min_highest_perf)
    373			min_highest_perf = cppc_perf.highest_perf;
    374
    375		if (max_highest_perf > min_highest_perf) {
    376			/*
    377			 * This code can be run during CPU online under the
    378			 * CPU hotplug locks, so sched_set_itmt_support()
    379			 * cannot be called from here.  Queue up a work item
    380			 * to invoke it.
    381			 */
    382			schedule_work(&sched_itmt_work);
    383		}
    384	}
    385}
    386
    387static int intel_pstate_get_cppc_guaranteed(int cpu)
    388{
    389	struct cppc_perf_caps cppc_perf;
    390	int ret;
    391
    392	ret = cppc_get_perf_caps(cpu, &cppc_perf);
    393	if (ret)
    394		return ret;
    395
    396	if (cppc_perf.guaranteed_perf)
    397		return cppc_perf.guaranteed_perf;
    398
    399	return cppc_perf.nominal_perf;
    400}
    401
    402static u32 intel_pstate_cppc_nominal(int cpu)
    403{
    404	u64 nominal_perf;
    405
    406	if (cppc_get_nominal_perf(cpu, &nominal_perf))
    407		return 0;
    408
    409	return nominal_perf;
    410}
    411#else /* CONFIG_ACPI_CPPC_LIB */
    412static inline void intel_pstate_set_itmt_prio(int cpu)
    413{
    414}
    415#endif /* CONFIG_ACPI_CPPC_LIB */
    416
    417static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
    418{
    419	struct cpudata *cpu;
    420	int ret;
    421	int i;
    422
    423	if (hwp_active) {
    424		intel_pstate_set_itmt_prio(policy->cpu);
    425		return;
    426	}
    427
    428	if (!intel_pstate_get_ppc_enable_status())
    429		return;
    430
    431	cpu = all_cpu_data[policy->cpu];
    432
    433	ret = acpi_processor_register_performance(&cpu->acpi_perf_data,
    434						  policy->cpu);
    435	if (ret)
    436		return;
    437
    438	/*
    439	 * Check if the control value in _PSS is for PERF_CTL MSR, which should
    440	 * guarantee that the states returned by it map to the states in our
    441	 * list directly.
    442	 */
    443	if (cpu->acpi_perf_data.control_register.space_id !=
    444						ACPI_ADR_SPACE_FIXED_HARDWARE)
    445		goto err;
    446
    447	/*
    448	 * If there is only one entry in _PSS, simply ignore _PSS and continue as
    449	 * usual without taking _PSS into account
    450	 */
    451	if (cpu->acpi_perf_data.state_count < 2)
    452		goto err;
    453
    454	pr_debug("CPU%u - ACPI _PSS perf data\n", policy->cpu);
    455	for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
    456		pr_debug("     %cP%d: %u MHz, %u mW, 0x%x\n",
    457			 (i == cpu->acpi_perf_data.state ? '*' : ' '), i,
    458			 (u32) cpu->acpi_perf_data.states[i].core_frequency,
    459			 (u32) cpu->acpi_perf_data.states[i].power,
    460			 (u32) cpu->acpi_perf_data.states[i].control);
    461	}
    462
    463	/*
    464	 * The _PSS table doesn't contain the whole turbo frequency range;
    465	 * it only contains +1 MHz above the max non-turbo frequency, with a
    466	 * control value corresponding to the max turbo ratio. But when
    467	 * cpufreq set_policy is called, it is called with this max
    468	 * frequency, which causes reduced performance, because this driver
    469	 * uses the real max turbo frequency as the max frequency. So
    470	 * correct this frequency in the _PSS table to the real max turbo
    471	 * frequency based on the turbo state. Also convert to MHz, as the
    472	 * _PSS frequencies are in MHz.
    473	 */
    474	if (!global.turbo_disabled)
    475		cpu->acpi_perf_data.states[0].core_frequency =
    476					policy->cpuinfo.max_freq / 1000;
    477	cpu->valid_pss_table = true;
    478	pr_debug("_PPC limits will be enforced\n");
    479
    480	return;
    481
    482 err:
    483	cpu->valid_pss_table = false;
    484	acpi_processor_unregister_performance(policy->cpu);
    485}
    486
    487static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
    488{
    489	struct cpudata *cpu;
    490
    491	cpu = all_cpu_data[policy->cpu];
    492	if (!cpu->valid_pss_table)
    493		return;
    494
    495	acpi_processor_unregister_performance(policy->cpu);
    496}
    497#else /* CONFIG_ACPI */
    498static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
    499{
    500}
    501
    502static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
    503{
    504}
    505
    506static inline bool intel_pstate_acpi_pm_profile_server(void)
    507{
    508	return false;
    509}
    510#endif /* CONFIG_ACPI */
    511
    512#ifndef CONFIG_ACPI_CPPC_LIB
    513static inline int intel_pstate_get_cppc_guaranteed(int cpu)
    514{
    515	return -ENOTSUPP;
    516}
    517#endif /* CONFIG_ACPI_CPPC_LIB */
    518
    519/**
    520 * intel_pstate_hybrid_hwp_adjust - Calibrate HWP performance levels.
    521 * @cpu: Target CPU.
    522 *
    523 * On hybrid processors, HWP may expose more performance levels than there are
    524 * P-states accessible through the PERF_CTL interface.  If that happens, the
    525 * scaling factor between HWP performance levels and CPU frequency will be less
    526 * than the scaling factor between P-state values and CPU frequency.
    527 *
    528 * In that case, adjust the CPU parameters used in computations accordingly.
    529 */
    530static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu)
    531{
    532	int perf_ctl_max_phys = cpu->pstate.max_pstate_physical;
    533	int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
    534	int perf_ctl_turbo = pstate_funcs.get_turbo();
    535	int turbo_freq = perf_ctl_turbo * perf_ctl_scaling;
    536	int scaling = cpu->pstate.scaling;
    537
    538	pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys);
    539	pr_debug("CPU%d: perf_ctl_max = %d\n", cpu->cpu, pstate_funcs.get_max());
    540	pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo);
    541	pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling);
    542	pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate);
    543	pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate);
    544	pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);
    545
    546	/*
    547	 * If the product of the HWP performance scaling factor and the HWP_CAP
    548	 * highest performance is greater than the maximum turbo frequency
    549	 * corresponding to the pstate_funcs.get_turbo() return value, the
    550	 * scaling factor is too high, so recompute it to make the HWP_CAP
    551	 * highest performance correspond to the maximum turbo frequency.
    552	 */
    553	cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling;
    554	if (turbo_freq < cpu->pstate.turbo_freq) {
    555		cpu->pstate.turbo_freq = turbo_freq;
    556		scaling = DIV_ROUND_UP(turbo_freq, cpu->pstate.turbo_pstate);
    557		cpu->pstate.scaling = scaling;
    558
    559		pr_debug("CPU%d: refined HWP-to-frequency scaling factor: %d\n",
    560			 cpu->cpu, scaling);
    561	}
    562
    563	cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling,
    564					 perf_ctl_scaling);
    565
    566	cpu->pstate.max_pstate_physical =
    567			DIV_ROUND_UP(perf_ctl_max_phys * perf_ctl_scaling,
    568				     scaling);
    569
    570	cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
    571	/*
    572	 * Cast the min P-state value retrieved via pstate_funcs.get_min() to
    573	 * the effective range of HWP performance levels.
    574	 */
    575	cpu->pstate.min_pstate = DIV_ROUND_UP(cpu->pstate.min_freq, scaling);
    576}
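
/*
 * Illustrative example of the adjustment above, with hypothetical numbers:
 * if HWP_CAP.highest is 60 with an initial HWP scaling factor of 78741 kHz
 * per level, while pstate_funcs.get_turbo() returns 45 and perf_ctl_scaling
 * is 100000, then turbo_freq = 45 * 100000 = 4500000 kHz, which is below
 * 60 * 78741 = 4724460 kHz, so the scaling factor is refined to
 * DIV_ROUND_UP(4500000, 60) = 75000 kHz per HWP performance level.
 */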
    577
    578static inline void update_turbo_state(void)
    579{
    580	u64 misc_en;
    581	struct cpudata *cpu;
    582
    583	cpu = all_cpu_data[0];
    584	rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
    585	global.turbo_disabled =
    586		(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
    587		 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
    588}
    589
    590static int min_perf_pct_min(void)
    591{
    592	struct cpudata *cpu = all_cpu_data[0];
    593	int turbo_pstate = cpu->pstate.turbo_pstate;
    594
    595	return turbo_pstate ?
    596		(cpu->pstate.min_pstate * 100 / turbo_pstate) : 0;
    597}
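
/*
 * For example (hypothetical values), with min_pstate == 8 and
 * turbo_pstate == 40 this returns 8 * 100 / 40 == 20, so the global
 * min_perf_pct sysfs knob cannot be set below 20 (see store_min_perf_pct()).
 */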
    598
    599static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
    600{
    601	u64 epb;
    602	int ret;
    603
    604	if (!boot_cpu_has(X86_FEATURE_EPB))
    605		return -ENXIO;
    606
    607	ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
    608	if (ret)
    609		return (s16)ret;
    610
    611	return (s16)(epb & 0x0f);
    612}
    613
    614static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
    615{
    616	s16 epp;
    617
    618	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
    619		/*
    620		 * When hwp_req_data is 0, it means the caller didn't read
    621		 * MSR_HWP_REQUEST, so read it here to get the EPP.
    622		 */
    623		if (!hwp_req_data) {
    624			epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
    625					    &hwp_req_data);
    626			if (epp)
    627				return epp;
    628		}
    629		epp = (hwp_req_data >> 24) & 0xff;
    630	} else {
    631		/* When there is no EPP present, HWP uses EPB settings */
    632		epp = intel_pstate_get_epb(cpu_data);
    633	}
    634
    635	return epp;
    636}
    637
    638static int intel_pstate_set_epb(int cpu, s16 pref)
    639{
    640	u64 epb;
    641	int ret;
    642
    643	if (!boot_cpu_has(X86_FEATURE_EPB))
    644		return -ENXIO;
    645
    646	ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
    647	if (ret)
    648		return ret;
    649
    650	epb = (epb & ~0x0f) | pref;
    651	wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
    652
    653	return 0;
    654}
    655
    656/*
    657 * EPP/EPB display strings corresponding to EPP index in the
    658 * energy_perf_strings[]
    659 *	index		String
    660 *-------------------------------------
    661 *	0		default
    662 *	1		performance
    663 *	2		balance_performance
    664 *	3		balance_power
    665 *	4		power
    666 */
    667
    668enum energy_perf_value_index {
    669	EPP_INDEX_DEFAULT = 0,
    670	EPP_INDEX_PERFORMANCE,
    671	EPP_INDEX_BALANCE_PERFORMANCE,
    672	EPP_INDEX_BALANCE_POWERSAVE,
    673	EPP_INDEX_POWERSAVE,
    674};
    675
    676static const char * const energy_perf_strings[] = {
    677	[EPP_INDEX_DEFAULT] = "default",
    678	[EPP_INDEX_PERFORMANCE] = "performance",
    679	[EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance",
    680	[EPP_INDEX_BALANCE_POWERSAVE] = "balance_power",
    681	[EPP_INDEX_POWERSAVE] = "power",
    682	NULL
    683};
    684static unsigned int epp_values[] = {
    685	[EPP_INDEX_DEFAULT] = 0, /* Unused index */
    686	[EPP_INDEX_PERFORMANCE] = HWP_EPP_PERFORMANCE,
    687	[EPP_INDEX_BALANCE_PERFORMANCE] = HWP_EPP_BALANCE_PERFORMANCE,
    688	[EPP_INDEX_BALANCE_POWERSAVE] = HWP_EPP_BALANCE_POWERSAVE,
    689	[EPP_INDEX_POWERSAVE] = HWP_EPP_POWERSAVE,
    690};
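
/*
 * The preference strings above are exposed through the per-policy sysfs
 * attribute energy_performance_preference (see the show/store handlers
 * below).  Typical usage from user space, e.g. for policy0:
 *
 *	# cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference
 *	# echo balance_power > /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference
 *
 * On CPUs with HWP_EPP, a raw numeric value in the range 0-255 is also
 * accepted in place of a named preference.
 */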
    691
    692static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp)
    693{
    694	s16 epp;
    695	int index = -EINVAL;
    696
    697	*raw_epp = 0;
    698	epp = intel_pstate_get_epp(cpu_data, 0);
    699	if (epp < 0)
    700		return epp;
    701
    702	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
    703		if (epp == epp_values[EPP_INDEX_PERFORMANCE])
    704			return EPP_INDEX_PERFORMANCE;
    705		if (epp == epp_values[EPP_INDEX_BALANCE_PERFORMANCE])
    706			return EPP_INDEX_BALANCE_PERFORMANCE;
    707		if (epp == epp_values[EPP_INDEX_BALANCE_POWERSAVE])
    708			return EPP_INDEX_BALANCE_POWERSAVE;
    709		if (epp == epp_values[EPP_INDEX_POWERSAVE])
    710			return EPP_INDEX_POWERSAVE;
    711		*raw_epp = epp;
    712		return 0;
    713	} else if (boot_cpu_has(X86_FEATURE_EPB)) {
    714		/*
    715		 * Range:
    716		 *	0x00-0x03	:	Performance
    717		 *	0x04-0x07	:	Balance performance
    718		 *	0x08-0x0B	:	Balance power
    719		 *	0x0C-0x0F	:	Power
    720		 * The EPB is a 4 bit value, but our ranges restrict the
    721		 * values which can be set. Effectively only the top two
    722		 * bits are used here.
    723		 */
    724		index = (epp >> 2) + 1;
    725	}
    726
    727	return index;
    728}
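
/*
 * Example of the EPB fallback mapping above: an EPB value of 0x06 falls in
 * the 0x04-0x07 "balance performance" range and (0x06 >> 2) + 1 == 2, which
 * is EPP_INDEX_BALANCE_PERFORMANCE.
 */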
    729
    730static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp)
    731{
    732	int ret;
    733
    734	/*
    735	 * Use the cached HWP Request MSR value, because in the active mode the
    736	 * register itself may be updated by intel_pstate_hwp_boost_up() or
    737	 * intel_pstate_hwp_boost_down() at any time.
    738	 */
    739	u64 value = READ_ONCE(cpu->hwp_req_cached);
    740
    741	value &= ~GENMASK_ULL(31, 24);
    742	value |= (u64)epp << 24;
    743	/*
    744	 * The only other updater of hwp_req_cached in the active mode,
    745	 * intel_pstate_hwp_set(), is called under the same lock as this
    746	 * function, so it cannot run in parallel with the update below.
    747	 */
    748	WRITE_ONCE(cpu->hwp_req_cached, value);
    749	ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
    750	if (!ret)
    751		cpu->epp_cached = epp;
    752
    753	return ret;
    754}
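
/*
 * For reference, the MSR_HWP_REQUEST layout manipulated here (per the
 * Intel SDM): bits 7:0 minimum performance, 15:8 maximum performance,
 * 23:16 desired performance and 31:24 energy-performance preference,
 * which is why the EPP update above masks GENMASK_ULL(31, 24).
 */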
    755
    756static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
    757					      int pref_index, bool use_raw,
    758					      u32 raw_epp)
    759{
    760	int epp = -EINVAL;
    761	int ret;
    762
    763	if (!pref_index)
    764		epp = cpu_data->epp_default;
    765
    766	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
    767		if (use_raw)
    768			epp = raw_epp;
    769		else if (epp == -EINVAL)
    770			epp = epp_values[pref_index];
    771
    772		/*
    773		 * To avoid confusion, refuse to set EPP to any values different
    774		 * from 0 (performance) if the current policy is "performance",
    775		 * because those values would be overridden.
    776		 */
    777		if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
    778			return -EBUSY;
    779
    780		ret = intel_pstate_set_epp(cpu_data, epp);
    781	} else {
    782		if (epp == -EINVAL)
    783			epp = (pref_index - 1) << 2;
    784		ret = intel_pstate_set_epb(cpu_data->cpu, epp);
    785	}
    786
    787	return ret;
    788}
    789
    790static ssize_t show_energy_performance_available_preferences(
    791				struct cpufreq_policy *policy, char *buf)
    792{
    793	int i = 0;
    794	int ret = 0;
    795
    796	while (energy_perf_strings[i] != NULL)
    797		ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
    798
    799	ret += sprintf(&buf[ret], "\n");
    800
    801	return ret;
    802}
    803
    804cpufreq_freq_attr_ro(energy_performance_available_preferences);
    805
    806static struct cpufreq_driver intel_pstate;
    807
    808static ssize_t store_energy_performance_preference(
    809		struct cpufreq_policy *policy, const char *buf, size_t count)
    810{
    811	struct cpudata *cpu = all_cpu_data[policy->cpu];
    812	char str_preference[21];
    813	bool raw = false;
    814	ssize_t ret;
    815	u32 epp = 0;
    816
    817	ret = sscanf(buf, "%20s", str_preference);
    818	if (ret != 1)
    819		return -EINVAL;
    820
    821	ret = match_string(energy_perf_strings, -1, str_preference);
    822	if (ret < 0) {
    823		if (!boot_cpu_has(X86_FEATURE_HWP_EPP))
    824			return ret;
    825
    826		ret = kstrtouint(buf, 10, &epp);
    827		if (ret)
    828			return ret;
    829
    830		if (epp > 255)
    831			return -EINVAL;
    832
    833		raw = true;
    834	}
    835
    836	/*
    837	 * This function runs with the policy R/W semaphore held, which
    838	 * guarantees that the driver pointer will not change while it is
    839	 * running.
    840	 */
    841	if (!intel_pstate_driver)
    842		return -EAGAIN;
    843
    844	mutex_lock(&intel_pstate_limits_lock);
    845
    846	if (intel_pstate_driver == &intel_pstate) {
    847		ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp);
    848	} else {
    849		/*
    850		 * In the passive mode the governor needs to be stopped on the
    851		 * target CPU before the EPP update and restarted after it,
    852		 * which is super-heavy-weight, so make sure it is worth doing
    853		 * upfront.
    854		 */
    855		if (!raw)
    856			epp = ret ? epp_values[ret] : cpu->epp_default;
    857
    858		if (cpu->epp_cached != epp) {
    859			int err;
    860
    861			cpufreq_stop_governor(policy);
    862			ret = intel_pstate_set_epp(cpu, epp);
    863			err = cpufreq_start_governor(policy);
    864			if (!ret)
    865				ret = err;
    866		}
    867	}
    868
    869	mutex_unlock(&intel_pstate_limits_lock);
    870
    871	return ret ?: count;
    872}
    873
    874static ssize_t show_energy_performance_preference(
    875				struct cpufreq_policy *policy, char *buf)
    876{
    877	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
    878	int preference, raw_epp;
    879
    880	preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp);
    881	if (preference < 0)
    882		return preference;
    883
    884	if (raw_epp)
    885		return  sprintf(buf, "%d\n", raw_epp);
    886	else
    887		return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
    888}
    889
    890cpufreq_freq_attr_rw(energy_performance_preference);
    891
    892static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf)
    893{
    894	struct cpudata *cpu = all_cpu_data[policy->cpu];
    895	int ratio, freq;
    896
    897	ratio = intel_pstate_get_cppc_guaranteed(policy->cpu);
    898	if (ratio <= 0) {
    899		u64 cap;
    900
    901		rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap);
    902		ratio = HWP_GUARANTEED_PERF(cap);
    903	}
    904
    905	freq = ratio * cpu->pstate.scaling;
    906	if (cpu->pstate.scaling != cpu->pstate.perf_ctl_scaling)
    907		freq = rounddown(freq, cpu->pstate.perf_ctl_scaling);
    908
    909	return sprintf(buf, "%d\n", freq);
    910}
    911
    912cpufreq_freq_attr_ro(base_frequency);
    913
    914static struct freq_attr *hwp_cpufreq_attrs[] = {
    915	&energy_performance_preference,
    916	&energy_performance_available_preferences,
    917	&base_frequency,
    918	NULL,
    919};
    920
    921static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
    922{
    923	u64 cap;
    924
    925	rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap);
    926	WRITE_ONCE(cpu->hwp_cap_cached, cap);
    927	cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap);
    928	cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap);
    929}
    930
    931static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
    932{
    933	int scaling = cpu->pstate.scaling;
    934
    935	__intel_pstate_get_hwp_cap(cpu);
    936
    937	cpu->pstate.max_freq = cpu->pstate.max_pstate * scaling;
    938	cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * scaling;
    939	if (scaling != cpu->pstate.perf_ctl_scaling) {
    940		int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
    941
    942		cpu->pstate.max_freq = rounddown(cpu->pstate.max_freq,
    943						 perf_ctl_scaling);
    944		cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_freq,
    945						   perf_ctl_scaling);
    946	}
    947}
    948
    949static void intel_pstate_hwp_set(unsigned int cpu)
    950{
    951	struct cpudata *cpu_data = all_cpu_data[cpu];
    952	int max, min;
    953	u64 value;
    954	s16 epp;
    955
    956	max = cpu_data->max_perf_ratio;
    957	min = cpu_data->min_perf_ratio;
    958
    959	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
    960		min = max;
    961
    962	rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
    963
    964	value &= ~HWP_MIN_PERF(~0L);
    965	value |= HWP_MIN_PERF(min);
    966
    967	value &= ~HWP_MAX_PERF(~0L);
    968	value |= HWP_MAX_PERF(max);
    969
    970	if (cpu_data->epp_policy == cpu_data->policy)
    971		goto skip_epp;
    972
    973	cpu_data->epp_policy = cpu_data->policy;
    974
    975	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
    976		epp = intel_pstate_get_epp(cpu_data, value);
    977		cpu_data->epp_powersave = epp;
    978		/* If the EPP read failed, then don't try to write */
    979		if (epp < 0)
    980			goto skip_epp;
    981
    982		epp = 0;
    983	} else {
    984		/* Skip setting EPP when the saved value is invalid */
    985		if (cpu_data->epp_powersave < 0)
    986			goto skip_epp;
    987
    988		/*
    989		 * No need to restore EPP when the current value is not
    990		 * zero. This means one of the following:
    991		 *  - the policy has not changed
    992		 *  - the user has changed it manually
    993		 *  - there was an error reading the EPB
    994		 */
    995		epp = intel_pstate_get_epp(cpu_data, value);
    996		if (epp)
    997			goto skip_epp;
    998
    999		epp = cpu_data->epp_powersave;
   1000	}
   1001	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
   1002		value &= ~GENMASK_ULL(31, 24);
   1003		value |= (u64)epp << 24;
   1004	} else {
   1005		intel_pstate_set_epb(cpu, epp);
   1006	}
   1007skip_epp:
   1008	WRITE_ONCE(cpu_data->hwp_req_cached, value);
   1009	wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
   1010}
   1011
   1012static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata);
   1013
   1014static void intel_pstate_hwp_offline(struct cpudata *cpu)
   1015{
   1016	u64 value = READ_ONCE(cpu->hwp_req_cached);
   1017	int min_perf;
   1018
   1019	intel_pstate_disable_hwp_interrupt(cpu);
   1020
   1021	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
   1022		/*
   1023		 * In case the EPP has been set to "performance" by the
   1024		 * active mode "performance" scaling algorithm, replace that
   1025		 * temporary value with the cached EPP one.
   1026		 */
   1027		value &= ~GENMASK_ULL(31, 24);
   1028		value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached);
   1029		/*
   1030		 * However, make sure that EPP will be set to "performance" when
   1031		 * the CPU is brought back online again and the "performance"
   1032		 * scaling algorithm is still in effect.
   1033		 */
   1034		cpu->epp_policy = CPUFREQ_POLICY_UNKNOWN;
   1035	}
   1036
   1037	/*
   1038	 * Clear the desired perf field in the cached HWP request value to
   1039	 * prevent nonzero desired values from being leaked into the active
   1040	 * mode.
   1041	 */
   1042	value &= ~HWP_DESIRED_PERF(~0L);
   1043	WRITE_ONCE(cpu->hwp_req_cached, value);
   1044
   1045	value &= ~GENMASK_ULL(31, 0);
   1046	min_perf = HWP_LOWEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
   1047
   1048	/* Set hwp_max = hwp_min */
   1049	value |= HWP_MAX_PERF(min_perf);
   1050	value |= HWP_MIN_PERF(min_perf);
   1051
   1052	/* Set EPP to min */
   1053	if (boot_cpu_has(X86_FEATURE_HWP_EPP))
   1054		value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
   1055
   1056	wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
   1057}
   1058
   1059#define POWER_CTL_EE_ENABLE	1
   1060#define POWER_CTL_EE_DISABLE	2
   1061
   1062static int power_ctl_ee_state;
   1063
   1064static void set_power_ctl_ee_state(bool input)
   1065{
   1066	u64 power_ctl;
   1067
   1068	mutex_lock(&intel_pstate_driver_lock);
   1069	rdmsrl(MSR_IA32_POWER_CTL, power_ctl);
   1070	if (input) {
   1071		power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE);
   1072		power_ctl_ee_state = POWER_CTL_EE_ENABLE;
   1073	} else {
   1074		power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE);
   1075		power_ctl_ee_state = POWER_CTL_EE_DISABLE;
   1076	}
   1077	wrmsrl(MSR_IA32_POWER_CTL, power_ctl);
   1078	mutex_unlock(&intel_pstate_driver_lock);
   1079}
   1080
   1081static void intel_pstate_hwp_enable(struct cpudata *cpudata);
   1082
   1083static void intel_pstate_hwp_reenable(struct cpudata *cpu)
   1084{
   1085	intel_pstate_hwp_enable(cpu);
   1086	wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached));
   1087}
   1088
   1089static int intel_pstate_suspend(struct cpufreq_policy *policy)
   1090{
   1091	struct cpudata *cpu = all_cpu_data[policy->cpu];
   1092
   1093	pr_debug("CPU %d suspending\n", cpu->cpu);
   1094
   1095	cpu->suspended = true;
   1096
   1097	/* disable HWP interrupt and cancel any pending work */
   1098	intel_pstate_disable_hwp_interrupt(cpu);
   1099
   1100	return 0;
   1101}
   1102
   1103static int intel_pstate_resume(struct cpufreq_policy *policy)
   1104{
   1105	struct cpudata *cpu = all_cpu_data[policy->cpu];
   1106
   1107	pr_debug("CPU %d resuming\n", cpu->cpu);
   1108
   1109	/* Only restore if the system default is changed */
   1110	if (power_ctl_ee_state == POWER_CTL_EE_ENABLE)
   1111		set_power_ctl_ee_state(true);
   1112	else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE)
   1113		set_power_ctl_ee_state(false);
   1114
   1115	if (cpu->suspended && hwp_active) {
   1116		mutex_lock(&intel_pstate_limits_lock);
   1117
   1118		/* Re-enable HWP, because "online" has not done that. */
   1119		intel_pstate_hwp_reenable(cpu);
   1120
   1121		mutex_unlock(&intel_pstate_limits_lock);
   1122	}
   1123
   1124	cpu->suspended = false;
   1125
   1126	return 0;
   1127}
   1128
   1129static void intel_pstate_update_policies(void)
   1130{
   1131	int cpu;
   1132
   1133	for_each_possible_cpu(cpu)
   1134		cpufreq_update_policy(cpu);
   1135}
   1136
   1137static void __intel_pstate_update_max_freq(struct cpudata *cpudata,
   1138					   struct cpufreq_policy *policy)
   1139{
   1140	policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
   1141			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
   1142	refresh_frequency_limits(policy);
   1143}
   1144
   1145static void intel_pstate_update_max_freq(unsigned int cpu)
   1146{
   1147	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
   1148
   1149	if (!policy)
   1150		return;
   1151
   1152	__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
   1153
   1154	cpufreq_cpu_release(policy);
   1155}
   1156
   1157static void intel_pstate_update_limits(unsigned int cpu)
   1158{
   1159	mutex_lock(&intel_pstate_driver_lock);
   1160
   1161	update_turbo_state();
   1162	/*
   1163	 * If turbo has been turned on or off globally, policy limits for
   1164	 * all CPUs need to be updated to reflect that.
   1165	 */
   1166	if (global.turbo_disabled_mf != global.turbo_disabled) {
   1167		global.turbo_disabled_mf = global.turbo_disabled;
   1168		arch_set_max_freq_ratio(global.turbo_disabled);
   1169		for_each_possible_cpu(cpu)
   1170			intel_pstate_update_max_freq(cpu);
   1171	} else {
   1172		cpufreq_update_policy(cpu);
   1173	}
   1174
   1175	mutex_unlock(&intel_pstate_driver_lock);
   1176}
   1177
   1178/************************** sysfs begin ************************/
   1179#define show_one(file_name, object)					\
   1180	static ssize_t show_##file_name					\
   1181	(struct kobject *kobj, struct kobj_attribute *attr, char *buf)	\
   1182	{								\
   1183		return sprintf(buf, "%u\n", global.object);		\
   1184	}
   1185
   1186static ssize_t intel_pstate_show_status(char *buf);
   1187static int intel_pstate_update_status(const char *buf, size_t size);
   1188
   1189static ssize_t show_status(struct kobject *kobj,
   1190			   struct kobj_attribute *attr, char *buf)
   1191{
   1192	ssize_t ret;
   1193
   1194	mutex_lock(&intel_pstate_driver_lock);
   1195	ret = intel_pstate_show_status(buf);
   1196	mutex_unlock(&intel_pstate_driver_lock);
   1197
   1198	return ret;
   1199}
   1200
   1201static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
   1202			    const char *buf, size_t count)
   1203{
   1204	char *p = memchr(buf, '\n', count);
   1205	int ret;
   1206
   1207	mutex_lock(&intel_pstate_driver_lock);
   1208	ret = intel_pstate_update_status(buf, p ? p - buf : count);
   1209	mutex_unlock(&intel_pstate_driver_lock);
   1210
   1211	return ret < 0 ? ret : count;
   1212}
   1213
   1214static ssize_t show_turbo_pct(struct kobject *kobj,
   1215				struct kobj_attribute *attr, char *buf)
   1216{
   1217	struct cpudata *cpu;
   1218	int total, no_turbo, turbo_pct;
   1219	uint32_t turbo_fp;
   1220
   1221	mutex_lock(&intel_pstate_driver_lock);
   1222
   1223	if (!intel_pstate_driver) {
   1224		mutex_unlock(&intel_pstate_driver_lock);
   1225		return -EAGAIN;
   1226	}
   1227
   1228	cpu = all_cpu_data[0];
   1229
   1230	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
   1231	no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
   1232	turbo_fp = div_fp(no_turbo, total);
   1233	turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
   1234
   1235	mutex_unlock(&intel_pstate_driver_lock);
   1236
   1237	return sprintf(buf, "%u\n", turbo_pct);
   1238}
   1239
   1240static ssize_t show_num_pstates(struct kobject *kobj,
   1241				struct kobj_attribute *attr, char *buf)
   1242{
   1243	struct cpudata *cpu;
   1244	int total;
   1245
   1246	mutex_lock(&intel_pstate_driver_lock);
   1247
   1248	if (!intel_pstate_driver) {
   1249		mutex_unlock(&intel_pstate_driver_lock);
   1250		return -EAGAIN;
   1251	}
   1252
   1253	cpu = all_cpu_data[0];
   1254	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
   1255
   1256	mutex_unlock(&intel_pstate_driver_lock);
   1257
   1258	return sprintf(buf, "%u\n", total);
   1259}
   1260
   1261static ssize_t show_no_turbo(struct kobject *kobj,
   1262			     struct kobj_attribute *attr, char *buf)
   1263{
   1264	ssize_t ret;
   1265
   1266	mutex_lock(&intel_pstate_driver_lock);
   1267
   1268	if (!intel_pstate_driver) {
   1269		mutex_unlock(&intel_pstate_driver_lock);
   1270		return -EAGAIN;
   1271	}
   1272
   1273	update_turbo_state();
   1274	if (global.turbo_disabled)
   1275		ret = sprintf(buf, "%u\n", global.turbo_disabled);
   1276	else
   1277		ret = sprintf(buf, "%u\n", global.no_turbo);
   1278
   1279	mutex_unlock(&intel_pstate_driver_lock);
   1280
   1281	return ret;
   1282}
   1283
   1284static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
   1285			      const char *buf, size_t count)
   1286{
   1287	unsigned int input;
   1288	int ret;
   1289
   1290	ret = sscanf(buf, "%u", &input);
   1291	if (ret != 1)
   1292		return -EINVAL;
   1293
   1294	mutex_lock(&intel_pstate_driver_lock);
   1295
   1296	if (!intel_pstate_driver) {
   1297		mutex_unlock(&intel_pstate_driver_lock);
   1298		return -EAGAIN;
   1299	}
   1300
   1301	mutex_lock(&intel_pstate_limits_lock);
   1302
   1303	update_turbo_state();
   1304	if (global.turbo_disabled) {
   1305		pr_notice_once("Turbo disabled by BIOS or unavailable on processor\n");
   1306		mutex_unlock(&intel_pstate_limits_lock);
   1307		mutex_unlock(&intel_pstate_driver_lock);
   1308		return -EPERM;
   1309	}
   1310
   1311	global.no_turbo = clamp_t(int, input, 0, 1);
   1312
   1313	if (global.no_turbo) {
   1314		struct cpudata *cpu = all_cpu_data[0];
   1315		int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate;
   1316
   1317		/* Squash the global minimum into the permitted range. */
   1318		if (global.min_perf_pct > pct)
   1319			global.min_perf_pct = pct;
   1320	}
   1321
   1322	mutex_unlock(&intel_pstate_limits_lock);
   1323
   1324	intel_pstate_update_policies();
   1325	arch_set_max_freq_ratio(global.no_turbo);
   1326
   1327	mutex_unlock(&intel_pstate_driver_lock);
   1328
   1329	return count;
   1330}
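
/*
 * The global sysfs knobs handled in this file live under
 * /sys/devices/system/cpu/intel_pstate/ (created by
 * intel_pstate_sysfs_expose_params()), e.g.:
 *
 *	# echo 1  > /sys/devices/system/cpu/intel_pstate/no_turbo
 *	# echo 50 > /sys/devices/system/cpu/intel_pstate/min_perf_pct
 *
 * Writing no_turbo fails with -EPERM when turbo is disabled by the BIOS,
 * and these writes fail with -EAGAIN while no driver is registered.
 */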
   1331
   1332static void update_qos_request(enum freq_qos_req_type type)
   1333{
   1334	struct freq_qos_request *req;
   1335	struct cpufreq_policy *policy;
   1336	int i;
   1337
   1338	for_each_possible_cpu(i) {
   1339		struct cpudata *cpu = all_cpu_data[i];
   1340		unsigned int freq, perf_pct;
   1341
   1342		policy = cpufreq_cpu_get(i);
   1343		if (!policy)
   1344			continue;
   1345
   1346		req = policy->driver_data;
   1347		cpufreq_cpu_put(policy);
   1348
   1349		if (!req)
   1350			continue;
   1351
   1352		if (hwp_active)
   1353			intel_pstate_get_hwp_cap(cpu);
   1354
   1355		if (type == FREQ_QOS_MIN) {
   1356			perf_pct = global.min_perf_pct;
   1357		} else {
   1358			req++;
   1359			perf_pct = global.max_perf_pct;
   1360		}
   1361
   1362		freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * perf_pct, 100);
   1363
   1364		if (freq_qos_update_request(req, freq) < 0)
   1365			pr_warn("Failed to update freq constraint: CPU%d\n", i);
   1366	}
   1367}
   1368
   1369static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b,
   1370				  const char *buf, size_t count)
   1371{
   1372	unsigned int input;
   1373	int ret;
   1374
   1375	ret = sscanf(buf, "%u", &input);
   1376	if (ret != 1)
   1377		return -EINVAL;
   1378
   1379	mutex_lock(&intel_pstate_driver_lock);
   1380
   1381	if (!intel_pstate_driver) {
   1382		mutex_unlock(&intel_pstate_driver_lock);
   1383		return -EAGAIN;
   1384	}
   1385
   1386	mutex_lock(&intel_pstate_limits_lock);
   1387
   1388	global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100);
   1389
   1390	mutex_unlock(&intel_pstate_limits_lock);
   1391
   1392	if (intel_pstate_driver == &intel_pstate)
   1393		intel_pstate_update_policies();
   1394	else
   1395		update_qos_request(FREQ_QOS_MAX);
   1396
   1397	mutex_unlock(&intel_pstate_driver_lock);
   1398
   1399	return count;
   1400}
   1401
   1402static ssize_t store_min_perf_pct(struct kobject *a, struct kobj_attribute *b,
   1403				  const char *buf, size_t count)
   1404{
   1405	unsigned int input;
   1406	int ret;
   1407
   1408	ret = sscanf(buf, "%u", &input);
   1409	if (ret != 1)
   1410		return -EINVAL;
   1411
   1412	mutex_lock(&intel_pstate_driver_lock);
   1413
   1414	if (!intel_pstate_driver) {
   1415		mutex_unlock(&intel_pstate_driver_lock);
   1416		return -EAGAIN;
   1417	}
   1418
   1419	mutex_lock(&intel_pstate_limits_lock);
   1420
   1421	global.min_perf_pct = clamp_t(int, input,
   1422				      min_perf_pct_min(), global.max_perf_pct);
   1423
   1424	mutex_unlock(&intel_pstate_limits_lock);
   1425
   1426	if (intel_pstate_driver == &intel_pstate)
   1427		intel_pstate_update_policies();
   1428	else
   1429		update_qos_request(FREQ_QOS_MIN);
   1430
   1431	mutex_unlock(&intel_pstate_driver_lock);
   1432
   1433	return count;
   1434}
   1435
   1436static ssize_t show_hwp_dynamic_boost(struct kobject *kobj,
   1437				struct kobj_attribute *attr, char *buf)
   1438{
   1439	return sprintf(buf, "%u\n", hwp_boost);
   1440}
   1441
   1442static ssize_t store_hwp_dynamic_boost(struct kobject *a,
   1443				       struct kobj_attribute *b,
   1444				       const char *buf, size_t count)
   1445{
   1446	unsigned int input;
   1447	int ret;
   1448
   1449	ret = kstrtouint(buf, 10, &input);
   1450	if (ret)
   1451		return ret;
   1452
   1453	mutex_lock(&intel_pstate_driver_lock);
   1454	hwp_boost = !!input;
   1455	intel_pstate_update_policies();
   1456	mutex_unlock(&intel_pstate_driver_lock);
   1457
   1458	return count;
   1459}
   1460
   1461static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr,
   1462				      char *buf)
   1463{
   1464	u64 power_ctl;
   1465	int enable;
   1466
   1467	rdmsrl(MSR_IA32_POWER_CTL, power_ctl);
   1468	enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE));
   1469	return sprintf(buf, "%d\n", !enable);
   1470}
   1471
   1472static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b,
   1473				       const char *buf, size_t count)
   1474{
   1475	bool input;
   1476	int ret;
   1477
   1478	ret = kstrtobool(buf, &input);
   1479	if (ret)
   1480		return ret;
   1481
   1482	set_power_ctl_ee_state(input);
   1483
   1484	return count;
   1485}
   1486
   1487show_one(max_perf_pct, max_perf_pct);
   1488show_one(min_perf_pct, min_perf_pct);
   1489
   1490define_one_global_rw(status);
   1491define_one_global_rw(no_turbo);
   1492define_one_global_rw(max_perf_pct);
   1493define_one_global_rw(min_perf_pct);
   1494define_one_global_ro(turbo_pct);
   1495define_one_global_ro(num_pstates);
   1496define_one_global_rw(hwp_dynamic_boost);
   1497define_one_global_rw(energy_efficiency);
   1498
   1499static struct attribute *intel_pstate_attributes[] = {
   1500	&status.attr,
   1501	&no_turbo.attr,
   1502	NULL
   1503};
   1504
   1505static const struct attribute_group intel_pstate_attr_group = {
   1506	.attrs = intel_pstate_attributes,
   1507};
   1508
   1509static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[];
   1510
   1511static struct kobject *intel_pstate_kobject;
   1512
   1513static void __init intel_pstate_sysfs_expose_params(void)
   1514{
   1515	int rc;
   1516
   1517	intel_pstate_kobject = kobject_create_and_add("intel_pstate",
   1518						&cpu_subsys.dev_root->kobj);
   1519	if (WARN_ON(!intel_pstate_kobject))
   1520		return;
   1521
   1522	rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
   1523	if (WARN_ON(rc))
   1524		return;
   1525
   1526	if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) {
   1527		rc = sysfs_create_file(intel_pstate_kobject, &turbo_pct.attr);
   1528		WARN_ON(rc);
   1529
   1530		rc = sysfs_create_file(intel_pstate_kobject, &num_pstates.attr);
   1531		WARN_ON(rc);
   1532	}
   1533
   1534	/*
   1535	 * If per cpu limits are enforced there are no global limits, so
   1536	 * return without creating max/min_perf_pct attributes
   1537	 */
   1538	if (per_cpu_limits)
   1539		return;
   1540
   1541	rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
   1542	WARN_ON(rc);
   1543
   1544	rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
   1545	WARN_ON(rc);
   1546
   1547	if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) {
   1548		rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr);
   1549		WARN_ON(rc);
   1550	}
   1551}
   1552
   1553static void __init intel_pstate_sysfs_remove(void)
   1554{
   1555	if (!intel_pstate_kobject)
   1556		return;
   1557
   1558	sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group);
   1559
   1560	if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU)) {
   1561		sysfs_remove_file(intel_pstate_kobject, &num_pstates.attr);
   1562		sysfs_remove_file(intel_pstate_kobject, &turbo_pct.attr);
   1563	}
   1564
   1565	if (!per_cpu_limits) {
   1566		sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr);
   1567		sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr);
   1568
   1569		if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids))
   1570			sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr);
   1571	}
   1572
   1573	kobject_put(intel_pstate_kobject);
   1574}
   1575
   1576static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void)
   1577{
   1578	int rc;
   1579
   1580	if (!hwp_active)
   1581		return;
   1582
   1583	rc = sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr);
   1584	WARN_ON_ONCE(rc);
   1585}
   1586
   1587static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void)
   1588{
   1589	if (!hwp_active)
   1590		return;
   1591
   1592	sysfs_remove_file(intel_pstate_kobject, &hwp_dynamic_boost.attr);
   1593}
   1594
   1595/************************** sysfs end ************************/
   1596
   1597static void intel_pstate_notify_work(struct work_struct *work)
   1598{
   1599	struct cpudata *cpudata =
   1600		container_of(to_delayed_work(work), struct cpudata, hwp_notify_work);
   1601	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu);
   1602
   1603	if (policy) {
   1604		intel_pstate_get_hwp_cap(cpudata);
   1605		__intel_pstate_update_max_freq(cpudata, policy);
   1606
   1607		cpufreq_cpu_release(policy);
   1608	}
   1609
   1610	wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
   1611}
   1612
   1613static DEFINE_SPINLOCK(hwp_notify_lock);
   1614static cpumask_t hwp_intr_enable_mask;
   1615
   1616void notify_hwp_interrupt(void)
   1617{
   1618	unsigned int this_cpu = smp_processor_id();
   1619	struct cpudata *cpudata;
   1620	unsigned long flags;
   1621	u64 value;
   1622
   1623	if (!READ_ONCE(hwp_active) || !boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
   1624		return;
   1625
   1626	rdmsrl_safe(MSR_HWP_STATUS, &value);
   1627	if (!(value & 0x01))
   1628		return;
   1629
   1630	spin_lock_irqsave(&hwp_notify_lock, flags);
   1631
   1632	if (!cpumask_test_cpu(this_cpu, &hwp_intr_enable_mask))
   1633		goto ack_intr;
   1634
   1635	/*
   1636	 * Currently all_cpu_data is never freed and we can't reach here
   1637	 * without it having been allocated, but keep the check for safety
   1638	 * against future changes.
   1639	 */
   1640	if (unlikely(!READ_ONCE(all_cpu_data)))
   1641		goto ack_intr;
   1642
   1643	/*
   1644	 * The per-CPU data is freed during cleanup when cpufreq registration
   1645	 * fails. We wouldn't be here if that happened on init or on a status
   1646	 * switch, but keep the check for future changes.
   1647	 */
   1648	cpudata = READ_ONCE(all_cpu_data[this_cpu]);
   1649	if (unlikely(!cpudata))
   1650		goto ack_intr;
   1651
   1652	schedule_delayed_work(&cpudata->hwp_notify_work, msecs_to_jiffies(10));
   1653
   1654	spin_unlock_irqrestore(&hwp_notify_lock, flags);
   1655
   1656	return;
   1657
   1658ack_intr:
   1659	wrmsrl_safe(MSR_HWP_STATUS, 0);
   1660	spin_unlock_irqrestore(&hwp_notify_lock, flags);
   1661}
   1662
   1663static void intel_pstate_disable_hwp_interrupt(struct cpudata *cpudata)
   1664{
   1665	unsigned long flags;
   1666
   1667	if (!boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
   1668		return;
   1669
   1670	/* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */
   1671	wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
   1672
   1673	spin_lock_irqsave(&hwp_notify_lock, flags);
   1674	if (cpumask_test_and_clear_cpu(cpudata->cpu, &hwp_intr_enable_mask))
   1675		cancel_delayed_work(&cpudata->hwp_notify_work);
   1676	spin_unlock_irqrestore(&hwp_notify_lock, flags);
   1677}
   1678
   1679static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
   1680{
   1681	/* Enable HWP notification interrupt for guaranteed performance change */
   1682	if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) {
   1683		unsigned long flags;
   1684
   1685		spin_lock_irqsave(&hwp_notify_lock, flags);
   1686		INIT_DELAYED_WORK(&cpudata->hwp_notify_work, intel_pstate_notify_work);
   1687		cpumask_set_cpu(cpudata->cpu, &hwp_intr_enable_mask);
   1688		spin_unlock_irqrestore(&hwp_notify_lock, flags);
   1689
   1690		/* wrmsrl_on_cpu has to be outside spinlock as this can result in IPC */
   1691		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x01);
   1692		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
   1693	}
   1694}
   1695
   1696static void intel_pstate_update_epp_defaults(struct cpudata *cpudata)
   1697{
   1698	cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
   1699
   1700	/*
   1701	 * If this CPU generation doesn't call for a change in the
   1702	 * balance_perf EPP, return.
   1703	 */
   1704	if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE)
   1705		return;
   1706
   1707	/*
    1708	 * If the power-up EPP is something other than the chipset default of 0x80 and
    1709	 * - is more performance oriented than 0x80 (the default balance_perf EPP)
    1710	 * - but less performance oriented than the performance EPP (0),
    1711	 *   then use it as the new balance_perf EPP.
   1712	 */
   1713	if (cpudata->epp_default < HWP_EPP_BALANCE_PERFORMANCE &&
   1714	    cpudata->epp_default > HWP_EPP_PERFORMANCE) {
   1715		epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = cpudata->epp_default;
   1716		return;
   1717	}
   1718
   1719	/*
    1720	 * Otherwise, use the hard-coded per-generation value to update both
    1721	 * the balance_perf and the default EPP.
   1722	 */
   1723	cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE];
   1724	intel_pstate_set_epp(cpudata, cpudata->epp_default);
   1725}
   1726
   1727static void intel_pstate_hwp_enable(struct cpudata *cpudata)
   1728{
    1729	/* First disable the HWP notification interrupt until it is re-enabled below */
   1730	if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY))
   1731		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
   1732
   1733	wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
   1734
   1735	intel_pstate_enable_hwp_interrupt(cpudata);
   1736
   1737	if (cpudata->epp_default >= 0)
   1738		return;
   1739
   1740	intel_pstate_update_epp_defaults(cpudata);
   1741}
   1742
   1743static int atom_get_min_pstate(void)
   1744{
   1745	u64 value;
   1746
   1747	rdmsrl(MSR_ATOM_CORE_RATIOS, value);
   1748	return (value >> 8) & 0x7F;
   1749}
   1750
   1751static int atom_get_max_pstate(void)
   1752{
   1753	u64 value;
   1754
   1755	rdmsrl(MSR_ATOM_CORE_RATIOS, value);
   1756	return (value >> 16) & 0x7F;
   1757}
   1758
   1759static int atom_get_turbo_pstate(void)
   1760{
   1761	u64 value;
   1762
   1763	rdmsrl(MSR_ATOM_CORE_TURBO_RATIOS, value);
   1764	return value & 0x7F;
   1765}
   1766
   1767static u64 atom_get_val(struct cpudata *cpudata, int pstate)
   1768{
   1769	u64 val;
   1770	int32_t vid_fp;
   1771	u32 vid;
   1772
   1773	val = (u64)pstate << 8;
   1774	if (global.no_turbo && !global.turbo_disabled)
   1775		val |= (u64)1 << 32;
   1776
   1777	vid_fp = cpudata->vid.min + mul_fp(
   1778		int_tofp(pstate - cpudata->pstate.min_pstate),
   1779		cpudata->vid.ratio);
   1780
   1781	vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
   1782	vid = ceiling_fp(vid_fp);
   1783
   1784	if (pstate > cpudata->pstate.max_pstate)
   1785		vid = cpudata->vid.turbo;
   1786
   1787	return val | vid;
   1788}
   1789
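        /*
         * The MSR_FSB_FREQ tables below give the bus clock (BCLK) in kHz; a
         * P-state's frequency in kHz is its ratio multiplied by this scaling
         * value.
         */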
   1790static int silvermont_get_scaling(void)
   1791{
   1792	u64 value;
   1793	int i;
   1794	/* Defined in Table 35-6 from SDM (Sept 2015) */
   1795	static int silvermont_freq_table[] = {
   1796		83300, 100000, 133300, 116700, 80000};
   1797
   1798	rdmsrl(MSR_FSB_FREQ, value);
   1799	i = value & 0x7;
   1800	WARN_ON(i > 4);
   1801
   1802	return silvermont_freq_table[i];
   1803}
   1804
   1805static int airmont_get_scaling(void)
   1806{
   1807	u64 value;
   1808	int i;
   1809	/* Defined in Table 35-10 from SDM (Sept 2015) */
   1810	static int airmont_freq_table[] = {
   1811		83300, 100000, 133300, 116700, 80000,
   1812		93300, 90000, 88900, 87500};
   1813
   1814	rdmsrl(MSR_FSB_FREQ, value);
   1815	i = value & 0xF;
   1816	WARN_ON(i > 8);
   1817
   1818	return airmont_freq_table[i];
   1819}
   1820
   1821static void atom_get_vid(struct cpudata *cpudata)
   1822{
   1823	u64 value;
   1824
   1825	rdmsrl(MSR_ATOM_CORE_VIDS, value);
   1826	cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
   1827	cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
   1828	cpudata->vid.ratio = div_fp(
   1829		cpudata->vid.max - cpudata->vid.min,
   1830		int_tofp(cpudata->pstate.max_pstate -
   1831			cpudata->pstate.min_pstate));
   1832
   1833	rdmsrl(MSR_ATOM_CORE_TURBO_VIDS, value);
   1834	cpudata->vid.turbo = value & 0x7f;
   1835}
   1836
   1837static int core_get_min_pstate(void)
   1838{
   1839	u64 value;
   1840
   1841	rdmsrl(MSR_PLATFORM_INFO, value);
   1842	return (value >> 40) & 0xFF;
   1843}
   1844
   1845static int core_get_max_pstate_physical(void)
   1846{
   1847	u64 value;
   1848
   1849	rdmsrl(MSR_PLATFORM_INFO, value);
   1850	return (value >> 8) & 0xFF;
   1851}
   1852
   1853static int core_get_tdp_ratio(u64 plat_info)
   1854{
    1855	/* Check how many TDP levels are present */
   1856	if (plat_info & 0x600000000) {
   1857		u64 tdp_ctrl;
   1858		u64 tdp_ratio;
   1859		int tdp_msr;
   1860		int err;
   1861
   1862		/* Get the TDP level (0, 1, 2) to get ratios */
   1863		err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
   1864		if (err)
   1865			return err;
   1866
    1867		/* The TDP MSRs are contiguous, starting at 0x648 */
   1868		tdp_msr = MSR_CONFIG_TDP_NOMINAL + (tdp_ctrl & 0x03);
   1869		err = rdmsrl_safe(tdp_msr, &tdp_ratio);
   1870		if (err)
   1871			return err;
   1872
    1873		/* For levels 1 and 2, bits [23:16] contain the ratio */
   1874		if (tdp_ctrl & 0x03)
   1875			tdp_ratio >>= 16;
   1876
   1877		tdp_ratio &= 0xff; /* ratios are only 8 bits long */
   1878		pr_debug("tdp_ratio %x\n", (int)tdp_ratio);
   1879
   1880		return (int)tdp_ratio;
   1881	}
   1882
   1883	return -ENXIO;
   1884}
   1885
   1886static int core_get_max_pstate(void)
   1887{
   1888	u64 tar;
   1889	u64 plat_info;
   1890	int max_pstate;
   1891	int tdp_ratio;
   1892	int err;
   1893
   1894	rdmsrl(MSR_PLATFORM_INFO, plat_info);
   1895	max_pstate = (plat_info >> 8) & 0xFF;
   1896
   1897	tdp_ratio = core_get_tdp_ratio(plat_info);
   1898	if (tdp_ratio <= 0)
   1899		return max_pstate;
   1900
   1901	if (hwp_active) {
   1902		/* Turbo activation ratio is not used on HWP platforms */
   1903		return tdp_ratio;
   1904	}
   1905
   1906	err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar);
   1907	if (!err) {
   1908		int tar_levels;
   1909
   1910		/* Do some sanity checking for safety */
   1911		tar_levels = tar & 0xff;
   1912		if (tdp_ratio - 1 == tar_levels) {
   1913			max_pstate = tar_levels;
   1914			pr_debug("max_pstate=TAC %x\n", max_pstate);
   1915		}
   1916	}
   1917
   1918	return max_pstate;
   1919}
   1920
   1921static int core_get_turbo_pstate(void)
   1922{
   1923	u64 value;
   1924	int nont, ret;
   1925
   1926	rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
   1927	nont = core_get_max_pstate();
   1928	ret = (value) & 255;
   1929	if (ret <= nont)
   1930		ret = nont;
   1931	return ret;
   1932}
   1933
   1934static inline int core_get_scaling(void)
   1935{
   1936	return 100000;
   1937}
   1938
   1939static u64 core_get_val(struct cpudata *cpudata, int pstate)
   1940{
   1941	u64 val;
   1942
   1943	val = (u64)pstate << 8;
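        	/*
        	 * Bit 32 of IA32_PERF_CTL is the turbo disengage bit: set it when
        	 * turbo has been disabled via no_turbo but is not already disabled
        	 * by the platform.
        	 */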
   1944	if (global.no_turbo && !global.turbo_disabled)
   1945		val |= (u64)1 << 32;
   1946
   1947	return val;
   1948}
   1949
   1950static int knl_get_aperf_mperf_shift(void)
   1951{
   1952	return 10;
   1953}
   1954
   1955static int knl_get_turbo_pstate(void)
   1956{
   1957	u64 value;
   1958	int nont, ret;
   1959
   1960	rdmsrl(MSR_TURBO_RATIO_LIMIT, value);
   1961	nont = core_get_max_pstate();
   1962	ret = (((value) >> 8) & 0xFF);
   1963	if (ret <= nont)
   1964		ret = nont;
   1965	return ret;
   1966}
   1967
   1968#ifdef CONFIG_ACPI_CPPC_LIB
   1969static u32 hybrid_ref_perf;
   1970
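        /*
         * On hybrid processors, derive a per-CPU frequency scaling factor from
         * the ratio of the reference (lowest) CPPC nominal performance to this
         * CPU's nominal performance, so that CPUs of different core types
         * report consistent frequencies.
         */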
   1971static int hybrid_get_cpu_scaling(int cpu)
   1972{
   1973	return DIV_ROUND_UP(core_get_scaling() * hybrid_ref_perf,
   1974			    intel_pstate_cppc_nominal(cpu));
   1975}
   1976
   1977static void intel_pstate_cppc_set_cpu_scaling(void)
   1978{
   1979	u32 min_nominal_perf = U32_MAX;
   1980	int cpu;
   1981
   1982	for_each_present_cpu(cpu) {
   1983		u32 nominal_perf = intel_pstate_cppc_nominal(cpu);
   1984
   1985		if (nominal_perf && nominal_perf < min_nominal_perf)
   1986			min_nominal_perf = nominal_perf;
   1987	}
   1988
   1989	if (min_nominal_perf < U32_MAX) {
   1990		hybrid_ref_perf = min_nominal_perf;
   1991		pstate_funcs.get_cpu_scaling = hybrid_get_cpu_scaling;
   1992	}
   1993}
   1994#else
   1995static inline void intel_pstate_cppc_set_cpu_scaling(void)
   1996{
   1997}
   1998#endif /* CONFIG_ACPI_CPPC_LIB */
   1999
   2000static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
   2001{
   2002	trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
   2003	cpu->pstate.current_pstate = pstate;
   2004	/*
   2005	 * Generally, there is no guarantee that this code will always run on
   2006	 * the CPU being updated, so force the register update to run on the
   2007	 * right CPU.
   2008	 */
   2009	wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
   2010		      pstate_funcs.get_val(cpu, pstate));
   2011}
   2012
   2013static void intel_pstate_set_min_pstate(struct cpudata *cpu)
   2014{
   2015	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
   2016}
   2017
   2018static void intel_pstate_max_within_limits(struct cpudata *cpu)
   2019{
   2020	int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio);
   2021
   2022	update_turbo_state();
   2023	intel_pstate_set_pstate(cpu, pstate);
   2024}
   2025
   2026static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
   2027{
   2028	int perf_ctl_max_phys = pstate_funcs.get_max_physical();
   2029	int perf_ctl_scaling = pstate_funcs.get_scaling();
   2030
   2031	cpu->pstate.min_pstate = pstate_funcs.get_min();
   2032	cpu->pstate.max_pstate_physical = perf_ctl_max_phys;
   2033	cpu->pstate.perf_ctl_scaling = perf_ctl_scaling;
   2034
   2035	if (hwp_active && !hwp_mode_bdw) {
   2036		__intel_pstate_get_hwp_cap(cpu);
   2037
   2038		if (pstate_funcs.get_cpu_scaling) {
   2039			cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
   2040			if (cpu->pstate.scaling != perf_ctl_scaling)
   2041				intel_pstate_hybrid_hwp_adjust(cpu);
   2042		} else {
   2043			cpu->pstate.scaling = perf_ctl_scaling;
   2044		}
   2045	} else {
   2046		cpu->pstate.scaling = perf_ctl_scaling;
   2047		cpu->pstate.max_pstate = pstate_funcs.get_max();
   2048		cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
   2049	}
   2050
   2051	if (cpu->pstate.scaling == perf_ctl_scaling) {
   2052		cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
   2053		cpu->pstate.max_freq = cpu->pstate.max_pstate * perf_ctl_scaling;
   2054		cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * perf_ctl_scaling;
   2055	}
   2056
   2057	if (pstate_funcs.get_aperf_mperf_shift)
   2058		cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();
   2059
   2060	if (pstate_funcs.get_vid)
   2061		pstate_funcs.get_vid(cpu);
   2062
   2063	intel_pstate_set_min_pstate(cpu);
   2064}
   2065
   2066/*
    2067 * A long hold time keeps the high performance limits in place for a
    2068 * long time, which negatively impacts perf/watt for some workloads,
    2069 * like SPECpower. 3 ms is based on experiments with some
    2070 * workloads.
   2071 */
   2072static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
   2073
   2074static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
   2075{
   2076	u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
   2077	u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
   2078	u32 max_limit = (hwp_req & 0xff00) >> 8;
   2079	u32 min_limit = (hwp_req & 0xff);
   2080	u32 boost_level1;
   2081
   2082	/*
   2083	 * Cases to consider (User changes via sysfs or boot time):
   2084	 * If, P0 (Turbo max) = P1 (Guaranteed max) = min:
   2085	 *	No boost, return.
   2086	 * If, P0 (Turbo max) > P1 (Guaranteed max) = min:
   2087	 *     Should result in one level boost only for P0.
   2088	 * If, P0 (Turbo max) = P1 (Guaranteed max) > min:
   2089	 *     Should result in two level boost:
   2090	 *         (min + p1)/2 and P1.
   2091	 * If, P0 (Turbo max) > P1 (Guaranteed max) > min:
   2092	 *     Should result in three level boost:
   2093	 *        (min + p1)/2, P1 and P0.
   2094	 */
   2095
   2096	/* If max and min are equal or already at max, nothing to boost */
   2097	if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit)
   2098		return;
   2099
   2100	if (!cpu->hwp_boost_min)
   2101		cpu->hwp_boost_min = min_limit;
   2102
    2103	/* Boost level 1 is at the halfway mark between min and guaranteed */
   2104	boost_level1 = (HWP_GUARANTEED_PERF(hwp_cap) + min_limit) >> 1;
   2105
   2106	if (cpu->hwp_boost_min < boost_level1)
   2107		cpu->hwp_boost_min = boost_level1;
   2108	else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(hwp_cap))
   2109		cpu->hwp_boost_min = HWP_GUARANTEED_PERF(hwp_cap);
   2110	else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(hwp_cap) &&
   2111		 max_limit != HWP_GUARANTEED_PERF(hwp_cap))
   2112		cpu->hwp_boost_min = max_limit;
   2113	else
   2114		return;
   2115
   2116	hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min;
   2117	wrmsrl(MSR_HWP_REQUEST, hwp_req);
   2118	cpu->last_update = cpu->sample.time;
   2119}
   2120
   2121static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu)
   2122{
   2123	if (cpu->hwp_boost_min) {
   2124		bool expired;
   2125
    2126		/* Check whether we have been idle for the hold time before boosting down */
   2127		expired = time_after64(cpu->sample.time, cpu->last_update +
   2128				       hwp_boost_hold_time_ns);
   2129		if (expired) {
   2130			wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached);
   2131			cpu->hwp_boost_min = 0;
   2132		}
   2133	}
   2134	cpu->last_update = cpu->sample.time;
   2135}
   2136
   2137static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
   2138						      u64 time)
   2139{
   2140	cpu->sample.time = time;
   2141
   2142	if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
   2143		bool do_io = false;
   2144
   2145		cpu->sched_flags = 0;
   2146		/*
    2147		 * Set the iowait_boost flag and update the time. Since the IOWAIT
    2148		 * flag is set all the time, a single occurrence is not enough to
    2149		 * conclude that some IO-bound activity is scheduled on this CPU.
    2150		 * Only if at least two occurrences are seen within two consecutive
    2151		 * ticks is the CPU treated as a boost candidate.
   2152		 */
   2153		if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
   2154			do_io = true;
   2155
   2156		cpu->last_io_update = time;
   2157
   2158		if (do_io)
   2159			intel_pstate_hwp_boost_up(cpu);
   2160
   2161	} else {
   2162		intel_pstate_hwp_boost_down(cpu);
   2163	}
   2164}
   2165
   2166static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
   2167						u64 time, unsigned int flags)
   2168{
   2169	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
   2170
   2171	cpu->sched_flags |= flags;
   2172
   2173	if (smp_processor_id() == cpu->cpu)
   2174		intel_pstate_update_util_hwp_local(cpu, time);
   2175}
   2176
   2177static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
   2178{
   2179	struct sample *sample = &cpu->sample;
   2180
   2181	sample->core_avg_perf = div_ext_fp(sample->aperf, sample->mperf);
   2182}
   2183
   2184static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
   2185{
   2186	u64 aperf, mperf;
   2187	unsigned long flags;
   2188	u64 tsc;
   2189
   2190	local_irq_save(flags);
   2191	rdmsrl(MSR_IA32_APERF, aperf);
   2192	rdmsrl(MSR_IA32_MPERF, mperf);
   2193	tsc = rdtsc();
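        	/* Bail out on a zero mperf or tsc delta so the consumers of this sample never divide by zero. */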
   2194	if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
   2195		local_irq_restore(flags);
   2196		return false;
   2197	}
   2198	local_irq_restore(flags);
   2199
   2200	cpu->last_sample_time = cpu->sample.time;
   2201	cpu->sample.time = time;
   2202	cpu->sample.aperf = aperf;
   2203	cpu->sample.mperf = mperf;
   2204	cpu->sample.tsc =  tsc;
   2205	cpu->sample.aperf -= cpu->prev_aperf;
   2206	cpu->sample.mperf -= cpu->prev_mperf;
   2207	cpu->sample.tsc -= cpu->prev_tsc;
   2208
   2209	cpu->prev_aperf = aperf;
   2210	cpu->prev_mperf = mperf;
   2211	cpu->prev_tsc = tsc;
   2212	/*
    2213	 * The first time this function is invoked in a given cycle, all of the
    2214	 * previous sample data fields are zero or stale and must be populated
    2215	 * with meaningful numbers for things to work.  Therefore, assume that
    2216	 * sample.time is always reset before the utilization update hook is set
    2217	 * and make the caller skip the sample in that case.
   2218	 */
   2219	if (cpu->last_sample_time) {
   2220		intel_pstate_calc_avg_perf(cpu);
   2221		return true;
   2222	}
   2223	return false;
   2224}
   2225
   2226static inline int32_t get_avg_frequency(struct cpudata *cpu)
   2227{
   2228	return mul_ext_fp(cpu->sample.core_avg_perf, cpu_khz);
   2229}
   2230
   2231static inline int32_t get_avg_pstate(struct cpudata *cpu)
   2232{
   2233	return mul_ext_fp(cpu->pstate.max_pstate_physical,
   2234			  cpu->sample.core_avg_perf);
   2235}
   2236
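        /*
         * Map the measured busy fraction (MPERF delta over TSC delta) to a
         * target P-state scaled against the turbo (or max non-turbo) ratio
         * with 25% headroom.  As an illustrative example, turbo_pstate == 40
         * and busy_frac == 0.5 would give (40 + 40/4) * 0.5 == 25 before the
         * averaging and clamping below.
         */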
   2237static inline int32_t get_target_pstate(struct cpudata *cpu)
   2238{
   2239	struct sample *sample = &cpu->sample;
   2240	int32_t busy_frac;
   2241	int target, avg_pstate;
   2242
   2243	busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
   2244			   sample->tsc);
   2245
   2246	if (busy_frac < cpu->iowait_boost)
   2247		busy_frac = cpu->iowait_boost;
   2248
   2249	sample->busy_scaled = busy_frac * 100;
   2250
   2251	target = global.no_turbo || global.turbo_disabled ?
   2252			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
   2253	target += target >> 2;
   2254	target = mul_fp(target, busy_frac);
   2255	if (target < cpu->pstate.min_pstate)
   2256		target = cpu->pstate.min_pstate;
   2257
   2258	/*
   2259	 * If the average P-state during the previous cycle was higher than the
   2260	 * current target, add 50% of the difference to the target to reduce
   2261	 * possible performance oscillations and offset possible performance
   2262	 * loss related to moving the workload from one CPU to another within
   2263	 * a package/module.
   2264	 */
   2265	avg_pstate = get_avg_pstate(cpu);
   2266	if (avg_pstate > target)
   2267		target += (avg_pstate - target) >> 1;
   2268
   2269	return target;
   2270}
   2271
   2272static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
   2273{
   2274	int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio);
   2275	int max_pstate = max(min_pstate, cpu->max_perf_ratio);
   2276
   2277	return clamp_t(int, pstate, min_pstate, max_pstate);
   2278}
   2279
   2280static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
   2281{
   2282	if (pstate == cpu->pstate.current_pstate)
   2283		return;
   2284
   2285	cpu->pstate.current_pstate = pstate;
   2286	wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
   2287}
   2288
   2289static void intel_pstate_adjust_pstate(struct cpudata *cpu)
   2290{
   2291	int from = cpu->pstate.current_pstate;
   2292	struct sample *sample;
   2293	int target_pstate;
   2294
   2295	update_turbo_state();
   2296
   2297	target_pstate = get_target_pstate(cpu);
   2298	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
   2299	trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu);
   2300	intel_pstate_update_pstate(cpu, target_pstate);
   2301
   2302	sample = &cpu->sample;
   2303	trace_pstate_sample(mul_ext_fp(100, sample->core_avg_perf),
   2304		fp_toint(sample->busy_scaled),
   2305		from,
   2306		cpu->pstate.current_pstate,
   2307		sample->mperf,
   2308		sample->aperf,
   2309		sample->tsc,
   2310		get_avg_frequency(cpu),
   2311		fp_toint(cpu->iowait_boost * 100));
   2312}
   2313
   2314static void intel_pstate_update_util(struct update_util_data *data, u64 time,
   2315				     unsigned int flags)
   2316{
   2317	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
   2318	u64 delta_ns;
   2319
   2320	/* Don't allow remote callbacks */
   2321	if (smp_processor_id() != cpu->cpu)
   2322		return;
   2323
   2324	delta_ns = time - cpu->last_update;
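        	/*
        	 * The iowait boost starts at 1/8 of full scale and doubles on each
        	 * iowait wakeup seen within a tick, up to full scale; on other
        	 * updates it decays by half, and it is cleared (or restarted) if
        	 * the CPU appears to have been idle for more than a tick.
        	 */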
   2325	if (flags & SCHED_CPUFREQ_IOWAIT) {
   2326		/* Start over if the CPU may have been idle. */
   2327		if (delta_ns > TICK_NSEC) {
   2328			cpu->iowait_boost = ONE_EIGHTH_FP;
   2329		} else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
   2330			cpu->iowait_boost <<= 1;
   2331			if (cpu->iowait_boost > int_tofp(1))
   2332				cpu->iowait_boost = int_tofp(1);
   2333		} else {
   2334			cpu->iowait_boost = ONE_EIGHTH_FP;
   2335		}
   2336	} else if (cpu->iowait_boost) {
   2337		/* Clear iowait_boost if the CPU may have been idle. */
   2338		if (delta_ns > TICK_NSEC)
   2339			cpu->iowait_boost = 0;
   2340		else
   2341			cpu->iowait_boost >>= 1;
   2342	}
   2343	cpu->last_update = time;
   2344	delta_ns = time - cpu->sample.time;
   2345	if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
   2346		return;
   2347
   2348	if (intel_pstate_sample(cpu, time))
   2349		intel_pstate_adjust_pstate(cpu);
   2350}
   2351
   2352static struct pstate_funcs core_funcs = {
   2353	.get_max = core_get_max_pstate,
   2354	.get_max_physical = core_get_max_pstate_physical,
   2355	.get_min = core_get_min_pstate,
   2356	.get_turbo = core_get_turbo_pstate,
   2357	.get_scaling = core_get_scaling,
   2358	.get_val = core_get_val,
   2359};
   2360
   2361static const struct pstate_funcs silvermont_funcs = {
   2362	.get_max = atom_get_max_pstate,
   2363	.get_max_physical = atom_get_max_pstate,
   2364	.get_min = atom_get_min_pstate,
   2365	.get_turbo = atom_get_turbo_pstate,
   2366	.get_val = atom_get_val,
   2367	.get_scaling = silvermont_get_scaling,
   2368	.get_vid = atom_get_vid,
   2369};
   2370
   2371static const struct pstate_funcs airmont_funcs = {
   2372	.get_max = atom_get_max_pstate,
   2373	.get_max_physical = atom_get_max_pstate,
   2374	.get_min = atom_get_min_pstate,
   2375	.get_turbo = atom_get_turbo_pstate,
   2376	.get_val = atom_get_val,
   2377	.get_scaling = airmont_get_scaling,
   2378	.get_vid = atom_get_vid,
   2379};
   2380
   2381static const struct pstate_funcs knl_funcs = {
   2382	.get_max = core_get_max_pstate,
   2383	.get_max_physical = core_get_max_pstate_physical,
   2384	.get_min = core_get_min_pstate,
   2385	.get_turbo = knl_get_turbo_pstate,
   2386	.get_aperf_mperf_shift = knl_get_aperf_mperf_shift,
   2387	.get_scaling = core_get_scaling,
   2388	.get_val = core_get_val,
   2389};
   2390
   2391#define X86_MATCH(model, policy)					 \
   2392	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
   2393					   X86_FEATURE_APERFMPERF, &policy)
   2394
   2395static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
   2396	X86_MATCH(SANDYBRIDGE,		core_funcs),
   2397	X86_MATCH(SANDYBRIDGE_X,	core_funcs),
   2398	X86_MATCH(ATOM_SILVERMONT,	silvermont_funcs),
   2399	X86_MATCH(IVYBRIDGE,		core_funcs),
   2400	X86_MATCH(HASWELL,		core_funcs),
   2401	X86_MATCH(BROADWELL,		core_funcs),
   2402	X86_MATCH(IVYBRIDGE_X,		core_funcs),
   2403	X86_MATCH(HASWELL_X,		core_funcs),
   2404	X86_MATCH(HASWELL_L,		core_funcs),
   2405	X86_MATCH(HASWELL_G,		core_funcs),
   2406	X86_MATCH(BROADWELL_G,		core_funcs),
   2407	X86_MATCH(ATOM_AIRMONT,		airmont_funcs),
   2408	X86_MATCH(SKYLAKE_L,		core_funcs),
   2409	X86_MATCH(BROADWELL_X,		core_funcs),
   2410	X86_MATCH(SKYLAKE,		core_funcs),
   2411	X86_MATCH(BROADWELL_D,		core_funcs),
   2412	X86_MATCH(XEON_PHI_KNL,		knl_funcs),
   2413	X86_MATCH(XEON_PHI_KNM,		knl_funcs),
   2414	X86_MATCH(ATOM_GOLDMONT,	core_funcs),
   2415	X86_MATCH(ATOM_GOLDMONT_PLUS,	core_funcs),
   2416	X86_MATCH(SKYLAKE_X,		core_funcs),
   2417	X86_MATCH(COMETLAKE,		core_funcs),
   2418	X86_MATCH(ICELAKE_X,		core_funcs),
   2419	{}
   2420};
   2421MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
   2422
   2423static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
   2424	X86_MATCH(BROADWELL_D,		core_funcs),
   2425	X86_MATCH(BROADWELL_X,		core_funcs),
   2426	X86_MATCH(SKYLAKE_X,		core_funcs),
   2427	X86_MATCH(ICELAKE_X,		core_funcs),
   2428	X86_MATCH(SAPPHIRERAPIDS_X,	core_funcs),
   2429	{}
   2430};
   2431
   2432static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
   2433	X86_MATCH(KABYLAKE,		core_funcs),
   2434	{}
   2435};
   2436
   2437static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = {
   2438	X86_MATCH(SKYLAKE_X,		core_funcs),
   2439	X86_MATCH(SKYLAKE,		core_funcs),
   2440	{}
   2441};
   2442
   2443static int intel_pstate_init_cpu(unsigned int cpunum)
   2444{
   2445	struct cpudata *cpu;
   2446
   2447	cpu = all_cpu_data[cpunum];
   2448
   2449	if (!cpu) {
   2450		cpu = kzalloc(sizeof(*cpu), GFP_KERNEL);
   2451		if (!cpu)
   2452			return -ENOMEM;
   2453
   2454		WRITE_ONCE(all_cpu_data[cpunum], cpu);
   2455
   2456		cpu->cpu = cpunum;
   2457
   2458		cpu->epp_default = -EINVAL;
   2459
   2460		if (hwp_active) {
   2461			const struct x86_cpu_id *id;
   2462
   2463			intel_pstate_hwp_enable(cpu);
   2464
   2465			id = x86_match_cpu(intel_pstate_hwp_boost_ids);
   2466			if (id && intel_pstate_acpi_pm_profile_server())
   2467				hwp_boost = true;
   2468		}
   2469	} else if (hwp_active) {
   2470		/*
   2471		 * Re-enable HWP in case this happens after a resume from ACPI
    2472		 * S3 if the CPU was offline during the whole suspend/resume
   2473		 * cycle.
   2474		 */
   2475		intel_pstate_hwp_reenable(cpu);
   2476	}
   2477
   2478	cpu->epp_powersave = -EINVAL;
   2479	cpu->epp_policy = 0;
   2480
   2481	intel_pstate_get_cpu_pstates(cpu);
   2482
   2483	pr_debug("controlling: cpu %d\n", cpunum);
   2484
   2485	return 0;
   2486}
   2487
   2488static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
   2489{
   2490	struct cpudata *cpu = all_cpu_data[cpu_num];
   2491
   2492	if (hwp_active && !hwp_boost)
   2493		return;
   2494
   2495	if (cpu->update_util_set)
   2496		return;
   2497
   2498	/* Prevent intel_pstate_update_util() from using stale data. */
   2499	cpu->sample.time = 0;
   2500	cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
   2501				     (hwp_active ?
   2502				      intel_pstate_update_util_hwp :
   2503				      intel_pstate_update_util));
   2504	cpu->update_util_set = true;
   2505}
   2506
   2507static void intel_pstate_clear_update_util_hook(unsigned int cpu)
   2508{
   2509	struct cpudata *cpu_data = all_cpu_data[cpu];
   2510
   2511	if (!cpu_data->update_util_set)
   2512		return;
   2513
   2514	cpufreq_remove_update_util_hook(cpu);
   2515	cpu_data->update_util_set = false;
   2516	synchronize_rcu();
   2517}
   2518
   2519static int intel_pstate_get_max_freq(struct cpudata *cpu)
   2520{
   2521	return global.turbo_disabled || global.no_turbo ?
   2522			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
   2523}
   2524
   2525static void intel_pstate_update_perf_limits(struct cpudata *cpu,
   2526					    unsigned int policy_min,
   2527					    unsigned int policy_max)
   2528{
   2529	int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
   2530	int32_t max_policy_perf, min_policy_perf;
   2531
   2532	max_policy_perf = policy_max / perf_ctl_scaling;
   2533	if (policy_max == policy_min) {
   2534		min_policy_perf = max_policy_perf;
   2535	} else {
   2536		min_policy_perf = policy_min / perf_ctl_scaling;
   2537		min_policy_perf = clamp_t(int32_t, min_policy_perf,
   2538					  0, max_policy_perf);
   2539	}
   2540
   2541	/*
   2542	 * HWP needs some special consideration, because HWP_REQUEST uses
   2543	 * abstract values to represent performance rather than pure ratios.
   2544	 */
   2545	if (hwp_active && cpu->pstate.scaling != perf_ctl_scaling) {
   2546		int scaling = cpu->pstate.scaling;
   2547		int freq;
   2548
   2549		freq = max_policy_perf * perf_ctl_scaling;
   2550		max_policy_perf = DIV_ROUND_UP(freq, scaling);
   2551		freq = min_policy_perf * perf_ctl_scaling;
   2552		min_policy_perf = DIV_ROUND_UP(freq, scaling);
   2553	}
   2554
   2555	pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n",
   2556		 cpu->cpu, min_policy_perf, max_policy_perf);
   2557
   2558	/* Normalize user input to [min_perf, max_perf] */
   2559	if (per_cpu_limits) {
   2560		cpu->min_perf_ratio = min_policy_perf;
   2561		cpu->max_perf_ratio = max_policy_perf;
   2562	} else {
   2563		int turbo_max = cpu->pstate.turbo_pstate;
   2564		int32_t global_min, global_max;
   2565
   2566		/* Global limits are in percent of the maximum turbo P-state. */
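        		/*
        		 * For example, turbo_pstate == 40 with max_perf_pct == 75
        		 * would give global_max == 30 (illustrative values only).
        		 */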
   2567		global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
   2568		global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
   2569		global_min = clamp_t(int32_t, global_min, 0, global_max);
   2570
   2571		pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu,
   2572			 global_min, global_max);
   2573
   2574		cpu->min_perf_ratio = max(min_policy_perf, global_min);
   2575		cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf);
   2576		cpu->max_perf_ratio = min(max_policy_perf, global_max);
   2577		cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio);
   2578
   2579		/* Make sure min_perf <= max_perf */
   2580		cpu->min_perf_ratio = min(cpu->min_perf_ratio,
   2581					  cpu->max_perf_ratio);
   2582
   2583	}
   2584	pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu,
   2585		 cpu->max_perf_ratio,
   2586		 cpu->min_perf_ratio);
   2587}
   2588
   2589static int intel_pstate_set_policy(struct cpufreq_policy *policy)
   2590{
   2591	struct cpudata *cpu;
   2592
   2593	if (!policy->cpuinfo.max_freq)
   2594		return -ENODEV;
   2595
   2596	pr_debug("set_policy cpuinfo.max %u policy->max %u\n",
   2597		 policy->cpuinfo.max_freq, policy->max);
   2598
   2599	cpu = all_cpu_data[policy->cpu];
   2600	cpu->policy = policy->policy;
   2601
   2602	mutex_lock(&intel_pstate_limits_lock);
   2603
   2604	intel_pstate_update_perf_limits(cpu, policy->min, policy->max);
   2605
   2606	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
   2607		/*
   2608		 * NOHZ_FULL CPUs need this as the governor callback may not
   2609		 * be invoked on them.
   2610		 */
   2611		intel_pstate_clear_update_util_hook(policy->cpu);
   2612		intel_pstate_max_within_limits(cpu);
   2613	} else {
   2614		intel_pstate_set_update_util_hook(policy->cpu);
   2615	}
   2616
   2617	if (hwp_active) {
   2618		/*
    2619		 * If hwp_boost was active before and has been turned off
    2620		 * dynamically, the utilization update hook needs to be
    2621		 * cleared.
   2622		 */
   2623		if (!hwp_boost)
   2624			intel_pstate_clear_update_util_hook(policy->cpu);
   2625		intel_pstate_hwp_set(policy->cpu);
   2626	}
   2627
   2628	mutex_unlock(&intel_pstate_limits_lock);
   2629
   2630	return 0;
   2631}
   2632
   2633static void intel_pstate_adjust_policy_max(struct cpudata *cpu,
   2634					   struct cpufreq_policy_data *policy)
   2635{
   2636	if (!hwp_active &&
   2637	    cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
   2638	    policy->max < policy->cpuinfo.max_freq &&
   2639	    policy->max > cpu->pstate.max_freq) {
   2640		pr_debug("policy->max > max non turbo frequency\n");
   2641		policy->max = policy->cpuinfo.max_freq;
   2642	}
   2643}
   2644
   2645static void intel_pstate_verify_cpu_policy(struct cpudata *cpu,
   2646					   struct cpufreq_policy_data *policy)
   2647{
   2648	int max_freq;
   2649
   2650	update_turbo_state();
   2651	if (hwp_active) {
   2652		intel_pstate_get_hwp_cap(cpu);
   2653		max_freq = global.no_turbo || global.turbo_disabled ?
   2654				cpu->pstate.max_freq : cpu->pstate.turbo_freq;
   2655	} else {
   2656		max_freq = intel_pstate_get_max_freq(cpu);
   2657	}
   2658	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq);
   2659
   2660	intel_pstate_adjust_policy_max(cpu, policy);
   2661}
   2662
   2663static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy)
   2664{
   2665	intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy);
   2666
   2667	return 0;
   2668}
   2669
   2670static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy)
   2671{
   2672	struct cpudata *cpu = all_cpu_data[policy->cpu];
   2673
   2674	pr_debug("CPU %d going offline\n", cpu->cpu);
   2675
   2676	if (cpu->suspended)
   2677		return 0;
   2678
   2679	/*
   2680	 * If the CPU is an SMT thread and it goes offline with the performance
   2681	 * settings different from the minimum, it will prevent its sibling
   2682	 * from getting to lower performance levels, so force the minimum
   2683	 * performance on CPU offline to prevent that from happening.
   2684	 */
   2685	if (hwp_active)
   2686		intel_pstate_hwp_offline(cpu);
   2687	else
   2688		intel_pstate_set_min_pstate(cpu);
   2689
   2690	intel_pstate_exit_perf_limits(policy);
   2691
   2692	return 0;
   2693}
   2694
   2695static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
   2696{
   2697	struct cpudata *cpu = all_cpu_data[policy->cpu];
   2698
   2699	pr_debug("CPU %d going online\n", cpu->cpu);
   2700
   2701	intel_pstate_init_acpi_perf_limits(policy);
   2702
   2703	if (hwp_active) {
   2704		/*
   2705		 * Re-enable HWP and clear the "suspended" flag to let "resume"
   2706		 * know that it need not do that.
   2707		 */
   2708		intel_pstate_hwp_reenable(cpu);
   2709		cpu->suspended = false;
   2710	}
   2711
   2712	return 0;
   2713}
   2714
   2715static int intel_pstate_cpu_offline(struct cpufreq_policy *policy)
   2716{
   2717	intel_pstate_clear_update_util_hook(policy->cpu);
   2718
   2719	return intel_cpufreq_cpu_offline(policy);
   2720}
   2721
   2722static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
   2723{
   2724	pr_debug("CPU %d exiting\n", policy->cpu);
   2725
   2726	policy->fast_switch_possible = false;
   2727
   2728	return 0;
   2729}
   2730
   2731static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
   2732{
   2733	struct cpudata *cpu;
   2734	int rc;
   2735
   2736	rc = intel_pstate_init_cpu(policy->cpu);
   2737	if (rc)
   2738		return rc;
   2739
   2740	cpu = all_cpu_data[policy->cpu];
   2741
   2742	cpu->max_perf_ratio = 0xFF;
   2743	cpu->min_perf_ratio = 0;
   2744
   2745	/* cpuinfo and default policy values */
   2746	policy->cpuinfo.min_freq = cpu->pstate.min_freq;
   2747	update_turbo_state();
   2748	global.turbo_disabled_mf = global.turbo_disabled;
   2749	policy->cpuinfo.max_freq = global.turbo_disabled ?
   2750			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
   2751
   2752	policy->min = policy->cpuinfo.min_freq;
   2753	policy->max = policy->cpuinfo.max_freq;
   2754
   2755	intel_pstate_init_acpi_perf_limits(policy);
   2756
   2757	policy->fast_switch_possible = true;
   2758
   2759	return 0;
   2760}
   2761
   2762static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
   2763{
   2764	int ret = __intel_pstate_cpu_init(policy);
   2765
   2766	if (ret)
   2767		return ret;
   2768
   2769	/*
   2770	 * Set the policy to powersave to provide a valid fallback value in case
   2771	 * the default cpufreq governor is neither powersave nor performance.
   2772	 */
   2773	policy->policy = CPUFREQ_POLICY_POWERSAVE;
   2774
   2775	if (hwp_active) {
   2776		struct cpudata *cpu = all_cpu_data[policy->cpu];
   2777
   2778		cpu->epp_cached = intel_pstate_get_epp(cpu, 0);
   2779	}
   2780
   2781	return 0;
   2782}
   2783
   2784static struct cpufreq_driver intel_pstate = {
   2785	.flags		= CPUFREQ_CONST_LOOPS,
   2786	.verify		= intel_pstate_verify_policy,
   2787	.setpolicy	= intel_pstate_set_policy,
   2788	.suspend	= intel_pstate_suspend,
   2789	.resume		= intel_pstate_resume,
   2790	.init		= intel_pstate_cpu_init,
   2791	.exit		= intel_pstate_cpu_exit,
   2792	.offline	= intel_pstate_cpu_offline,
   2793	.online		= intel_pstate_cpu_online,
   2794	.update_limits	= intel_pstate_update_limits,
   2795	.name		= "intel_pstate",
   2796};
   2797
   2798static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
   2799{
   2800	struct cpudata *cpu = all_cpu_data[policy->cpu];
   2801
   2802	intel_pstate_verify_cpu_policy(cpu, policy);
   2803	intel_pstate_update_perf_limits(cpu, policy->min, policy->max);
   2804
   2805	return 0;
   2806}
   2807
   2808/* Use of trace in passive mode:
   2809 *
   2810 * In passive mode the trace core_busy field (also known as the
    2811 * performance field, and labelled as such on the graphs; also known as
   2812 * core_avg_perf) is not needed and so is re-assigned to indicate if the
   2813 * driver call was via the normal or fast switch path. Various graphs
   2814 * output from the intel_pstate_tracer.py utility that include core_busy
   2815 * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
   2816 * so we use 10 to indicate the normal path through the driver, and
   2817 * 90 to indicate the fast switch path through the driver.
   2818 * The scaled_busy field is not used, and is set to 0.
   2819 */
   2820
   2821#define	INTEL_PSTATE_TRACE_TARGET 10
   2822#define	INTEL_PSTATE_TRACE_FAST_SWITCH 90
   2823
   2824static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
   2825{
   2826	struct sample *sample;
   2827
   2828	if (!trace_pstate_sample_enabled())
   2829		return;
   2830
   2831	if (!intel_pstate_sample(cpu, ktime_get()))
   2832		return;
   2833
   2834	sample = &cpu->sample;
   2835	trace_pstate_sample(trace_type,
   2836		0,
   2837		old_pstate,
   2838		cpu->pstate.current_pstate,
   2839		sample->mperf,
   2840		sample->aperf,
   2841		sample->tsc,
   2842		get_avg_frequency(cpu),
   2843		fp_toint(cpu->iowait_boost * 100));
   2844}
   2845
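        /*
         * Update the cached HWP_REQUEST value with new minimum, maximum and
         * desired performance levels (bits [7:0], [15:8] and [23:16] of the
         * MSR, respectively) and write it back only if something changed.
         */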
   2846static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,
   2847				     u32 desired, bool fast_switch)
   2848{
   2849	u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev;
   2850
   2851	value &= ~HWP_MIN_PERF(~0L);
   2852	value |= HWP_MIN_PERF(min);
   2853
   2854	value &= ~HWP_MAX_PERF(~0L);
   2855	value |= HWP_MAX_PERF(max);
   2856
   2857	value &= ~HWP_DESIRED_PERF(~0L);
   2858	value |= HWP_DESIRED_PERF(desired);
   2859
   2860	if (value == prev)
   2861		return;
   2862
   2863	WRITE_ONCE(cpu->hwp_req_cached, value);
   2864	if (fast_switch)
   2865		wrmsrl(MSR_HWP_REQUEST, value);
   2866	else
   2867		wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
   2868}
   2869
   2870static void intel_cpufreq_perf_ctl_update(struct cpudata *cpu,
   2871					  u32 target_pstate, bool fast_switch)
   2872{
   2873	if (fast_switch)
   2874		wrmsrl(MSR_IA32_PERF_CTL,
   2875		       pstate_funcs.get_val(cpu, target_pstate));
   2876	else
   2877		wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
   2878			      pstate_funcs.get_val(cpu, target_pstate));
   2879}
   2880
   2881static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy,
   2882				       int target_pstate, bool fast_switch)
   2883{
   2884	struct cpudata *cpu = all_cpu_data[policy->cpu];
   2885	int old_pstate = cpu->pstate.current_pstate;
   2886
   2887	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
   2888	if (hwp_active) {
   2889		int max_pstate = policy->strict_target ?
   2890					target_pstate : cpu->max_perf_ratio;
   2891
   2892		intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 0,
   2893					 fast_switch);
   2894	} else if (target_pstate != old_pstate) {
   2895		intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch);
   2896	}
   2897
   2898	cpu->pstate.current_pstate = target_pstate;
   2899
   2900	intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH :
   2901			    INTEL_PSTATE_TRACE_TARGET, old_pstate);
   2902
   2903	return target_pstate;
   2904}
   2905
   2906static int intel_cpufreq_target(struct cpufreq_policy *policy,
   2907				unsigned int target_freq,
   2908				unsigned int relation)
   2909{
   2910	struct cpudata *cpu = all_cpu_data[policy->cpu];
   2911	struct cpufreq_freqs freqs;
   2912	int target_pstate;
   2913
   2914	update_turbo_state();
   2915
   2916	freqs.old = policy->cur;
   2917	freqs.new = target_freq;
   2918
   2919	cpufreq_freq_transition_begin(policy, &freqs);
   2920
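        	/*
        	 * CPUFREQ_RELATION_L wants the lowest frequency at or above the
        	 * target, so round the ratio up; CPUFREQ_RELATION_H wants the
        	 * highest frequency at or below it, so round down; otherwise pick
        	 * the closest ratio.
        	 */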
   2921	switch (relation) {
   2922	case CPUFREQ_RELATION_L:
   2923		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
   2924		break;
   2925	case CPUFREQ_RELATION_H:
   2926		target_pstate = freqs.new / cpu->pstate.scaling;
   2927		break;
   2928	default:
   2929		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
   2930		break;
   2931	}
   2932
   2933	target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false);
   2934
   2935	freqs.new = target_pstate * cpu->pstate.scaling;
   2936
   2937	cpufreq_freq_transition_end(policy, &freqs, false);
   2938
   2939	return 0;
   2940}
   2941
   2942static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
   2943					      unsigned int target_freq)
   2944{
   2945	struct cpudata *cpu = all_cpu_data[policy->cpu];
   2946	int target_pstate;
   2947
   2948	update_turbo_state();
   2949
   2950	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
   2951
   2952	target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true);
   2953
   2954	return target_pstate * cpu->pstate.scaling;
   2955}
   2956
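        /*
         * Fast schedutil callback: scale the requested utilization (min_perf
         * and target_perf, both relative to capacity) onto the HWP performance
         * range and program the result into the min/max/desired fields of
         * HWP_REQUEST.
         */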
   2957static void intel_cpufreq_adjust_perf(unsigned int cpunum,
   2958				      unsigned long min_perf,
   2959				      unsigned long target_perf,
   2960				      unsigned long capacity)
   2961{
   2962	struct cpudata *cpu = all_cpu_data[cpunum];
   2963	u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
   2964	int old_pstate = cpu->pstate.current_pstate;
   2965	int cap_pstate, min_pstate, max_pstate, target_pstate;
   2966
   2967	update_turbo_state();
   2968	cap_pstate = global.turbo_disabled ? HWP_GUARANTEED_PERF(hwp_cap) :
   2969					     HWP_HIGHEST_PERF(hwp_cap);
   2970
   2971	/* Optimization: Avoid unnecessary divisions. */
   2972
   2973	target_pstate = cap_pstate;
   2974	if (target_perf < capacity)
   2975		target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);
   2976
   2977	min_pstate = cap_pstate;
   2978	if (min_perf < capacity)
   2979		min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);
   2980
   2981	if (min_pstate < cpu->pstate.min_pstate)
   2982		min_pstate = cpu->pstate.min_pstate;
   2983
   2984	if (min_pstate < cpu->min_perf_ratio)
   2985		min_pstate = cpu->min_perf_ratio;
   2986
   2987	max_pstate = min(cap_pstate, cpu->max_perf_ratio);
   2988	if (max_pstate < min_pstate)
   2989		max_pstate = min_pstate;
   2990
   2991	target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);
   2992
   2993	intel_cpufreq_hwp_update(cpu, min_pstate, max_pstate, target_pstate, true);
   2994
   2995	cpu->pstate.current_pstate = target_pstate;
   2996	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
   2997}
   2998
   2999static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
   3000{
   3001	struct freq_qos_request *req;
   3002	struct cpudata *cpu;
   3003	struct device *dev;
   3004	int ret, freq;
   3005
   3006	dev = get_cpu_device(policy->cpu);
   3007	if (!dev)
   3008		return -ENODEV;
   3009
   3010	ret = __intel_pstate_cpu_init(policy);
   3011	if (ret)
   3012		return ret;
   3013
   3014	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
   3015	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
   3016	policy->cur = policy->cpuinfo.min_freq;
   3017
   3018	req = kcalloc(2, sizeof(*req), GFP_KERNEL);
   3019	if (!req) {
   3020		ret = -ENOMEM;
   3021		goto pstate_exit;
   3022	}
   3023
   3024	cpu = all_cpu_data[policy->cpu];
   3025
   3026	if (hwp_active) {
   3027		u64 value;
   3028
   3029		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
   3030
   3031		intel_pstate_get_hwp_cap(cpu);
   3032
   3033		rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value);
   3034		WRITE_ONCE(cpu->hwp_req_cached, value);
   3035
   3036		cpu->epp_cached = intel_pstate_get_epp(cpu, value);
   3037	} else {
   3038		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
   3039	}
   3040
   3041	freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.min_perf_pct, 100);
   3042
   3043	ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN,
   3044				   freq);
   3045	if (ret < 0) {
   3046		dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
   3047		goto free_req;
   3048	}
   3049
   3050	freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.max_perf_pct, 100);
   3051
   3052	ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX,
   3053				   freq);
   3054	if (ret < 0) {
   3055		dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
   3056		goto remove_min_req;
   3057	}
   3058
   3059	policy->driver_data = req;
   3060
   3061	return 0;
   3062
   3063remove_min_req:
   3064	freq_qos_remove_request(req);
   3065free_req:
   3066	kfree(req);
   3067pstate_exit:
   3068	intel_pstate_exit_perf_limits(policy);
   3069
   3070	return ret;
   3071}
   3072
   3073static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy)
   3074{
   3075	struct freq_qos_request *req;
   3076
   3077	req = policy->driver_data;
   3078
   3079	freq_qos_remove_request(req + 1);
   3080	freq_qos_remove_request(req);
   3081	kfree(req);
   3082
   3083	return intel_pstate_cpu_exit(policy);
   3084}
   3085
   3086static int intel_cpufreq_suspend(struct cpufreq_policy *policy)
   3087{
   3088	intel_pstate_suspend(policy);
   3089
   3090	if (hwp_active) {
   3091		struct cpudata *cpu = all_cpu_data[policy->cpu];
   3092		u64 value = READ_ONCE(cpu->hwp_req_cached);
   3093
   3094		/*
   3095		 * Clear the desired perf field in MSR_HWP_REQUEST in case
   3096		 * intel_cpufreq_adjust_perf() is in use and the last value
   3097		 * written by it may not be suitable.
   3098		 */
   3099		value &= ~HWP_DESIRED_PERF(~0L);
   3100		wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
   3101		WRITE_ONCE(cpu->hwp_req_cached, value);
   3102	}
   3103
   3104	return 0;
   3105}
   3106
   3107static struct cpufreq_driver intel_cpufreq = {
   3108	.flags		= CPUFREQ_CONST_LOOPS,
   3109	.verify		= intel_cpufreq_verify_policy,
   3110	.target		= intel_cpufreq_target,
   3111	.fast_switch	= intel_cpufreq_fast_switch,
   3112	.init		= intel_cpufreq_cpu_init,
   3113	.exit		= intel_cpufreq_cpu_exit,
   3114	.offline	= intel_cpufreq_cpu_offline,
   3115	.online		= intel_pstate_cpu_online,
   3116	.suspend	= intel_cpufreq_suspend,
   3117	.resume		= intel_pstate_resume,
   3118	.update_limits	= intel_pstate_update_limits,
   3119	.name		= "intel_cpufreq",
   3120};
   3121
   3122static struct cpufreq_driver *default_driver;
   3123
   3124static void intel_pstate_driver_cleanup(void)
   3125{
   3126	unsigned int cpu;
   3127
   3128	cpus_read_lock();
   3129	for_each_online_cpu(cpu) {
   3130		if (all_cpu_data[cpu]) {
   3131			if (intel_pstate_driver == &intel_pstate)
   3132				intel_pstate_clear_update_util_hook(cpu);
   3133
   3134			spin_lock(&hwp_notify_lock);
   3135			kfree(all_cpu_data[cpu]);
   3136			WRITE_ONCE(all_cpu_data[cpu], NULL);
   3137			spin_unlock(&hwp_notify_lock);
   3138		}
   3139	}
   3140	cpus_read_unlock();
   3141
   3142	intel_pstate_driver = NULL;
   3143}
   3144
   3145static int intel_pstate_register_driver(struct cpufreq_driver *driver)
   3146{
   3147	int ret;
   3148
   3149	if (driver == &intel_pstate)
   3150		intel_pstate_sysfs_expose_hwp_dynamic_boost();
   3151
   3152	memset(&global, 0, sizeof(global));
   3153	global.max_perf_pct = 100;
   3154
   3155	intel_pstate_driver = driver;
   3156	ret = cpufreq_register_driver(intel_pstate_driver);
   3157	if (ret) {
   3158		intel_pstate_driver_cleanup();
   3159		return ret;
   3160	}
   3161
   3162	global.min_perf_pct = min_perf_pct_min();
   3163
   3164	return 0;
   3165}
   3166
   3167static ssize_t intel_pstate_show_status(char *buf)
   3168{
   3169	if (!intel_pstate_driver)
   3170		return sprintf(buf, "off\n");
   3171
   3172	return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ?
   3173					"active" : "passive");
   3174}
   3175
   3176static int intel_pstate_update_status(const char *buf, size_t size)
   3177{
   3178	if (size == 3 && !strncmp(buf, "off", size)) {
   3179		if (!intel_pstate_driver)
   3180			return -EINVAL;
   3181
   3182		if (hwp_active)
   3183			return -EBUSY;
   3184
   3185		cpufreq_unregister_driver(intel_pstate_driver);
   3186		intel_pstate_driver_cleanup();
   3187		return 0;
   3188	}
   3189
   3190	if (size == 6 && !strncmp(buf, "active", size)) {
   3191		if (intel_pstate_driver) {
   3192			if (intel_pstate_driver == &intel_pstate)
   3193				return 0;
   3194
   3195			cpufreq_unregister_driver(intel_pstate_driver);
   3196		}
   3197
   3198		return intel_pstate_register_driver(&intel_pstate);
   3199	}
   3200
   3201	if (size == 7 && !strncmp(buf, "passive", size)) {
   3202		if (intel_pstate_driver) {
   3203			if (intel_pstate_driver == &intel_cpufreq)
   3204				return 0;
   3205
   3206			cpufreq_unregister_driver(intel_pstate_driver);
   3207			intel_pstate_sysfs_hide_hwp_dynamic_boost();
   3208		}
   3209
   3210		return intel_pstate_register_driver(&intel_cpufreq);
   3211	}
   3212
   3213	return -EINVAL;
   3214}
   3215
   3216static int no_load __initdata;
   3217static int no_hwp __initdata;
   3218static int hwp_only __initdata;
   3219static unsigned int force_load __initdata;
   3220
   3221static int __init intel_pstate_msrs_not_valid(void)
   3222{
   3223	if (!pstate_funcs.get_max() ||
   3224	    !pstate_funcs.get_min() ||
   3225	    !pstate_funcs.get_turbo())
   3226		return -ENODEV;
   3227
   3228	return 0;
   3229}
   3230
   3231static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
   3232{
   3233	pstate_funcs.get_max   = funcs->get_max;
   3234	pstate_funcs.get_max_physical = funcs->get_max_physical;
   3235	pstate_funcs.get_min   = funcs->get_min;
   3236	pstate_funcs.get_turbo = funcs->get_turbo;
   3237	pstate_funcs.get_scaling = funcs->get_scaling;
   3238	pstate_funcs.get_val   = funcs->get_val;
   3239	pstate_funcs.get_vid   = funcs->get_vid;
   3240	pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift;
   3241}
   3242
   3243#ifdef CONFIG_ACPI
   3244
   3245static bool __init intel_pstate_no_acpi_pss(void)
   3246{
   3247	int i;
   3248
   3249	for_each_possible_cpu(i) {
   3250		acpi_status status;
   3251		union acpi_object *pss;
   3252		struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
   3253		struct acpi_processor *pr = per_cpu(processors, i);
   3254
   3255		if (!pr)
   3256			continue;
   3257
   3258		status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
   3259		if (ACPI_FAILURE(status))
   3260			continue;
   3261
   3262		pss = buffer.pointer;
   3263		if (pss && pss->type == ACPI_TYPE_PACKAGE) {
   3264			kfree(pss);
   3265			return false;
   3266		}
   3267
   3268		kfree(pss);
   3269	}
   3270
   3271	pr_debug("ACPI _PSS not found\n");
   3272	return true;
   3273}
   3274
   3275static bool __init intel_pstate_no_acpi_pcch(void)
   3276{
   3277	acpi_status status;
   3278	acpi_handle handle;
   3279
   3280	status = acpi_get_handle(NULL, "\\_SB", &handle);
   3281	if (ACPI_FAILURE(status))
   3282		goto not_found;
   3283
   3284	if (acpi_has_method(handle, "PCCH"))
   3285		return false;
   3286
   3287not_found:
   3288	pr_debug("ACPI PCCH not found\n");
   3289	return true;
   3290}
   3291
   3292static bool __init intel_pstate_has_acpi_ppc(void)
   3293{
   3294	int i;
   3295
   3296	for_each_possible_cpu(i) {
   3297		struct acpi_processor *pr = per_cpu(processors, i);
   3298
   3299		if (!pr)
   3300			continue;
   3301		if (acpi_has_method(pr->handle, "_PPC"))
   3302			return true;
   3303	}
   3304	pr_debug("ACPI _PPC not found\n");
   3305	return false;
   3306}
   3307
   3308enum {
   3309	PSS,
   3310	PPC,
   3311};
   3312
    3313/* Vendor-specific platforms that have their own power management modes */
   3314static struct acpi_platform_list plat_info[] __initdata = {
   3315	{"HP    ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS},
   3316	{"ORACLE", "X4-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3317	{"ORACLE", "X4-2L   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3318	{"ORACLE", "X4-2B   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3319	{"ORACLE", "X3-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3320	{"ORACLE", "X3-2L   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3321	{"ORACLE", "X3-2B   ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3322	{"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3323	{"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3324	{"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3325	{"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3326	{"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3327	{"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3328	{"ORACLE", "X6-2    ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3329	{"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC},
   3330	{ } /* End */
   3331};
   3332
   3333#define BITMASK_OOB	(BIT(8) | BIT(18))
   3334
   3335static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
   3336{
   3337	const struct x86_cpu_id *id;
   3338	u64 misc_pwr;
   3339	int idx;
   3340
   3341	id = x86_match_cpu(intel_pstate_cpu_oob_ids);
   3342	if (id) {
   3343		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
   3344		if (misc_pwr & BITMASK_OOB) {
   3345			pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n");
   3346			pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n");
   3347			return true;
   3348		}
   3349	}
   3350
   3351	idx = acpi_match_platform_list(plat_info);
   3352	if (idx < 0)
   3353		return false;
   3354
   3355	switch (plat_info[idx].data) {
   3356	case PSS:
   3357		if (!intel_pstate_no_acpi_pss())
   3358			return false;
   3359
   3360		return intel_pstate_no_acpi_pcch();
   3361	case PPC:
   3362		return intel_pstate_has_acpi_ppc() && !force_load;
   3363	}
   3364
   3365	return false;
   3366}
   3367
   3368static void intel_pstate_request_control_from_smm(void)
   3369{
   3370	/*
   3371	 * It may be unsafe to request P-states control from SMM if _PPC support
   3372	 * has not been enabled.
   3373	 */
   3374	if (acpi_ppc)
   3375		acpi_processor_pstate_control();
   3376}
   3377#else /* CONFIG_ACPI not enabled */
   3378static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
   3379static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
   3380static inline void intel_pstate_request_control_from_smm(void) {}
   3381#endif /* CONFIG_ACPI */
   3382
   3383#define INTEL_PSTATE_HWP_BROADWELL	0x01
   3384
   3385#define X86_MATCH_HWP(model, hwp_mode)					\
   3386	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
   3387					   X86_FEATURE_HWP, hwp_mode)
   3388
   3389static const struct x86_cpu_id hwp_support_ids[] __initconst = {
   3390	X86_MATCH_HWP(BROADWELL_X,	INTEL_PSTATE_HWP_BROADWELL),
   3391	X86_MATCH_HWP(BROADWELL_D,	INTEL_PSTATE_HWP_BROADWELL),
   3392	X86_MATCH_HWP(ANY,		0),
   3393	{}
   3394};
   3395
   3396static bool intel_pstate_hwp_is_enabled(void)
   3397{
   3398	u64 value;
   3399
   3400	rdmsrl(MSR_PM_ENABLE, value);
   3401	return !!(value & 0x1);
   3402}
   3403
   3404static const struct x86_cpu_id intel_epp_balance_perf[] = {
   3405	/*
    3406	 * Set the EPP value to 102; this is the maximum suggested EPP
    3407	 * that can still result in one-core turbo frequency for
    3408	 * Alder Lake mobile CPUs.
   3409	 */
   3410	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 102),
   3411	{}
   3412};
   3413
   3414static int __init intel_pstate_init(void)
   3415{
   3416	static struct cpudata **_all_cpu_data;
   3417	const struct x86_cpu_id *id;
   3418	int rc;
   3419
   3420	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
   3421		return -ENODEV;
   3422
   3423	id = x86_match_cpu(hwp_support_ids);
   3424	if (id) {
   3425		bool hwp_forced = intel_pstate_hwp_is_enabled();
   3426
   3427		if (hwp_forced)
   3428			pr_info("HWP enabled by BIOS\n");
   3429		else if (no_load)
   3430			return -ENODEV;
   3431
   3432		copy_cpu_funcs(&core_funcs);
   3433		/*
   3434		 * Avoid enabling HWP for processors without EPP support,
    3435		 * because that means an incomplete HWP implementation, which is a
    3436		 * corner case that is generally problematic to support.
   3437		 *
   3438		 * If HWP is enabled already, though, there is no choice but to
   3439		 * deal with it.
   3440		 */
   3441		if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) {
   3442			WRITE_ONCE(hwp_active, 1);
   3443			hwp_mode_bdw = id->driver_data;
   3444			intel_pstate.attr = hwp_cpufreq_attrs;
   3445			intel_cpufreq.attr = hwp_cpufreq_attrs;
   3446			intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS;
   3447			intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf;
   3448			if (!default_driver)
   3449				default_driver = &intel_pstate;
   3450
   3451			if (boot_cpu_has(X86_FEATURE_HYBRID_CPU))
   3452				intel_pstate_cppc_set_cpu_scaling();
   3453
   3454			goto hwp_cpu_matched;
   3455		}
   3456		pr_info("HWP not enabled\n");
   3457	} else {
   3458		if (no_load)
   3459			return -ENODEV;
   3460
   3461		id = x86_match_cpu(intel_pstate_cpu_ids);
   3462		if (!id) {
   3463			pr_info("CPU model not supported\n");
   3464			return -ENODEV;
   3465		}
   3466
   3467		copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
   3468	}
   3469
   3470	if (intel_pstate_msrs_not_valid()) {
   3471		pr_info("Invalid MSRs\n");
   3472		return -ENODEV;
   3473	}
   3474	/* Without HWP start in the passive mode. */
   3475	if (!default_driver)
   3476		default_driver = &intel_cpufreq;
   3477
   3478hwp_cpu_matched:
   3479	/*
   3480	 * The Intel pstate driver will be ignored if the platform
   3481	 * firmware has its own power management modes.
   3482	 */
   3483	if (intel_pstate_platform_pwr_mgmt_exists()) {
   3484		pr_info("P-states controlled by the platform\n");
   3485		return -ENODEV;
   3486	}
   3487
   3488	if (!hwp_active && hwp_only)
   3489		return -ENOTSUPP;
   3490
   3491	pr_info("Intel P-state driver initializing\n");
   3492
   3493	_all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
   3494	if (!_all_cpu_data)
   3495		return -ENOMEM;
   3496
   3497	WRITE_ONCE(all_cpu_data, _all_cpu_data);
   3498
   3499	intel_pstate_request_control_from_smm();
   3500
   3501	intel_pstate_sysfs_expose_params();
   3502
   3503	if (hwp_active) {
   3504		const struct x86_cpu_id *id = x86_match_cpu(intel_epp_balance_perf);
   3505
   3506		if (id)
   3507			epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = id->driver_data;
   3508	}
   3509
   3510	mutex_lock(&intel_pstate_driver_lock);
   3511	rc = intel_pstate_register_driver(default_driver);
   3512	mutex_unlock(&intel_pstate_driver_lock);
   3513	if (rc) {
   3514		intel_pstate_sysfs_remove();
   3515		return rc;
   3516	}
   3517
   3518	if (hwp_active) {
   3519		const struct x86_cpu_id *id;
   3520
   3521		id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids);
   3522		if (id) {
   3523			set_power_ctl_ee_state(false);
   3524			pr_info("Disabling energy efficiency optimization\n");
   3525		}
   3526
   3527		pr_info("HWP enabled\n");
   3528	} else if (boot_cpu_has(X86_FEATURE_HYBRID_CPU)) {
   3529		pr_warn("Problematic setup: Hybrid processor with disabled HWP\n");
   3530	}
   3531
   3532	return 0;
   3533}
   3534device_initcall(intel_pstate_init);
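
/*
 * intel_pstate is built into the kernel (CONFIG_X86_INTEL_PSTATE is a bool
 * option), so device_initcall() runs intel_pstate_init() during boot-time
 * device initialization and there is no corresponding module_exit() path.
 */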
   3535
   3536static int __init intel_pstate_setup(char *str)
   3537{
   3538	if (!str)
   3539		return -EINVAL;
   3540
   3541	if (!strcmp(str, "disable"))
   3542		no_load = 1;
   3543	else if (!strcmp(str, "active"))
   3544		default_driver = &intel_pstate;
   3545	else if (!strcmp(str, "passive"))
   3546		default_driver = &intel_cpufreq;
   3547
   3548	if (!strcmp(str, "no_hwp"))
   3549		no_hwp = 1;
   3550
   3551	if (!strcmp(str, "force"))
   3552		force_load = 1;
   3553	if (!strcmp(str, "hwp_only"))
   3554		hwp_only = 1;
   3555	if (!strcmp(str, "per_cpu_perf_limits"))
   3556		per_cpu_limits = true;
   3557
   3558#ifdef CONFIG_ACPI
   3559	if (!strcmp(str, "support_acpi_ppc"))
   3560		acpi_ppc = true;
   3561#endif
   3562
   3563	return 0;
   3564}
   3565early_param("intel_pstate", intel_pstate_setup);
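
/*
 * Example command-line usage: each option must be passed as its own
 * "intel_pstate=" parameter, since intel_pstate_setup() compares the whole
 * string, e.g. "intel_pstate=passive intel_pstate=no_hwp".
 *
 *	intel_pstate=disable		do not load the driver (an HWP already
 *					enabled by the BIOS still overrides this)
 *	intel_pstate=active		default to the intel_pstate (active) driver
 *	intel_pstate=passive		default to the intel_cpufreq (passive) driver
 *	intel_pstate=no_hwp		do not enable HWP (a BIOS-enabled HWP still wins)
 *	intel_pstate=force		load even if ACPI _PPC limits are present
 *	intel_pstate=hwp_only		load only when HWP is active
 *	intel_pstate=per_cpu_perf_limits	allow per-CPU performance limits
 *	intel_pstate=support_acpi_ppc	take ACPI _PPC limits into account (CONFIG_ACPI only)
 */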
   3566
   3567MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
    3568	MODULE_DESCRIPTION("'intel_pstate' - P state driver for Intel Core processors");
   3569MODULE_LICENSE("GPL");