From f70575805708cabdedea7498aaa3f710fde4d920 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Wed, 31 Jan 2024 03:29:01 +0000 Subject: [PATCH] add lvds1024*800 --- kernel/drivers/cpufreq/intel_pstate.c | 1168 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 847 insertions(+), 321 deletions(-) diff --git a/kernel/drivers/cpufreq/intel_pstate.c b/kernel/drivers/cpufreq/intel_pstate.c index c2d222f..4359ed1 100644 --- a/kernel/drivers/cpufreq/intel_pstate.c +++ b/kernel/drivers/cpufreq/intel_pstate.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * intel_pstate.c: Native P state management for Intel processors * * (C) Copyright 2012 Intel Corporation * Author: Dirk Brandewie <dirk.j.brandewie@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -28,6 +24,7 @@ #include <linux/fs.h> #include <linux/acpi.h> #include <linux/vmalloc.h> +#include <linux/pm_qos.h> #include <trace/events/power.h> #include <asm/div64.h> @@ -39,6 +36,7 @@ #define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC) #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 +#define INTEL_CPUFREQ_TRANSITION_DELAY_HWP 5000 #define INTEL_CPUFREQ_TRANSITION_DELAY 500 #ifdef CONFIG_ACPI @@ -49,6 +47,8 @@ #define FRAC_BITS 8 #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) #define fp_toint(X) ((X) >> FRAC_BITS) + +#define ONE_EIGHTH_FP ((int64_t)1 << (FRAC_BITS - 3)) #define EXT_BITS 6 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) @@ -173,10 +173,11 @@ /** * struct global_params - Global parameters, mostly tunable via sysfs. * @no_turbo: Whether or not to use turbo P-states. - * @turbo_disabled: Whethet or not turbo P-states are available at all, + * @turbo_disabled: Whether or not turbo P-states are available at all, * based on the MSR_IA32_MISC_ENABLE value and whether or * not the maximum reported turbo P-state is different from * the maximum reported non-turbo one. + * @turbo_disabled_mf: The @turbo_disabled value reflected by cpuinfo.max_freq. * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo * P-state capacity. * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo @@ -185,6 +186,7 @@ struct global_params { bool no_turbo; bool turbo_disabled; + bool turbo_disabled_mf; int max_perf_pct; int min_perf_pct; }; @@ -200,9 +202,7 @@ * @pstate: Stores P state limits for this CPU * @vid: Stores VID limits for this CPU * @last_sample_time: Last Sample time - * @aperf_mperf_shift: Number of clock cycles after aperf, merf is incremented - * This shift is a multiplier to mperf delta to - * calculate CPU busy. + * @aperf_mperf_shift: APERF vs MPERF counting frequency difference * @prev_aperf: Last APERF value read from APERF MSR * @prev_mperf: Last MPERF value read from MPERF MSR * @prev_tsc: Last timestamp counter (TSC) value @@ -219,13 +219,13 @@ * @epp_policy: Last saved policy used to set EPP/EPB * @epp_default: Power on default HWP energy performance * preference/bias - * @epp_saved: Saved EPP/EPB during system suspend or CPU offline - * operation + * @epp_cached Cached HWP energy-performance preference value * @hwp_req_cached: Cached value of the last HWP Request MSR * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR * @last_io_update: Last time when IO wake flag was set * @sched_flags: Store scheduler flags for possible cross CPU update * @hwp_boost_min: Last HWP boosted min performance + * @suspended: Whether or not the driver has been suspended. * * This structure stores per CPU instance data for all CPUs. */ @@ -257,12 +257,13 @@ s16 epp_powersave; s16 epp_policy; s16 epp_default; - s16 epp_saved; + s16 epp_cached; u64 hwp_req_cached; u64 hwp_cap_cached; u64 last_io_update; unsigned int sched_flags; u32 hwp_boost_min; + bool suspended; }; static struct cpudata **all_cpu_data; @@ -274,6 +275,7 @@ * @get_min: Callback to get minimum P state * @get_turbo: Callback to get turbo P state * @get_scaling: Callback to get frequency scaling factor + * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference * @get_val: Callback to convert P state to actual MSR write value * @get_vid: Callback to get VID data for Atom platforms * @@ -373,11 +375,27 @@ } } } -#else + +static int intel_pstate_get_cppc_guranteed(int cpu) +{ + struct cppc_perf_caps cppc_perf; + int ret; + + ret = cppc_get_perf_caps(cpu, &cppc_perf); + if (ret) + return ret; + + if (cppc_perf.guaranteed_perf) + return cppc_perf.guaranteed_perf; + + return cppc_perf.nominal_perf; +} + +#else /* CONFIG_ACPI_CPPC_LIB */ static void intel_pstate_set_itmt_prio(int cpu) { } -#endif +#endif /* CONFIG_ACPI_CPPC_LIB */ static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) { @@ -425,20 +443,6 @@ (u32) cpu->acpi_perf_data.states[i].control); } - /* - * The _PSS table doesn't contain whole turbo frequency range. - * This just contains +1 MHZ above the max non turbo frequency, - * with control value corresponding to max turbo ratio. But - * when cpufreq set policy is called, it will call with this - * max frequency, which will cause a reduced performance as - * this driver uses real max turbo frequency as the max - * frequency. So correct this frequency in _PSS table to - * correct max turbo frequency based on the turbo state. - * Also need to convert to MHz as _PSS freq is in MHz. - */ - if (!global.turbo_disabled) - cpu->acpi_perf_data.states[0].core_frequency = - policy->cpuinfo.max_freq / 1000; cpu->valid_pss_table = true; pr_debug("_PPC limits will be enforced\n"); @@ -459,7 +463,7 @@ acpi_processor_unregister_performance(policy->cpu); } -#else +#else /* CONFIG_ACPI */ static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) { } @@ -472,7 +476,14 @@ { return false; } -#endif +#endif /* CONFIG_ACPI */ + +#ifndef CONFIG_ACPI_CPPC_LIB +static int intel_pstate_get_cppc_guranteed(int cpu) +{ + return -ENOTSUPP; +} +#endif /* CONFIG_ACPI_CPPC_LIB */ static inline void update_turbo_state(void) { @@ -500,7 +511,7 @@ u64 epb; int ret; - if (!static_cpu_has(X86_FEATURE_EPB)) + if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENXIO; ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); @@ -514,7 +525,7 @@ { s16 epp; - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { /* * When hwp_req_data is 0, means that caller didn't read * MSR_HWP_REQUEST, so need to read and get EPP. @@ -539,7 +550,7 @@ u64 epb; int ret; - if (!static_cpu_has(X86_FEATURE_EPB)) + if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENXIO; ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); @@ -578,25 +589,28 @@ HWP_EPP_POWERSAVE }; -static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) +static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp) { s16 epp; int index = -EINVAL; + *raw_epp = 0; epp = intel_pstate_get_epp(cpu_data, 0); if (epp < 0) return epp; - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { if (epp == HWP_EPP_PERFORMANCE) return 1; - if (epp <= HWP_EPP_BALANCE_PERFORMANCE) + if (epp == HWP_EPP_BALANCE_PERFORMANCE) return 2; - if (epp <= HWP_EPP_BALANCE_POWERSAVE) + if (epp == HWP_EPP_BALANCE_POWERSAVE) return 3; - else + if (epp == HWP_EPP_POWERSAVE) return 4; - } else if (static_cpu_has(X86_FEATURE_EPB)) { + *raw_epp = epp; + return 0; + } else if (boot_cpu_has(X86_FEATURE_EPB)) { /* * Range: * 0x00-0x03 : Performance @@ -613,8 +627,35 @@ return index; } +static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp) +{ + int ret; + + /* + * Use the cached HWP Request MSR value, because in the active mode the + * register itself may be updated by intel_pstate_hwp_boost_up() or + * intel_pstate_hwp_boost_down() at any time. + */ + u64 value = READ_ONCE(cpu->hwp_req_cached); + + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + /* + * The only other updater of hwp_req_cached in the active mode, + * intel_pstate_hwp_set(), is called under the same lock as this + * function, so it cannot run in parallel with the update below. + */ + WRITE_ONCE(cpu->hwp_req_cached, value); + ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); + if (!ret) + cpu->epp_cached = epp; + + return ret; +} + static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, - int pref_index) + int pref_index, bool use_raw, + u32 raw_epp) { int epp = -EINVAL; int ret; @@ -622,29 +663,26 @@ if (!pref_index) epp = cpu_data->epp_default; - mutex_lock(&intel_pstate_limits_lock); - - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { - u64 value; - - ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value); - if (ret) - goto return_pref; - - value &= ~GENMASK_ULL(31, 24); - - if (epp == -EINVAL) + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { + if (use_raw) + epp = raw_epp; + else if (epp == -EINVAL) epp = epp_values[pref_index - 1]; - value |= (u64)epp << 24; - ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value); + /* + * To avoid confusion, refuse to set EPP to any values different + * from 0 (performance) if the current policy is "performance", + * because those values would be overridden. + */ + if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) + return -EBUSY; + + ret = intel_pstate_set_epp(cpu_data, epp); } else { if (epp == -EINVAL) epp = (pref_index - 1) << 2; ret = intel_pstate_set_epb(cpu_data->cpu, epp); } -return_pref: - mutex_unlock(&intel_pstate_limits_lock); return ret; } @@ -665,53 +703,127 @@ cpufreq_freq_attr_ro(energy_performance_available_preferences); +static struct cpufreq_driver intel_pstate; + static ssize_t store_energy_performance_preference( struct cpufreq_policy *policy, const char *buf, size_t count) { - struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cpudata *cpu = all_cpu_data[policy->cpu]; char str_preference[21]; - int ret; + bool raw = false; + ssize_t ret; + u32 epp = 0; ret = sscanf(buf, "%20s", str_preference); if (ret != 1) return -EINVAL; ret = match_string(energy_perf_strings, -1, str_preference); - if (ret < 0) - return ret; + if (ret < 0) { + if (!boot_cpu_has(X86_FEATURE_HWP_EPP)) + return ret; - intel_pstate_set_energy_pref_index(cpu_data, ret); - return count; + ret = kstrtouint(buf, 10, &epp); + if (ret) + return ret; + + if (epp > 255) + return -EINVAL; + + raw = true; + } + + /* + * This function runs with the policy R/W semaphore held, which + * guarantees that the driver pointer will not change while it is + * running. + */ + if (!intel_pstate_driver) + return -EAGAIN; + + mutex_lock(&intel_pstate_limits_lock); + + if (intel_pstate_driver == &intel_pstate) { + ret = intel_pstate_set_energy_pref_index(cpu, ret, raw, epp); + } else { + /* + * In the passive mode the governor needs to be stopped on the + * target CPU before the EPP update and restarted after it, + * which is super-heavy-weight, so make sure it is worth doing + * upfront. + */ + if (!raw) + epp = ret ? epp_values[ret - 1] : cpu->epp_default; + + if (cpu->epp_cached != epp) { + int err; + + cpufreq_stop_governor(policy); + ret = intel_pstate_set_epp(cpu, epp); + err = cpufreq_start_governor(policy); + if (!ret) + ret = err; + } else { + ret = 0; + } + } + + mutex_unlock(&intel_pstate_limits_lock); + + return ret ?: count; } static ssize_t show_energy_performance_preference( struct cpufreq_policy *policy, char *buf) { struct cpudata *cpu_data = all_cpu_data[policy->cpu]; - int preference; + int preference, raw_epp; - preference = intel_pstate_get_energy_pref_index(cpu_data); + preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp); if (preference < 0) return preference; - return sprintf(buf, "%s\n", energy_perf_strings[preference]); + if (raw_epp) + return sprintf(buf, "%d\n", raw_epp); + else + return sprintf(buf, "%s\n", energy_perf_strings[preference]); } cpufreq_freq_attr_rw(energy_performance_preference); +static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf) +{ + struct cpudata *cpu; + u64 cap; + int ratio; + + ratio = intel_pstate_get_cppc_guranteed(policy->cpu); + if (ratio <= 0) { + rdmsrl_on_cpu(policy->cpu, MSR_HWP_CAPABILITIES, &cap); + ratio = HWP_GUARANTEED_PERF(cap); + } + + cpu = all_cpu_data[policy->cpu]; + + return sprintf(buf, "%d\n", ratio * cpu->pstate.scaling); +} + +cpufreq_freq_attr_ro(base_frequency); + static struct freq_attr *hwp_cpufreq_attrs[] = { &energy_performance_preference, &energy_performance_available_preferences, + &base_frequency, NULL, }; -static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max, +static void intel_pstate_get_hwp_max(struct cpudata *cpu, int *phy_max, int *current_max) { u64 cap; - rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); - WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap); + rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap); + WRITE_ONCE(cpu->hwp_cap_cached, cap); if (global.no_turbo || global.turbo_disabled) *current_max = HWP_GUARANTEED_PERF(cap); else @@ -746,12 +858,6 @@ cpu_data->epp_policy = cpu_data->policy; - if (cpu_data->epp_saved >= 0) { - epp = cpu_data->epp_saved; - cpu_data->epp_saved = -EINVAL; - goto update_epp; - } - if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) { epp = intel_pstate_get_epp(cpu_data, value); cpu_data->epp_powersave = epp; @@ -778,8 +884,7 @@ epp = cpu_data->epp_powersave; } -update_epp: - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { value &= ~GENMASK_ULL(31, 24); value |= (u64)epp << 24; } else { @@ -790,34 +895,99 @@ wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } -static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy) +static void intel_pstate_hwp_offline(struct cpudata *cpu) { - struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + u64 value = READ_ONCE(cpu->hwp_req_cached); + int min_perf; - if (!hwp_active) - return 0; + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { + /* + * In case the EPP has been set to "performance" by the + * active mode "performance" scaling algorithm, replace that + * temporary value with the cached EPP one. + */ + value &= ~GENMASK_ULL(31, 24); + value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached); + WRITE_ONCE(cpu->hwp_req_cached, value); + } - cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0); + value &= ~GENMASK_ULL(31, 0); + min_perf = HWP_LOWEST_PERF(cpu->hwp_cap_cached); - return 0; + /* Set hwp_max = hwp_min */ + value |= HWP_MAX_PERF(min_perf); + value |= HWP_MIN_PERF(min_perf); + + /* Set EPP to min */ + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) + value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); + + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); +} + +#define POWER_CTL_EE_ENABLE 1 +#define POWER_CTL_EE_DISABLE 2 + +static int power_ctl_ee_state; + +static void set_power_ctl_ee_state(bool input) +{ + u64 power_ctl; + + mutex_lock(&intel_pstate_driver_lock); + rdmsrl(MSR_IA32_POWER_CTL, power_ctl); + if (input) { + power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); + power_ctl_ee_state = POWER_CTL_EE_ENABLE; + } else { + power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); + power_ctl_ee_state = POWER_CTL_EE_DISABLE; + } + wrmsrl(MSR_IA32_POWER_CTL, power_ctl); + mutex_unlock(&intel_pstate_driver_lock); } static void intel_pstate_hwp_enable(struct cpudata *cpudata); +static void intel_pstate_hwp_reenable(struct cpudata *cpu) +{ + intel_pstate_hwp_enable(cpu); + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached)); +} + +static int intel_pstate_suspend(struct cpufreq_policy *policy) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + + pr_debug("CPU %d suspending\n", cpu->cpu); + + cpu->suspended = true; + + return 0; +} + static int intel_pstate_resume(struct cpufreq_policy *policy) { - if (!hwp_active) - return 0; + struct cpudata *cpu = all_cpu_data[policy->cpu]; - mutex_lock(&intel_pstate_limits_lock); + pr_debug("CPU %d resuming\n", cpu->cpu); - if (policy->cpu == 0) - intel_pstate_hwp_enable(all_cpu_data[policy->cpu]); + /* Only restore if the system default is changed */ + if (power_ctl_ee_state == POWER_CTL_EE_ENABLE) + set_power_ctl_ee_state(true); + else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE) + set_power_ctl_ee_state(false); - all_cpu_data[policy->cpu]->epp_policy = 0; - intel_pstate_hwp_set(policy->cpu); + if (cpu->suspended && hwp_active) { + mutex_lock(&intel_pstate_limits_lock); - mutex_unlock(&intel_pstate_limits_lock); + /* Re-enable HWP, because "online" has not done that. */ + intel_pstate_hwp_reenable(cpu); + + mutex_unlock(&intel_pstate_limits_lock); + } + + cpu->suspended = false; return 0; } @@ -828,6 +998,44 @@ for_each_possible_cpu(cpu) cpufreq_update_policy(cpu); +} + +static void intel_pstate_update_max_freq(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); + struct cpudata *cpudata; + + if (!policy) + return; + + cpudata = all_cpu_data[cpu]; + policy->cpuinfo.max_freq = global.turbo_disabled_mf ? + cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; + + refresh_frequency_limits(policy); + + cpufreq_cpu_release(policy); +} + +static void intel_pstate_update_limits(unsigned int cpu) +{ + mutex_lock(&intel_pstate_driver_lock); + + update_turbo_state(); + /* + * If turbo has been turned on or off globally, policy limits for + * all CPUs need to be updated to reflect that. + */ + if (global.turbo_disabled_mf != global.turbo_disabled) { + global.turbo_disabled_mf = global.turbo_disabled; + arch_set_max_freq_ratio(global.turbo_disabled); + for_each_possible_cpu(cpu) + intel_pstate_update_max_freq(cpu); + } else { + cpufreq_update_policy(cpu); + } + + mutex_unlock(&intel_pstate_driver_lock); } /************************** sysfs begin ************************/ @@ -983,6 +1191,45 @@ return count; } +static void update_qos_request(enum freq_qos_req_type type) +{ + int max_state, turbo_max, freq, i, perf_pct; + struct freq_qos_request *req; + struct cpufreq_policy *policy; + + for_each_possible_cpu(i) { + struct cpudata *cpu = all_cpu_data[i]; + + policy = cpufreq_cpu_get(i); + if (!policy) + continue; + + req = policy->driver_data; + cpufreq_cpu_put(policy); + + if (!req) + continue; + + if (hwp_active) + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); + else + turbo_max = cpu->pstate.turbo_pstate; + + if (type == FREQ_QOS_MIN) { + perf_pct = global.min_perf_pct; + } else { + req++; + perf_pct = global.max_perf_pct; + } + + freq = DIV_ROUND_UP(turbo_max * perf_pct, 100); + freq *= cpu->pstate.scaling; + + if (freq_qos_update_request(req, freq) < 0) + pr_warn("Failed to update freq constraint: CPU%d\n", i); + } +} + static ssize_t store_max_perf_pct(struct kobject *a, struct kobj_attribute *b, const char *buf, size_t count) { @@ -1006,7 +1253,10 @@ mutex_unlock(&intel_pstate_limits_lock); - intel_pstate_update_policies(); + if (intel_pstate_driver == &intel_pstate) + intel_pstate_update_policies(); + else + update_qos_request(FREQ_QOS_MAX); mutex_unlock(&intel_pstate_driver_lock); @@ -1037,7 +1287,10 @@ mutex_unlock(&intel_pstate_limits_lock); - intel_pstate_update_policies(); + if (intel_pstate_driver == &intel_pstate) + intel_pstate_update_policies(); + else + update_qos_request(FREQ_QOS_MIN); mutex_unlock(&intel_pstate_driver_lock); @@ -1069,6 +1322,32 @@ return count; } +static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + u64 power_ctl; + int enable; + + rdmsrl(MSR_IA32_POWER_CTL, power_ctl); + enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE)); + return sprintf(buf, "%d\n", !enable); +} + +static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b, + const char *buf, size_t count) +{ + bool input; + int ret; + + ret = kstrtobool(buf, &input); + if (ret) + return ret; + + set_power_ctl_ee_state(input); + + return count; +} + show_one(max_perf_pct, max_perf_pct); show_one(min_perf_pct, min_perf_pct); @@ -1079,6 +1358,7 @@ define_one_global_ro(turbo_pct); define_one_global_ro(num_pstates); define_one_global_rw(hwp_dynamic_boost); +define_one_global_rw(energy_efficiency); static struct attribute *intel_pstate_attributes[] = { &status.attr, @@ -1092,9 +1372,12 @@ .attrs = intel_pstate_attributes, }; +static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[]; + +static struct kobject *intel_pstate_kobject; + static void __init intel_pstate_sysfs_expose_params(void) { - struct kobject *intel_pstate_kobject; int rc; intel_pstate_kobject = kobject_create_and_add("intel_pstate", @@ -1119,43 +1402,60 @@ rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); WARN_ON(rc); - if (hwp_active) { - rc = sysfs_create_file(intel_pstate_kobject, - &hwp_dynamic_boost.attr); + if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) { + rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr); WARN_ON(rc); } } + +static void __init intel_pstate_sysfs_remove(void) +{ + if (!intel_pstate_kobject) + return; + + sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group); + + if (!per_cpu_limits) { + sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr); + sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr); + + if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) + sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr); + } + + kobject_put(intel_pstate_kobject); +} + +static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void) +{ + int rc; + + if (!hwp_active) + return; + + rc = sysfs_create_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); + WARN_ON_ONCE(rc); +} + +static void intel_pstate_sysfs_hide_hwp_dynamic_boost(void) +{ + if (!hwp_active) + return; + + sysfs_remove_file(intel_pstate_kobject, &hwp_dynamic_boost.attr); +} + /************************** sysfs end ************************/ static void intel_pstate_hwp_enable(struct cpudata *cpudata) { /* First disable HWP notification interrupt as we don't process them */ - if (static_cpu_has(X86_FEATURE_HWP_NOTIFY)) + if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); - cpudata->epp_policy = 0; if (cpudata->epp_default == -EINVAL) cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); -} - -#define MSR_IA32_POWER_CTL_BIT_EE 19 - -/* Disable energy efficiency optimization */ -static void intel_pstate_disable_ee(int cpu) -{ - u64 power_ctl; - int ret; - - ret = rdmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, &power_ctl); - if (ret) - return; - - if (!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE))) { - pr_info("Disabling energy efficiency optimization\n"); - power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); - wrmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, power_ctl); - } } static int atom_get_min_pstate(void) @@ -1383,12 +1683,6 @@ return ret; } -static int intel_pstate_get_base_pstate(struct cpudata *cpu) -{ - return global.no_turbo || global.turbo_disabled ? - cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; -} - static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) { trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); @@ -1409,11 +1703,9 @@ static void intel_pstate_max_within_limits(struct cpudata *cpu) { - int pstate; + int pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); update_turbo_state(); - pstate = intel_pstate_get_base_pstate(cpu); - pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); intel_pstate_set_pstate(cpu, pstate); } @@ -1427,7 +1719,7 @@ if (hwp_active && !hwp_mode_bdw) { unsigned int phy_max, current_max; - intel_pstate_get_hwp_max(cpu->cpu, &phy_max, ¤t_max); + intel_pstate_get_hwp_max(cpu, &phy_max, ¤t_max); cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling; cpu->pstate.turbo_pstate = phy_max; cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(READ_ONCE(cpu->hwp_cap_cached)); @@ -1619,17 +1911,14 @@ static inline int32_t get_target_pstate(struct cpudata *cpu) { struct sample *sample = &cpu->sample; - int32_t busy_frac, boost; + int32_t busy_frac; int target, avg_pstate; busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, sample->tsc); - boost = cpu->iowait_boost; - cpu->iowait_boost >>= 1; - - if (busy_frac < boost) - busy_frac = boost; + if (busy_frac < cpu->iowait_boost) + busy_frac = cpu->iowait_boost; sample->busy_scaled = busy_frac * 100; @@ -1656,11 +1945,9 @@ static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) { - int max_pstate = intel_pstate_get_base_pstate(cpu); - int min_pstate; + int min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio); + int max_pstate = max(min_pstate, cpu->max_perf_ratio); - min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio); - max_pstate = max(min_pstate, cpu->max_perf_ratio); return clamp_t(int, pstate, min_pstate, max_pstate); } @@ -1708,29 +1995,30 @@ if (smp_processor_id() != cpu->cpu) return; + delta_ns = time - cpu->last_update; if (flags & SCHED_CPUFREQ_IOWAIT) { - cpu->iowait_boost = int_tofp(1); - cpu->last_update = time; - /* - * The last time the busy was 100% so P-state was max anyway - * so avoid overhead of computation. - */ - if (fp_toint(cpu->sample.busy_scaled) == 100) - return; - - goto set_pstate; + /* Start over if the CPU may have been idle. */ + if (delta_ns > TICK_NSEC) { + cpu->iowait_boost = ONE_EIGHTH_FP; + } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) { + cpu->iowait_boost <<= 1; + if (cpu->iowait_boost > int_tofp(1)) + cpu->iowait_boost = int_tofp(1); + } else { + cpu->iowait_boost = ONE_EIGHTH_FP; + } } else if (cpu->iowait_boost) { /* Clear iowait_boost if the CPU may have been idle. */ - delta_ns = time - cpu->last_update; if (delta_ns > TICK_NSEC) cpu->iowait_boost = 0; + else + cpu->iowait_boost >>= 1; } cpu->last_update = time; delta_ns = time - cpu->sample.time; if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL) return; -set_pstate: if (intel_pstate_sample(cpu, time)) intel_pstate_adjust_pstate(cpu); } @@ -1774,51 +2062,51 @@ .get_val = core_get_val, }; -#define ICPU(model, policy) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\ - (unsigned long)&policy } +#define X86_MATCH(model, policy) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ + X86_FEATURE_APERFMPERF, &policy) static const struct x86_cpu_id intel_pstate_cpu_ids[] = { - ICPU(INTEL_FAM6_SANDYBRIDGE, core_funcs), - ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_funcs), - ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_funcs), - ICPU(INTEL_FAM6_IVYBRIDGE, core_funcs), - ICPU(INTEL_FAM6_HASWELL_CORE, core_funcs), - ICPU(INTEL_FAM6_BROADWELL_CORE, core_funcs), - ICPU(INTEL_FAM6_IVYBRIDGE_X, core_funcs), - ICPU(INTEL_FAM6_HASWELL_X, core_funcs), - ICPU(INTEL_FAM6_HASWELL_ULT, core_funcs), - ICPU(INTEL_FAM6_HASWELL_GT3E, core_funcs), - ICPU(INTEL_FAM6_BROADWELL_GT3E, core_funcs), - ICPU(INTEL_FAM6_ATOM_AIRMONT, airmont_funcs), - ICPU(INTEL_FAM6_SKYLAKE_MOBILE, core_funcs), - ICPU(INTEL_FAM6_BROADWELL_X, core_funcs), - ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_funcs), - ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_funcs), - ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_funcs), - ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_funcs), - ICPU(INTEL_FAM6_ATOM_GOLDMONT, core_funcs), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS, core_funcs), - ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs), + X86_MATCH(SANDYBRIDGE, core_funcs), + X86_MATCH(SANDYBRIDGE_X, core_funcs), + X86_MATCH(ATOM_SILVERMONT, silvermont_funcs), + X86_MATCH(IVYBRIDGE, core_funcs), + X86_MATCH(HASWELL, core_funcs), + X86_MATCH(BROADWELL, core_funcs), + X86_MATCH(IVYBRIDGE_X, core_funcs), + X86_MATCH(HASWELL_X, core_funcs), + X86_MATCH(HASWELL_L, core_funcs), + X86_MATCH(HASWELL_G, core_funcs), + X86_MATCH(BROADWELL_G, core_funcs), + X86_MATCH(ATOM_AIRMONT, airmont_funcs), + X86_MATCH(SKYLAKE_L, core_funcs), + X86_MATCH(BROADWELL_X, core_funcs), + X86_MATCH(SKYLAKE, core_funcs), + X86_MATCH(BROADWELL_D, core_funcs), + X86_MATCH(XEON_PHI_KNL, knl_funcs), + X86_MATCH(XEON_PHI_KNM, knl_funcs), + X86_MATCH(ATOM_GOLDMONT, core_funcs), + X86_MATCH(ATOM_GOLDMONT_PLUS, core_funcs), + X86_MATCH(SKYLAKE_X, core_funcs), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { - ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_funcs), - ICPU(INTEL_FAM6_BROADWELL_X, core_funcs), - ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs), + X86_MATCH(BROADWELL_D, core_funcs), + X86_MATCH(BROADWELL_X, core_funcs), + X86_MATCH(SKYLAKE_X, core_funcs), {} }; static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { - ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, core_funcs), + X86_MATCH(KABYLAKE, core_funcs), {} }; static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = { - ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs), - ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_funcs), + X86_MATCH(SKYLAKE_X, core_funcs), + X86_MATCH(SKYLAKE, core_funcs), {} }; @@ -1835,28 +2123,30 @@ all_cpu_data[cpunum] = cpu; + cpu->cpu = cpunum; + cpu->epp_default = -EINVAL; - cpu->epp_powersave = -EINVAL; - cpu->epp_saved = -EINVAL; + + if (hwp_active) { + const struct x86_cpu_id *id; + + intel_pstate_hwp_enable(cpu); + + id = x86_match_cpu(intel_pstate_hwp_boost_ids); + if (id && intel_pstate_acpi_pm_profile_server()) + hwp_boost = true; + } + } else if (hwp_active) { + /* + * Re-enable HWP in case this happens after a resume from ACPI + * S3 if the CPU was offline during the whole system/resume + * cycle. + */ + intel_pstate_hwp_reenable(cpu); } - cpu = all_cpu_data[cpunum]; - - cpu->cpu = cpunum; - - if (hwp_active) { - const struct x86_cpu_id *id; - - id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); - if (id) - intel_pstate_disable_ee(cpunum); - - intel_pstate_hwp_enable(cpu); - - id = x86_match_cpu(intel_pstate_hwp_boost_ids); - if (id && intel_pstate_acpi_pm_profile_server()) - hwp_boost = true; - } + cpu->epp_powersave = -EINVAL; + cpu->epp_policy = 0; intel_pstate_get_cpu_pstates(cpu); @@ -1893,7 +2183,7 @@ cpufreq_remove_update_util_hook(cpu); cpu_data->update_util_set = false; - synchronize_sched(); + synchronize_rcu(); } static int intel_pstate_get_max_freq(struct cpudata *cpu) @@ -1902,12 +2192,13 @@ cpu->pstate.max_freq : cpu->pstate.turbo_freq; } -static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy, - struct cpudata *cpu) +static void intel_pstate_update_perf_limits(struct cpudata *cpu, + unsigned int policy_min, + unsigned int policy_max) { - int max_freq = intel_pstate_get_max_freq(cpu); int32_t max_policy_perf, min_policy_perf; int max_state, turbo_max; + int max_freq; /* * HWP needs some special consideration, because on BDX the @@ -1915,24 +2206,25 @@ * rather than pure ratios. */ if (hwp_active) { - intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state); + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); } else { - max_state = intel_pstate_get_base_pstate(cpu); + max_state = global.no_turbo || global.turbo_disabled ? + cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; turbo_max = cpu->pstate.turbo_pstate; } + max_freq = max_state * cpu->pstate.scaling; - max_policy_perf = max_state * policy->max / max_freq; - if (policy->max == policy->min) { + max_policy_perf = max_state * policy_max / max_freq; + if (policy_max == policy_min) { min_policy_perf = max_policy_perf; } else { - min_policy_perf = max_state * policy->min / max_freq; + min_policy_perf = max_state * policy_min / max_freq; min_policy_perf = clamp_t(int32_t, min_policy_perf, 0, max_policy_perf); } pr_debug("cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d\n", - policy->cpu, max_state, - min_policy_perf, max_policy_perf); + cpu->cpu, max_state, min_policy_perf, max_policy_perf); /* Normalize user input to [min_perf, max_perf] */ if (per_cpu_limits) { @@ -1946,7 +2238,7 @@ global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); global_min = clamp_t(int32_t, global_min, 0, global_max); - pr_debug("cpu:%d global_min:%d global_max:%d\n", policy->cpu, + pr_debug("cpu:%d global_min:%d global_max:%d\n", cpu->cpu, global_min, global_max); cpu->min_perf_ratio = max(min_policy_perf, global_min); @@ -1959,7 +2251,7 @@ cpu->max_perf_ratio); } - pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", policy->cpu, + pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", cpu->cpu, cpu->max_perf_ratio, cpu->min_perf_ratio); } @@ -1979,7 +2271,7 @@ mutex_lock(&intel_pstate_limits_lock); - intel_pstate_update_perf_limits(policy, cpu); + intel_pstate_update_perf_limits(cpu, policy->min, policy->max); if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { /* @@ -2008,8 +2300,8 @@ return 0; } -static void intel_pstate_adjust_policy_max(struct cpufreq_policy *policy, - struct cpudata *cpu) +static void intel_pstate_adjust_policy_max(struct cpudata *cpu, + struct cpufreq_policy_data *policy) { if (!hwp_active && cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && @@ -2020,42 +2312,87 @@ } } -static int intel_pstate_verify_policy(struct cpufreq_policy *policy) +static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, + struct cpufreq_policy_data *policy) { - struct cpudata *cpu = all_cpu_data[policy->cpu]; + int max_freq; update_turbo_state(); - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - intel_pstate_get_max_freq(cpu)); + if (hwp_active) { + int max_state, turbo_max; - if (policy->policy != CPUFREQ_POLICY_POWERSAVE && - policy->policy != CPUFREQ_POLICY_PERFORMANCE) - return -EINVAL; + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); + max_freq = max_state * cpu->pstate.scaling; + } else { + max_freq = intel_pstate_get_max_freq(cpu); + } + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, max_freq); - intel_pstate_adjust_policy_max(policy, cpu); + intel_pstate_adjust_policy_max(cpu, policy); +} + +static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) +{ + intel_pstate_verify_cpu_policy(all_cpu_data[policy->cpu], policy); return 0; } -static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) +static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) { - intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); + struct cpudata *cpu = all_cpu_data[policy->cpu]; + + pr_debug("CPU %d going offline\n", cpu->cpu); + + if (cpu->suspended) + return 0; + + /* + * If the CPU is an SMT thread and it goes offline with the performance + * settings different from the minimum, it will prevent its sibling + * from getting to lower performance levels, so force the minimum + * performance on CPU offline to prevent that from happening. + */ + if (hwp_active) + intel_pstate_hwp_offline(cpu); + else + intel_pstate_set_min_pstate(cpu); + + intel_pstate_exit_perf_limits(policy); + + return 0; +} + +static int intel_pstate_cpu_online(struct cpufreq_policy *policy) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + + pr_debug("CPU %d going online\n", cpu->cpu); + + intel_pstate_init_acpi_perf_limits(policy); + + if (hwp_active) { + /* + * Re-enable HWP and clear the "suspended" flag to let "resume" + * know that it need not do that. + */ + intel_pstate_hwp_reenable(cpu); + cpu->suspended = false; + } + + return 0; } static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) { - pr_debug("CPU %d exiting\n", policy->cpu); + pr_debug("CPU %d stopping\n", policy->cpu); intel_pstate_clear_update_util_hook(policy->cpu); - if (hwp_active) - intel_pstate_hwp_save_state(policy); - else - intel_cpufreq_stop_cpu(policy); } static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) { - intel_pstate_exit_perf_limits(policy); + pr_debug("CPU %d exiting\n", policy->cpu); policy->fast_switch_possible = false; @@ -2082,6 +2419,7 @@ /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; update_turbo_state(); + global.turbo_disabled_mf = global.turbo_disabled; policy->cpuinfo.max_freq = global.turbo_disabled ? cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; policy->cpuinfo.max_freq *= cpu->pstate.scaling; @@ -2109,10 +2447,17 @@ if (ret) return ret; - if (IS_ENABLED(CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE)) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; + /* + * Set the policy to powersave to provide a valid fallback value in case + * the default cpufreq governor is neither powersave nor performance. + */ + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + if (hwp_active) { + struct cpudata *cpu = all_cpu_data[policy->cpu]; + + cpu->epp_cached = intel_pstate_get_epp(cpu, 0); + } return 0; } @@ -2121,25 +2466,23 @@ .flags = CPUFREQ_CONST_LOOPS, .verify = intel_pstate_verify_policy, .setpolicy = intel_pstate_set_policy, - .suspend = intel_pstate_hwp_save_state, + .suspend = intel_pstate_suspend, .resume = intel_pstate_resume, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, .stop_cpu = intel_pstate_stop_cpu, + .offline = intel_pstate_cpu_offline, + .online = intel_pstate_cpu_online, + .update_limits = intel_pstate_update_limits, .name = "intel_pstate", }; -static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) +static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy) { struct cpudata *cpu = all_cpu_data[policy->cpu]; - update_turbo_state(); - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - intel_pstate_get_max_freq(cpu)); - - intel_pstate_adjust_policy_max(policy, cpu); - - intel_pstate_update_perf_limits(policy, cpu); + intel_pstate_verify_cpu_policy(cpu, policy); + intel_pstate_update_perf_limits(cpu, policy->min, policy->max); return 0; } @@ -2182,13 +2525,71 @@ fp_toint(cpu->iowait_boost * 100)); } +static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate, + bool strict, bool fast_switch) +{ + u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev; + + value &= ~HWP_MIN_PERF(~0L); + value |= HWP_MIN_PERF(target_pstate); + + /* + * The entire MSR needs to be updated in order to update the HWP min + * field in it, so opportunistically update the max too if needed. + */ + value &= ~HWP_MAX_PERF(~0L); + value |= HWP_MAX_PERF(strict ? target_pstate : cpu->max_perf_ratio); + + if (value == prev) + return; + + WRITE_ONCE(cpu->hwp_req_cached, value); + if (fast_switch) + wrmsrl(MSR_HWP_REQUEST, value); + else + wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); +} + +static void intel_cpufreq_adjust_perf_ctl(struct cpudata *cpu, + u32 target_pstate, bool fast_switch) +{ + if (fast_switch) + wrmsrl(MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, target_pstate)); + else + wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, target_pstate)); +} + +static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, + int target_pstate, bool fast_switch) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + int old_pstate = cpu->pstate.current_pstate; + + target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + if (hwp_active) { + intel_cpufreq_adjust_hwp(cpu, target_pstate, + policy->strict_target, fast_switch); + cpu->pstate.current_pstate = target_pstate; + } else if (target_pstate != old_pstate) { + intel_cpufreq_adjust_perf_ctl(cpu, target_pstate, fast_switch); + cpu->pstate.current_pstate = target_pstate; + } + + intel_cpufreq_trace(cpu, fast_switch ? INTEL_PSTATE_TRACE_FAST_SWITCH : + INTEL_PSTATE_TRACE_TARGET, old_pstate); + + return target_pstate; +} + static int intel_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { struct cpudata *cpu = all_cpu_data[policy->cpu]; struct cpufreq_freqs freqs; - int target_pstate, old_pstate; + int target_pstate; update_turbo_state(); @@ -2196,6 +2597,7 @@ freqs.new = target_freq; cpufreq_freq_transition_begin(policy, &freqs); + switch (relation) { case CPUFREQ_RELATION_L: target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); @@ -2207,15 +2609,11 @@ target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); break; } - target_pstate = intel_pstate_prepare_request(cpu, target_pstate); - old_pstate = cpu->pstate.current_pstate; - if (target_pstate != cpu->pstate.current_pstate) { - cpu->pstate.current_pstate = target_pstate; - wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, - pstate_funcs.get_val(cpu, target_pstate)); - } + + target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, false); + freqs.new = target_pstate * cpu->pstate.scaling; - intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate); + cpufreq_freq_transition_end(policy, &freqs, false); return 0; @@ -2225,31 +2623,101 @@ unsigned int target_freq) { struct cpudata *cpu = all_cpu_data[policy->cpu]; - int target_pstate, old_pstate; + int target_pstate; update_turbo_state(); target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); - target_pstate = intel_pstate_prepare_request(cpu, target_pstate); - old_pstate = cpu->pstate.current_pstate; - intel_pstate_update_pstate(cpu, target_pstate); - intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); + + target_pstate = intel_cpufreq_update_pstate(policy, target_pstate, true); + return target_pstate * cpu->pstate.scaling; } static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) { - int ret = __intel_pstate_cpu_init(policy); + int max_state, turbo_max, min_freq, max_freq, ret; + struct freq_qos_request *req; + struct cpudata *cpu; + struct device *dev; + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + ret = __intel_pstate_cpu_init(policy); if (ret) return ret; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; - policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq; + req = kcalloc(2, sizeof(*req), GFP_KERNEL); + if (!req) { + ret = -ENOMEM; + goto pstate_exit; + } + + cpu = all_cpu_data[policy->cpu]; + + if (hwp_active) { + u64 value; + + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; + rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value); + WRITE_ONCE(cpu->hwp_req_cached, value); + cpu->epp_cached = intel_pstate_get_epp(cpu, value); + } else { + turbo_max = cpu->pstate.turbo_pstate; + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; + } + + min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); + min_freq *= cpu->pstate.scaling; + max_freq = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100); + max_freq *= cpu->pstate.scaling; + + ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN, + min_freq); + if (ret < 0) { + dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); + goto free_req; + } + + ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX, + max_freq); + if (ret < 0) { + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); + goto remove_min_req; + } + + policy->driver_data = req; + return 0; + +remove_min_req: + freq_qos_remove_request(req); +free_req: + kfree(req); +pstate_exit: + intel_pstate_exit_perf_limits(policy); + + return ret; +} + +static int intel_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct freq_qos_request *req; + + req = policy->driver_data; + + freq_qos_remove_request(req + 1); + freq_qos_remove_request(req); + kfree(req); + + return intel_pstate_cpu_exit(policy); } static struct cpufreq_driver intel_cpufreq = { @@ -2258,12 +2726,16 @@ .target = intel_cpufreq_target, .fast_switch = intel_cpufreq_fast_switch, .init = intel_cpufreq_cpu_init, - .exit = intel_pstate_cpu_exit, - .stop_cpu = intel_cpufreq_stop_cpu, + .exit = intel_cpufreq_cpu_exit, + .offline = intel_pstate_cpu_offline, + .online = intel_pstate_cpu_online, + .suspend = intel_pstate_suspend, + .resume = intel_pstate_resume, + .update_limits = intel_pstate_update_limits, .name = "intel_cpufreq", }; -static struct cpufreq_driver *default_driver = &intel_pstate; +static struct cpufreq_driver *default_driver; static void intel_pstate_driver_cleanup(void) { @@ -2280,12 +2752,16 @@ } } put_online_cpus(); + intel_pstate_driver = NULL; } static int intel_pstate_register_driver(struct cpufreq_driver *driver) { int ret; + + if (driver == &intel_pstate) + intel_pstate_sysfs_expose_hwp_dynamic_boost(); memset(&global, 0, sizeof(global)); global.max_perf_pct = 100; @@ -2302,17 +2778,6 @@ return 0; } -static int intel_pstate_unregister_driver(void) -{ - if (hwp_active) - return -EBUSY; - - cpufreq_unregister_driver(intel_pstate_driver); - intel_pstate_driver_cleanup(); - - return 0; -} - static ssize_t intel_pstate_show_status(char *buf) { if (!intel_pstate_driver) @@ -2324,8 +2789,6 @@ static int intel_pstate_update_status(const char *buf, size_t size) { - int ret; - if (size == 3 && !strncmp(buf, "off", size)) { if (!intel_pstate_driver) return -EINVAL; @@ -2333,7 +2796,9 @@ if (hwp_active) return -EBUSY; - return intel_pstate_unregister_driver(); + cpufreq_unregister_driver(intel_pstate_driver); + intel_pstate_driver_cleanup(); + return 0; } if (size == 6 && !strncmp(buf, "active", size)) { @@ -2341,9 +2806,7 @@ if (intel_pstate_driver == &intel_pstate) return 0; - ret = intel_pstate_unregister_driver(); - if (ret) - return ret; + cpufreq_unregister_driver(intel_pstate_driver); } return intel_pstate_register_driver(&intel_pstate); @@ -2354,9 +2817,8 @@ if (intel_pstate_driver == &intel_cpufreq) return 0; - ret = intel_pstate_unregister_driver(); - if (ret) - return ret; + cpufreq_unregister_driver(intel_pstate_driver); + intel_pstate_sysfs_hide_hwp_dynamic_boost(); } return intel_pstate_register_driver(&intel_cpufreq); @@ -2420,6 +2882,7 @@ kfree(pss); } + pr_debug("ACPI _PSS not found\n"); return true; } @@ -2430,9 +2893,14 @@ status = acpi_get_handle(NULL, "\\_SB", &handle); if (ACPI_FAILURE(status)) - return true; + goto not_found; - return !acpi_has_method(handle, "PCCH"); + if (acpi_has_method(handle, "PCCH")) + return false; + +not_found: + pr_debug("ACPI PCCH not found\n"); + return true; } static bool __init intel_pstate_has_acpi_ppc(void) @@ -2447,6 +2915,7 @@ if (acpi_has_method(pr->handle, "_PPC")) return true; } + pr_debug("ACPI _PPC not found\n"); return false; } @@ -2457,23 +2926,25 @@ /* Hardware vendor-specific info that has its own power management modes */ static struct acpi_platform_list plat_info[] __initdata = { - {"HP ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, 0, PSS}, - {"ORACLE", "X4-2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4-2L ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4-2B ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X3-2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X3-2L ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X3-2B ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "X6-2 ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, - {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, 0, PPC}, + {"HP ", "ProLiant", 0, ACPI_SIG_FADT, all_versions, NULL, PSS}, + {"ORACLE", "X4-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X3-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X3-2L ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X3-2B ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4470M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4270M3 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4270M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4170M2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4170 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X4275 M3", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "X6-2 ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, + {"ORACLE", "Sudbury ", 0, ACPI_SIG_FADT, all_versions, NULL, PPC}, { } /* End */ }; + +#define BITMASK_OOB (BIT(8) | BIT(18)) static bool __init intel_pstate_platform_pwr_mgmt_exists(void) { @@ -2484,8 +2955,11 @@ id = x86_match_cpu(intel_pstate_cpu_oob_ids); if (id) { rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr); - if ( misc_pwr & (1 << 8)) + if (misc_pwr & BITMASK_OOB) { + pr_debug("Bit 8 or 18 in the MISC_PWR_MGMT MSR set\n"); + pr_debug("P states are controlled in Out of Band mode by the firmware/hardware\n"); return true; + } } idx = acpi_match_platform_list(plat_info); @@ -2522,51 +2996,93 @@ #define INTEL_PSTATE_HWP_BROADWELL 0x01 -#define ICPU_HWP(model, hwp_mode) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_HWP, hwp_mode } +#define X86_MATCH_HWP(model, hwp_mode) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ + X86_FEATURE_HWP, hwp_mode) static const struct x86_cpu_id hwp_support_ids[] __initconst = { - ICPU_HWP(INTEL_FAM6_BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), - ICPU_HWP(INTEL_FAM6_BROADWELL_XEON_D, INTEL_PSTATE_HWP_BROADWELL), - ICPU_HWP(X86_MODEL_ANY, 0), + X86_MATCH_HWP(BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), + X86_MATCH_HWP(BROADWELL_D, INTEL_PSTATE_HWP_BROADWELL), + X86_MATCH_HWP(ANY, 0), {} }; + +static bool intel_pstate_hwp_is_enabled(void) +{ + u64 value; + + rdmsrl(MSR_PM_ENABLE, value); + return !!(value & 0x1); +} static int __init intel_pstate_init(void) { const struct x86_cpu_id *id; int rc; - if (no_load) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; id = x86_match_cpu(hwp_support_ids); if (id) { + bool hwp_forced = intel_pstate_hwp_is_enabled(); + + if (hwp_forced) + pr_info("HWP enabled by BIOS\n"); + else if (no_load) + return -ENODEV; + copy_cpu_funcs(&core_funcs); - if (!no_hwp) { + /* + * Avoid enabling HWP for processors without EPP support, + * because that means incomplete HWP implementation which is a + * corner case and supporting it is generally problematic. + * + * If HWP is enabled already, though, there is no choice but to + * deal with it. + */ + if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) { hwp_active++; hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; + intel_cpufreq.attr = hwp_cpufreq_attrs; + intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS; + if (!default_driver) + default_driver = &intel_pstate; + goto hwp_cpu_matched; } + pr_info("HWP not enabled\n"); } else { - id = x86_match_cpu(intel_pstate_cpu_ids); - if (!id) + if (no_load) return -ENODEV; + + id = x86_match_cpu(intel_pstate_cpu_ids); + if (!id) { + pr_info("CPU model not supported\n"); + return -ENODEV; + } copy_cpu_funcs((struct pstate_funcs *)id->driver_data); } - if (intel_pstate_msrs_not_valid()) + if (intel_pstate_msrs_not_valid()) { + pr_info("Invalid MSRs\n"); return -ENODEV; + } + /* Without HWP start in the passive mode. */ + if (!default_driver) + default_driver = &intel_cpufreq; hwp_cpu_matched: /* * The Intel pstate driver will be ignored if the platform * firmware has its own power management modes. */ - if (intel_pstate_platform_pwr_mgmt_exists()) + if (intel_pstate_platform_pwr_mgmt_exists()) { + pr_info("P-states controlled by the platform\n"); return -ENODEV; + } if (!hwp_active && hwp_only) return -ENOTSUPP; @@ -2584,11 +3100,22 @@ mutex_lock(&intel_pstate_driver_lock); rc = intel_pstate_register_driver(default_driver); mutex_unlock(&intel_pstate_driver_lock); - if (rc) + if (rc) { + intel_pstate_sysfs_remove(); return rc; + } - if (hwp_active) + if (hwp_active) { + const struct x86_cpu_id *id; + + id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); + if (id) { + set_power_ctl_ee_state(false); + pr_info("Disabling energy efficiency optimization\n"); + } + pr_info("HWP enabled\n"); + } return 0; } @@ -2599,17 +3126,16 @@ if (!str) return -EINVAL; - if (!strcmp(str, "disable")) { + if (!strcmp(str, "disable")) no_load = 1; - } else if (!strcmp(str, "passive")) { - pr_info("Passive mode enabled\n"); + else if (!strcmp(str, "active")) + default_driver = &intel_pstate; + else if (!strcmp(str, "passive")) default_driver = &intel_cpufreq; + + if (!strcmp(str, "no_hwp")) no_hwp = 1; - } - if (!strcmp(str, "no_hwp")) { - pr_info("HWP disabled\n"); - no_hwp = 1; - } + if (!strcmp(str, "force")) force_load = 1; if (!strcmp(str, "hwp_only")) -- Gitblit v1.6.2