From f9f4872df6e1801572949f8a370c886122d4b6da Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 8 Oct 2016 12:42:38 -0700 Subject: cpufreq: intel_pstate: Fix unsafe HWP MSR access This is a requirement that MSR MSR_PM_ENABLE must be set to 0x01 before reading MSR_HWP_CAPABILITIES on a given CPU. If cpufreq init() is scheduled on a CPU which is not same as policy->cpu or migrates to a different CPU before calling msr read for MSR_HWP_CAPABILITIES, it is possible that MSR_PM_ENABLE was not to set to 0x01 on that CPU. This will cause GP fault. So like other places in this path rdmsrl_on_cpu should be used instead of rdmsrl. Moreover the scope of MSR_HWP_CAPABILITIES is on per thread basis, so it should be read from the same CPU, for which MSR MSR_HWP_REQUEST is getting set. dmesg dump or warning: [ 22.014488] WARNING: CPU: 139 PID: 1 at arch/x86/mm/extable.c:50 ex_handler_rdmsr_unsafe+0x68/0x70 [ 22.014492] unchecked MSR access error: RDMSR from 0x771 [ 22.014493] Modules linked in: [ 22.014507] CPU: 139 PID: 1 Comm: swapper/0 Not tainted 4.7.5+ #1 ... ... [ 22.014516] Call Trace: [ 22.014542] [] dump_stack+0x63/0x82 [ 22.014558] [] __warn+0xcb/0xf0 [ 22.014561] [] warn_slowpath_fmt+0x4f/0x60 [ 22.014563] [] ex_handler_rdmsr_unsafe+0x68/0x70 [ 22.014564] [] fixup_exception+0x39/0x50 [ 22.014604] [] do_general_protection+0x80/0x150 [ 22.014610] [] general_protection+0x28/0x30 [ 22.014635] [] ? get_target_pstate_use_performance+0xb0/0xb0 [ 22.014642] [] ? native_read_msr+0x7/0x40 [ 22.014657] [] intel_pstate_hwp_set+0x23/0x130 [ 22.014660] [] intel_pstate_set_policy+0x1b6/0x340 [ 22.014662] [] cpufreq_set_policy+0xeb/0x2c0 [ 22.014664] [] cpufreq_init_policy+0x79/0xe0 [ 22.014666] [] ? cpufreq_update_policy+0x120/0x120 [ 22.014669] [] cpufreq_online+0x406/0x820 [ 22.014671] [] cpufreq_add_dev+0x5f/0x90 [ 22.014717] [] subsys_interface_register+0xb8/0x100 [ 22.014719] [] cpufreq_register_driver+0x14c/0x210 [ 22.014749] [] intel_pstate_init+0x39d/0x4d5 [ 22.014751] [] ? cpufreq_gov_dbs_init+0x12/0x12 Cc: 4.3+ # 4.3+ Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 806f2039571e..bc976a3db253 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -562,12 +562,12 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask) int min, hw_min, max, hw_max, cpu, range, adj_range; u64 value, cap; - rdmsrl(MSR_HWP_CAPABILITIES, cap); - hw_min = HWP_LOWEST_PERF(cap); - hw_max = HWP_HIGHEST_PERF(cap); - range = hw_max - hw_min; - for_each_cpu(cpu, cpumask) { + rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); + hw_min = HWP_LOWEST_PERF(cap); + hw_max = HWP_HIGHEST_PERF(cap); + range = hw_max - hw_min; + rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); adj_range = limits->min_perf_pct * range / 100; min = hw_min + adj_range; -- cgit v1.2.1 From f00593a4bdd22e0885db89df8ee8afcc6867994f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Sep 2016 03:13:34 +0200 Subject: cpufreq: intel_pstate: Clarify comment in get_target_pstate_use_performance() Make the comment explaining the meaning of the perf_scaled variable in get_target_pstate_use_performance() more straightforward. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bc976a3db253..1c7b91c5805b 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1251,10 +1251,11 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) u64 duration_ns; /* - * perf_scaled is the average performance during the last sampling - * period scaled by the ratio of the maximum P-state to the P-state - * requested last time (in percent). That measures the system's - * response to the previous P-state selection. + * perf_scaled is the ratio of the average P-state during the last + * sampling period to the P-state requested last time (in percent). + * + * That measures the system's response to the previous P-state + * selection. */ max_pstate = cpu->pstate.max_pstate_physical; current_pstate = cpu->pstate.current_pstate; -- cgit v1.2.1 From 0843e83c1a4aa67e08f424430526c948d591d5f1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Oct 2016 14:07:51 +0200 Subject: cpufreq: intel_pstate: Proportional algorithm for Atom The PID algorithm used by the intel_pstate driver tends to drive performance to the minimum for workloads with utilization below the setpoint, which is undesirable, so replace it with a modified "proportional" algorithm on Atom. The new algorithm will set the new P-state to be 1.25 times the available maximum times the (frequency-invariant) utilization during the previous sampling period except when the target P-state computed this way is lower than the average P-state during the previous sampling period. In the latter case, it will increase the target by 50% of the difference between it and the average P-state to prevent performance from dropping down too fast in some cases. Signed-off-by: Rafael J. Wysocki Tested-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1c7b91c5805b..6c8c897d0a2d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1232,6 +1232,7 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) { struct sample *sample = &cpu->sample; int32_t busy_frac, boost; + int target, avg_pstate; busy_frac = div_fp(sample->mperf, sample->tsc); @@ -1242,7 +1243,26 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) busy_frac = boost; sample->busy_scaled = busy_frac * 100; - return get_avg_pstate(cpu) - pid_calc(&cpu->pid, sample->busy_scaled); + + target = limits->no_turbo || limits->turbo_disabled ? + cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; + target += target >> 2; + target = mul_fp(target, busy_frac); + if (target < cpu->pstate.min_pstate) + target = cpu->pstate.min_pstate; + + /* + * If the average P-state during the previous cycle was higher than the + * current target, add 50% of the difference to the target to reduce + * possible performance oscillations and offset possible performance + * loss related to moving the workload from one CPU to another within + * a package/module. + */ + avg_pstate = get_avg_pstate(cpu); + if (avg_pstate > target) + target += (avg_pstate - target) >> 1; + + return target; } static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) -- cgit v1.2.1 From 3954517e2f083c8aeb52bfd467b7b2c164232ffc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Oct 2016 23:07:38 +0200 Subject: cpufreq: intel_pstate: Fix struct pstate_adjust_policy kerneldoc It looks like the name of struct pstate_adjust_policy was updated without updating its kerneldoc comment accordingly, so fix that mistake. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6c8c897d0a2d..f535f8123258 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -225,7 +225,7 @@ struct cpudata { static struct cpudata **all_cpu_data; /** - * struct pid_adjust_policy - Stores static PID configuration data + * struct pstate_adjust_policy - Stores static PID configuration data * @sample_rate_ms: PID calculation sample rate in ms * @sample_rate_ns: Sample rate calculation in ns * @deadband: PID deadband -- cgit v1.2.1