summaryrefslogtreecommitdiffstats
path: root/drivers/thermal/intel/x86_pkg_temp_thermal.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 10:05:51 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 10:05:51 +0000
commit5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
treea94efe259b9009378be6d90eb30d2b019d95c194 /drivers/thermal/intel/x86_pkg_temp_thermal.c
parentInitial commit. (diff)
downloadlinux-upstream/5.10.209.tar.xz
linux-upstream/5.10.209.zip
Adding upstream version 5.10.209.upstream/5.10.209upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/thermal/intel/x86_pkg_temp_thermal.c')
-rw-r--r--drivers/thermal/intel/x86_pkg_temp_thermal.c539
1 files changed, 539 insertions, 0 deletions
diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
new file mode 100644
index 000000000..4f5d97329
--- /dev/null
+++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
@@ -0,0 +1,539 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86_pkg_temp_thermal driver
+ * Copyright (c) 2013, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/param.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/thermal.h>
+#include <linux/debugfs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/mce.h>
+
+/*
+ * Rate control delay: the idea is to introduce a debounce effect.
+ * The delay should be long enough to reduce the number of events when
+ * a threshold is set to a temperature that is constantly violated,
+ * but short enough that an action can still be taken in time.
+ * The action can be to remove the threshold or to change it to the
+ * next interesting setting. Based on experiments, roughly every
+ * 5 seconds under load gives a significant temperature change.
+ */
+#define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000
+/* Delay in milliseconds; converted to jiffies when the work is scheduled */
+static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
+module_param(notify_delay_ms, int, 0644);
+MODULE_PARM_DESC(notify_delay_ms,
+ "User space notification delay in milli seconds.");
+
+/*
+ * Number of trip points in a thermal zone. Currently it cannot be more
+ * than 2: the MSR only allows setting and getting notifications for
+ * 2 thresholds. This define enforces that limit in case cpuid returns
+ * a wrong value for the number of thresholds.
+ */
+#define MAX_NUMBER_OF_TRIPS 2
+
+/*
+ * Per-zone driver state. One instance is stored in zones[], indexed by
+ * the logical die id of the CPUs that belong to it.
+ */
+struct zone_device {
+ /* CPU used for the zone's MSR accesses and as target of the delayed work */
+ int cpu;
+ /* True from the threshold interrupt until the work function has run */
+ bool work_scheduled;
+ /* Value read by get_tj_max(), i.e. the MSR field scaled by 1000 */
+ u32 tj_max;
+ /* Interrupt MSR contents saved at zone creation, restored at removal */
+ u32 msr_pkg_therm_low;
+ u32 msr_pkg_therm_high;
+ /* Delayed work running pkg_temp_thermal_threshold_work_fn() */
+ struct delayed_work work;
+ /* Registered thermal zone; set to NULL before it is unregistered */
+ struct thermal_zone_device *tzone;
+ /* CPUs of this zone that went through the online callback */
+ struct cpumask cpumask;
+};
+
+/* Zone parameters handed to thermal_zone_device_register(); sets no_hwmon */
+static struct thermal_zone_params pkg_temp_tz_params = {
+ .no_hwmon = true,
+};
+
+/* Number of zone pointers allocated in init(); upper bound for zones[] */
+static int max_id __read_mostly;
+/* Array of zone pointers, indexed by logical die id */
+static struct zone_device **zones;
+/* Serializes interrupt notification, work and hotplug */
+static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
+/* Protects zone operation in the work function against hotplug removal */
+static DEFINE_MUTEX(thermal_zone_mutex);
+
+/* The dynamically assigned cpu hotplug state for module_exit() */
+static enum cpuhp_state pkg_thermal_hp_state __read_mostly;
+
+/*
+ * Debug counters exposed through debugfs: number of threshold
+ * interrupts handled and number of work function invocations.
+ */
+static struct dentry *debugfs;
+static unsigned int pkg_interrupt_cnt;
+static unsigned int pkg_work_cnt;
+
+/*
+ * Create the "pkg_temp_thermal" debugfs directory and expose the
+ * interrupt and work counters in it as read-only u32 files. The
+ * directory dentry is kept in 'debugfs' for removal at module exit.
+ */
+static void pkg_temp_debugfs_init(void)
+{
+	struct dentry *dir = debugfs_create_dir("pkg_temp_thermal", NULL);
+
+	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, dir,
+			   &pkg_interrupt_cnt);
+	debugfs_create_u32("pkg_thres_work", S_IRUGO, dir, &pkg_work_cnt);
+
+	debugfs = dir;
+}
+
+/*
+ * Look up the zone a CPU belongs to via its logical die id.
+ * Returns NULL when the id is outside of zones[] or no zone is stored.
+ *
+ * Locking rules for zones[]:
+ *
+ * - cpu hotplug callbacks: reads are serialized by the cpu hotplug lock,
+ *   writes must hold pkg_temp_lock
+ *
+ * - all other callsites: must hold pkg_temp_lock
+ */
+static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
+{
+	int die = topology_logical_die_id(cpu);
+
+	if (die < 0 || die >= max_id)
+		return NULL;
+
+	return zones[die];
+}
+
+/*
+ * tj-max is interesting because the thresholds are set relative to this
+ * temperature.
+ *
+ * Reads MSR_IA32_TEMPERATURE_TARGET on @cpu, extracts bits 23:16 of the
+ * low word and stores that value multiplied by 1000 in @tj_max.
+ *
+ * Returns the MSR read error if the read fails, -EINVAL if the extracted
+ * field is zero (@tj_max is still written in that case), 0 otherwise.
+ */
+static int get_tj_max(int cpu, u32 *tj_max)
+{
+	u32 lo, hi, field;
+	int ret = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &lo, &hi);
+
+	if (ret)
+		return ret;
+
+	field = (lo >> 16) & 0xff;
+	*tj_max = field * 1000;
+
+	if (!field)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * .get_temp callback: report the current zone temperature.
+ *
+ * Reads MSR_IA32_PACKAGE_THERM_STATUS on the zone's CPU. The readout in
+ * bits 22:16 is an offset below tj_max and is only used when bit 31 of
+ * the low word is set.
+ *
+ * Returns 0 and stores the temperature in @temp on success, the error
+ * from the cross-CPU MSR read if that fails, or -EINVAL when bit 31 is
+ * clear.
+ */
+static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
+{
+	struct zone_device *zonedev = tzd->devdata;
+	u32 eax, edx;
+	int ret;
+
+	/* Propagate read failures like the trip temperature callbacks do */
+	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_STATUS,
+			   &eax, &edx);
+	if (ret < 0)
+		return ret;
+
+	if (!(eax & 0x80000000))
+		return -EINVAL;
+
+	*temp = zonedev->tj_max - ((eax >> 16) & 0x7f) * 1000;
+	pr_debug("sys_get_curr_temp %d\n", *temp);
+	return 0;
+}
+
+/*
+ * .get_trip_temp callback: report the temperature of trip point @trip.
+ *
+ * The threshold field for the trip is read from
+ * MSR_IA32_PACKAGE_THERM_INTERRUPT on the zone's CPU. A non-zero field
+ * is an offset below tj_max; a zero field is reported as
+ * THERMAL_TEMP_INVALID.
+ *
+ * Returns 0 on success, -EINVAL for a trip index of
+ * MAX_NUMBER_OF_TRIPS or more, or the MSR read error.
+ */
+static int sys_get_trip_temp(struct thermal_zone_device *tzd,
+			     int trip, int *temp)
+{
+	struct zone_device *zonedev = tzd->devdata;
+	unsigned long offset;
+	u32 field_mask, field_shift, lo, hi;
+	int err;
+
+	if (trip >= MAX_NUMBER_OF_TRIPS)
+		return -EINVAL;
+
+	/* Trip 0 uses threshold 0, any other trip uses threshold 1 */
+	field_mask = trip ? THERM_MASK_THRESHOLD1 : THERM_MASK_THRESHOLD0;
+	field_shift = trip ? THERM_SHIFT_THRESHOLD1 : THERM_SHIFT_THRESHOLD0;
+
+	err = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			   &lo, &hi);
+	if (err < 0)
+		return err;
+
+	offset = (lo & field_mask) >> field_shift;
+	if (!offset)
+		*temp = THERMAL_TEMP_INVALID;
+	else
+		*temp = zonedev->tj_max - offset * 1000;
+	pr_debug("sys_get_trip_temp %d\n", *temp);
+
+	return 0;
+}
+
+/*
+ * .set_trip_temp callback: program the threshold for trip point @trip.
+ *
+ * @temp == 0 clears the threshold field and disables its interrupt.
+ * Any other value is stored as the offset (tj_max - @temp) / 1000 in the
+ * threshold field of MSR_IA32_PACKAGE_THERM_INTERRUPT and the matching
+ * interrupt enable bit is set.
+ *
+ * Returns 0 on success, the MSR access error, or -EINVAL when:
+ *  - @trip is negative or not below MAX_NUMBER_OF_TRIPS,
+ *  - @temp is negative or not below tj_max,
+ *  - the resulting offset does not fit into the threshold field.
+ */
+static int
+sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
+{
+	struct zone_device *zonedev = tzd->devdata;
+	u32 l, h, mask, shift, intr, offset;
+	int ret;
+
+	if (trip < 0 || trip >= MAX_NUMBER_OF_TRIPS)
+		return -EINVAL;
+
+	/*
+	 * Reject negative temperatures explicitly instead of relying on
+	 * the int -> u32 promotion in the comparison against tj_max.
+	 */
+	if (temp < 0 || (u32)temp >= zonedev->tj_max)
+		return -EINVAL;
+
+	if (trip) {
+		mask = THERM_MASK_THRESHOLD1;
+		shift = THERM_SHIFT_THRESHOLD1;
+		intr = THERM_INT_THRESHOLD1_ENABLE;
+	} else {
+		mask = THERM_MASK_THRESHOLD0;
+		shift = THERM_SHIFT_THRESHOLD0;
+		intr = THERM_INT_THRESHOLD0_ENABLE;
+	}
+
+	/*
+	 * The offset must fit into the threshold field, otherwise the
+	 * shifted value would spill into neighbouring bits of the MSR.
+	 * temp == 0 means "disable" and never writes the offset.
+	 */
+	offset = (zonedev->tj_max - temp) / 1000;
+	if (temp && offset > (mask >> shift))
+		return -EINVAL;
+
+	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			   &l, &h);
+	if (ret < 0)
+		return ret;
+
+	l &= ~mask;
+	/*
+	 * When user space sets a trip temperature of 0, it indicates that
+	 * it is no longer interested in receiving notifications.
+	 */
+	if (!temp) {
+		l &= ~intr;
+	} else {
+		l |= offset << shift;
+		l |= intr;
+	}
+
+	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
+			l, h);
+}
+
+/*
+ * .get_trip_type callback: every trip point of this zone is reported as
+ * THERMAL_TRIP_PASSIVE, regardless of @trip. Always returns 0.
+ */
+static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip,
+ enum thermal_trip_type *type)
+{
+ *type = THERMAL_TRIP_PASSIVE;
+ return 0;
+}
+
+/*
+ * Thermal zone callback registry, passed to
+ * thermal_zone_device_register() when a zone is created.
+ */
+static struct thermal_zone_device_ops tzone_ops = {
+ .get_temp = sys_get_curr_temp,
+ .get_trip_temp = sys_get_trip_temp,
+ .get_trip_type = sys_get_trip_type,
+ .set_trip_temp = sys_set_trip_temp,
+};
+
+/*
+ * Installed as platform_thermal_package_rate_control in module init.
+ * Unconditionally returns true.
+ * NOTE(review): presumably tells the caller of the hook that this driver
+ * does its own rate control (via the delayed work) - confirm against the
+ * code that invokes the hook.
+ */
+static bool pkg_thermal_rate_control(void)
+{
+ return true;
+}
+
+/*
+ * Enable the threshold interrupts on the local package/cpu.
+ *
+ * An interrupt enable bit is only set when the corresponding threshold
+ * field in MSR_IA32_PACKAGE_THERM_INTERRUPT holds a non-zero value;
+ * nothing is ever cleared here.
+ */
+static inline void enable_pkg_thres_interrupt(void)
+{
+	u32 lo, hi, enable = 0;
+	u8 offset0, offset1;
+
+	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, lo, hi);
+
+	offset0 = (lo & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
+	offset1 = (lo & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
+
+	if (offset0)
+		enable |= THERM_INT_THRESHOLD0_ENABLE;
+	if (offset1)
+		enable |= THERM_INT_THRESHOLD1_ENABLE;
+
+	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, lo | enable, hi);
+}
+
+/*
+ * Disable both threshold interrupts on the local package/cpu by clearing
+ * their enable bits in MSR_IA32_PACKAGE_THERM_INTERRUPT. The threshold
+ * values themselves are left untouched.
+ */
+static inline void disable_pkg_thres_interrupt(void)
+{
+	const u32 enable_bits = THERM_INT_THRESHOLD0_ENABLE |
+				THERM_INT_THRESHOLD1_ENABLE;
+	u32 lo, hi;
+
+	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, lo, hi);
+	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, lo & ~enable_bits, hi);
+}
+
+/*
+ * Delayed work handler, scheduled from pkg_thermal_notify().
+ *
+ * Looks up the zone of the CPU it runs on, clears work_scheduled, clears
+ * the threshold log bits in the local MSR_IA32_PACKAGE_THERM_STATUS and
+ * re-enables the threshold interrupts that the notify function disabled.
+ * If at least one log bit was set, the thermal core is asked to update
+ * the zone.
+ *
+ * Lock order: thermal_zone_mutex is taken first, then pkg_temp_lock. The
+ * spinlock is dropped before calling into the thermal core; the mutex is
+ * held until the end.
+ */
+static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
+{
+ struct thermal_zone_device *tzone = NULL;
+ int cpu = smp_processor_id();
+ struct zone_device *zonedev;
+ u64 msr_val, wr_val;
+
+ mutex_lock(&thermal_zone_mutex);
+ raw_spin_lock_irq(&pkg_temp_lock);
+ ++pkg_work_cnt;
+
+ zonedev = pkg_temp_thermal_get_dev(cpu);
+ if (!zonedev) {
+ /* No zone stored for this CPU's die: nothing to do */
+ raw_spin_unlock_irq(&pkg_temp_lock);
+ mutex_unlock(&thermal_zone_mutex);
+ return;
+ }
+ zonedev->work_scheduled = false;
+
+ /* Only write the status MSR back if a threshold log bit was set */
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
+ if (wr_val != msr_val) {
+ wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val);
+ tzone = zonedev->tzone;
+ }
+
+ enable_pkg_thres_interrupt();
+ raw_spin_unlock_irq(&pkg_temp_lock);
+
+ /*
+ * If tzone is not NULL, then thermal_zone_mutex will prevent the
+ * concurrent removal in the cpu offline callback.
+ */
+ if (tzone)
+ thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);
+
+ mutex_unlock(&thermal_zone_mutex);
+}
+
+/*
+ * Queue @work on @cpu after the notify_delay_ms module parameter,
+ * converted from milliseconds to jiffies, has elapsed.
+ */
+static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
+{
+	schedule_delayed_work_on(cpu, work, msecs_to_jiffies(notify_delay_ms));
+}
+
+/*
+ * Package threshold interrupt notification, installed as
+ * platform_thermal_package_notify in module init.
+ *
+ * Disables the threshold interrupts on the local package and, unless it
+ * is already pending, schedules the zone's delayed work, which
+ * re-enables them later. @msr_val is not used. Always returns 0.
+ */
+static int pkg_thermal_notify(u64 msr_val)
+{
+	struct zone_device *zd;
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&pkg_temp_lock, irqflags);
+	++pkg_interrupt_cnt;
+
+	disable_pkg_thres_interrupt();
+
+	zd = pkg_temp_thermal_get_dev(smp_processor_id());
+	/* Work is per package, so scheduling it once is enough. */
+	if (!zd || zd->work_scheduled)
+		goto out_unlock;
+
+	zd->work_scheduled = true;
+	pkg_thermal_schedule_work(zd->cpu, &zd->work);
+
+out_unlock:
+	raw_spin_unlock_irqrestore(&pkg_temp_lock, irqflags);
+	return 0;
+}
+
+/*
+ * Create and register the thermal zone for the die that @cpu belongs to.
+ *
+ * The number of trip points is taken from the low three bits of EBX of
+ * cpuid leaf 6 and limited to MAX_NUMBER_OF_TRIPS. On success the zone
+ * is registered and enabled, the current interrupt MSR contents are
+ * saved for restoration at removal, and the zone is published in
+ * zones[] under pkg_temp_lock.
+ *
+ * Returns 0 on success, -ENOMEM if the die id is outside of zones[] or
+ * the allocation fails, -ENODEV if cpuid reports no thresholds, or the
+ * error from get_tj_max() / thermal zone registration / enabling.
+ */
+static int pkg_temp_thermal_device_add(unsigned int cpu)
+{
+	int id = topology_logical_die_id(cpu);
+	u32 tj_max, eax, ebx, ecx, edx;
+	struct zone_device *zonedev;
+	int thres_count, err;
+
+	/*
+	 * id indexes zones[] below, so apply the same range check as
+	 * pkg_temp_thermal_get_dev(), including the lower bound.
+	 */
+	if (id < 0 || id >= max_id)
+		return -ENOMEM;
+
+	cpuid(6, &eax, &ebx, &ecx, &edx);
+	thres_count = ebx & 0x07;
+	if (!thres_count)
+		return -ENODEV;
+
+	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
+
+	err = get_tj_max(cpu, &tj_max);
+	if (err)
+		return err;
+
+	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
+	if (!zonedev)
+		return -ENOMEM;
+
+	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
+	zonedev->cpu = cpu;
+	zonedev->tj_max = tj_max;
+	/* The trip mask has one bit set per available trip point */
+	zonedev->tzone = thermal_zone_device_register("x86_pkg_temp",
+			thres_count,
+			(thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01,
+			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
+	if (IS_ERR(zonedev->tzone)) {
+		err = PTR_ERR(zonedev->tzone);
+		goto out_free;
+	}
+
+	err = thermal_zone_device_enable(zonedev->tzone);
+	if (err)
+		goto out_unregister;
+
+	/* Store MSR value for package thermal interrupt, to restore at exit */
+	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
+	      zonedev->msr_pkg_therm_high);
+
+	cpumask_set_cpu(cpu, &zonedev->cpumask);
+	raw_spin_lock_irq(&pkg_temp_lock);
+	zones[id] = zonedev;
+	raw_spin_unlock_irq(&pkg_temp_lock);
+	return 0;
+
+out_unregister:
+	thermal_zone_device_unregister(zonedev->tzone);
+out_free:
+	kfree(zonedev);
+	return err;
+}
+
+/*
+ * CPU hotplug offline callback (registered in module init).
+ *
+ * Removes @cpu from its zone's cpumask and picks another CPU from the
+ * mask as the new target for MSR accesses and work. If pending work was
+ * targeted at the outgoing CPU it is cancelled and, when other CPUs
+ * remain, rescheduled on the new target.
+ *
+ * When @cpu is the last CPU of the zone, the thermal zone is
+ * unregistered, the zones[] slot is cleared, the saved interrupt MSR
+ * value is written back and the zone is freed.
+ *
+ * Always returns 0.
+ */
+static int pkg_thermal_cpu_offline(unsigned int cpu)
+{
+ struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
+ bool lastcpu, was_target;
+ int target;
+
+ if (!zonedev)
+ return 0;
+
+ target = cpumask_any_but(&zonedev->cpumask, cpu);
+ cpumask_clear_cpu(cpu, &zonedev->cpumask);
+ /* A target beyond nr_cpu_ids is treated as "no other CPU in the mask" */
+ lastcpu = target >= nr_cpu_ids;
+ /*
+ * Remove the sysfs files, if this is the last cpu in the package
+ * before doing further cleanups.
+ */
+ if (lastcpu) {
+ struct thermal_zone_device *tzone = zonedev->tzone;
+
+ /*
+ * We must protect against a work function calling
+ * thermal_zone_update, after/while unregister. We null out
+ * the pointer under the zone mutex, so the worker function
+ * won't try to call.
+ */
+ mutex_lock(&thermal_zone_mutex);
+ zonedev->tzone = NULL;
+ mutex_unlock(&thermal_zone_mutex);
+
+ thermal_zone_device_unregister(tzone);
+ }
+
+ /* Protect against work and interrupts */
+ raw_spin_lock_irq(&pkg_temp_lock);
+
+ /*
+ * Check whether this cpu was the current target and store the new
+ * one. When we drop the lock, then the interrupt notify function
+ * will see the new target.
+ */
+ was_target = zonedev->cpu == cpu;
+ zonedev->cpu = target;
+
+ /*
+ * If this is the last CPU in the package remove the package
+ * reference from the array and restore the interrupt MSR. When we
+ * drop the lock neither the interrupt notify function nor the
+ * worker will see the package anymore.
+ */
+ if (lastcpu) {
+ zones[topology_logical_die_id(cpu)] = NULL;
+ /* After this point nothing touches the MSR anymore. */
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
+ }
+
+ /*
+ * Check whether there is work scheduled and whether the work is
+ * targeted at the outgoing CPU.
+ */
+ if (zonedev->work_scheduled && was_target) {
+ /*
+ * To cancel the work we need to drop the lock, otherwise
+ * we might deadlock if the work needs to be flushed.
+ */
+ raw_spin_unlock_irq(&pkg_temp_lock);
+ cancel_delayed_work_sync(&zonedev->work);
+ raw_spin_lock_irq(&pkg_temp_lock);
+ /*
+ * If this is not the last cpu in the package and the work
+ * did not run after we dropped the lock above, then we
+ * need to reschedule the work, otherwise the interrupt
+ * stays disabled forever.
+ */
+ if (!lastcpu && zonedev->work_scheduled)
+ pkg_thermal_schedule_work(target, &zonedev->work);
+ }
+
+ raw_spin_unlock_irq(&pkg_temp_lock);
+
+ /* Final cleanup if this is the last cpu */
+ if (lastcpu)
+ kfree(zonedev);
+ return 0;
+}
+
+/*
+ * CPU hotplug online callback (registered in module init).
+ *
+ * Adds @cpu to the cpumask of its zone if the zone already exists,
+ * otherwise creates the zone. Returns -ENODEV when the CPU lacks
+ * X86_FEATURE_DTHERM or X86_FEATURE_PTS, 0 when the CPU joined an
+ * existing zone, or the result of pkg_temp_thermal_device_add().
+ */
+static int pkg_thermal_cpu_online(unsigned int cpu)
+{
+	struct cpuinfo_x86 *info = &cpu_data(cpu);
+	struct zone_device *zd;
+
+	/* Paranoia check */
+	if (!(cpu_has(info, X86_FEATURE_DTHERM) &&
+	      cpu_has(info, X86_FEATURE_PTS)))
+		return -ENODEV;
+
+	zd = pkg_temp_thermal_get_dev(cpu);
+	if (!zd)
+		return pkg_temp_thermal_device_add(cpu);
+
+	/* The package already exists, just account for the new CPU */
+	cpumask_set_cpu(cpu, &zd->cpumask);
+	return 0;
+}
+
+/*
+ * CPU match table: Intel CPUs with X86_FEATURE_PTS. Checked with
+ * x86_match_cpu() in module init and exported as the module's x86cpu
+ * device table.
+ */
+static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
+ X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
+
+/*
+ * Module init.
+ *
+ * Bails out with -ENODEV on CPUs not matching pkg_temp_thermal_ids.
+ * Otherwise allocates zones[] with one slot per possible die
+ * (max packages * max dies per package), registers the cpu hotplug
+ * callbacks, installs the package thermal notification hooks and
+ * creates the debugfs counters.
+ *
+ * Returns 0 on success, -ENOMEM if zones[] cannot be allocated, or the
+ * error from cpuhp_setup_state() (zones[] is freed in that case).
+ */
+static int __init pkg_temp_thermal_init(void)
+{
+	int state;
+
+	if (!x86_match_cpu(pkg_temp_thermal_ids))
+		return -ENODEV;
+
+	max_id = topology_max_packages() * topology_max_die_per_package();
+	zones = kcalloc(max_id, sizeof(struct zone_device *), GFP_KERNEL);
+	if (!zones)
+		return -ENOMEM;
+
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
+				  pkg_thermal_cpu_online,
+				  pkg_thermal_cpu_offline);
+	if (state < 0) {
+		kfree(zones);
+		return state;
+	}
+
+	/* Remember the dynamically assigned state for module exit */
+	pkg_thermal_hp_state = state;
+
+	platform_thermal_package_notify = pkg_thermal_notify;
+	platform_thermal_package_rate_control = pkg_thermal_rate_control;
+
+	/* Don't care if it fails */
+	pkg_temp_debugfs_init();
+	return 0;
+}
+module_init(pkg_temp_thermal_init)
+
+/*
+ * Module exit: undo pkg_temp_thermal_init().
+ *
+ * The notification hooks are cleared first, then the cpu hotplug state
+ * is removed, the debugfs directory is deleted and zones[] is freed.
+ * NOTE(review): presumably cpuhp_remove_state() runs the offline
+ * callback on all online CPUs, which is what frees the individual zones
+ * before zones[] itself is released - confirm against the cpuhp API.
+ */
+static void __exit pkg_temp_thermal_exit(void)
+{
+ platform_thermal_package_notify = NULL;
+ platform_thermal_package_rate_control = NULL;
+
+ cpuhp_remove_state(pkg_thermal_hp_state);
+ debugfs_remove_recursive(debugfs);
+ kfree(zones);
+}
+module_exit(pkg_temp_thermal_exit)
+
+MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_LICENSE("GPL v2");