From 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 12:05:51 +0200 Subject: Adding upstream version 5.10.209. Signed-off-by: Daniel Baumann --- .../cpupower/utils/idle_monitor/amd_fam14h_idle.c | 334 +++++++++++++++ .../cpupower/utils/idle_monitor/cpuidle_sysfs.c | 212 ++++++++++ .../cpupower/utils/idle_monitor/cpupower-monitor.c | 467 +++++++++++++++++++++ .../cpupower/utils/idle_monitor/cpupower-monitor.h | 95 +++++ .../cpupower/utils/idle_monitor/hsw_ext_idle.c | 194 +++++++++ .../cpupower/utils/idle_monitor/idle_monitors.def | 8 + .../cpupower/utils/idle_monitor/idle_monitors.h | 16 + .../cpupower/utils/idle_monitor/mperf_monitor.c | 379 +++++++++++++++++ tools/power/cpupower/utils/idle_monitor/nhm_idle.c | 215 ++++++++++ tools/power/cpupower/utils/idle_monitor/snb_idle.c | 199 +++++++++ 10 files changed, 2119 insertions(+) create mode 100644 tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c create mode 100644 tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c create mode 100644 tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c create mode 100644 tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h create mode 100644 tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c create mode 100644 tools/power/cpupower/utils/idle_monitor/idle_monitors.def create mode 100644 tools/power/cpupower/utils/idle_monitor/idle_monitors.h create mode 100644 tools/power/cpupower/utils/idle_monitor/mperf_monitor.c create mode 100644 tools/power/cpupower/utils/idle_monitor/nhm_idle.c create mode 100644 tools/power/cpupower/utils/idle_monitor/snb_idle.c (limited to 'tools/power/cpupower/utils/idle_monitor') diff --git a/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c b/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c new file mode 100644 index 000000000..5edd35bd9 --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + * + * PCI initialization based on example code from: + * Andreas Herrmann + */ + +#if defined(__i386__) || defined(__x86_64__) + +#include +#include +#include +#include +#include + +#include + +#include "idle_monitor/cpupower-monitor.h" +#include "helpers/helpers.h" + +#define PCI_NON_PC0_OFFSET 0xb0 +#define PCI_PC1_OFFSET 0xb4 +#define PCI_PC6_OFFSET 0xb8 + +#define PCI_MONITOR_ENABLE_REG 0xe0 + +#define PCI_NON_PC0_ENABLE_BIT 0 +#define PCI_PC1_ENABLE_BIT 1 +#define PCI_PC6_ENABLE_BIT 2 + +#define PCI_NBP1_STAT_OFFSET 0x98 +#define PCI_NBP1_ACTIVE_BIT 2 +#define PCI_NBP1_ENTERED_BIT 1 + +#define PCI_NBP1_CAP_OFFSET 0x90 +#define PCI_NBP1_CAPABLE_BIT 31 + +#define OVERFLOW_MS 343597 /* 32 bit register filled at 12500 HZ + (1 tick per 80ns) */ + +enum amd_fam14h_states {NON_PC0 = 0, PC1, PC6, NBP1, + AMD_FAM14H_STATE_NUM}; + +static int fam14h_get_count_percent(unsigned int self_id, double *percent, + unsigned int cpu); +static int fam14h_nbp1_count(unsigned int id, unsigned long long *count, + unsigned int cpu); + +static cstate_t amd_fam14h_cstates[AMD_FAM14H_STATE_NUM] = { + { + .name = "!PC0", + .desc = N_("Package in sleep state (PC1 or deeper)"), + .id = NON_PC0, + .range = RANGE_PACKAGE, + .get_count_percent = fam14h_get_count_percent, + }, + { + .name = "PC1", + .desc = N_("Processor Package C1"), + .id = PC1, + .range = RANGE_PACKAGE, + .get_count_percent = fam14h_get_count_percent, + }, + { + .name = "PC6", + .desc = N_("Processor Package C6"), + .id = PC6, + .range = RANGE_PACKAGE, + .get_count_percent = fam14h_get_count_percent, + }, + { + .name = "NBP1", + .desc = N_("North Bridge P1 boolean counter (returns 0 or 1)"), + .id = NBP1, + .range = RANGE_PACKAGE, + .get_count = fam14h_nbp1_count, + }, +}; + +static struct pci_access *pci_acc; +static struct pci_dev *amd_fam14h_pci_dev; +static int nbp1_entered; + +static struct timespec start_time; +static unsigned long long timediff; + +#ifdef DEBUG +struct timespec dbg_time; +long dbg_timediff; +#endif + +static unsigned long long *previous_count[AMD_FAM14H_STATE_NUM]; +static unsigned long long *current_count[AMD_FAM14H_STATE_NUM]; + +static int amd_fam14h_get_pci_info(struct cstate *state, + unsigned int *pci_offset, + unsigned int *enable_bit, + unsigned int cpu) +{ + switch (state->id) { + case NON_PC0: + *enable_bit = PCI_NON_PC0_ENABLE_BIT; + *pci_offset = PCI_NON_PC0_OFFSET; + break; + case PC1: + *enable_bit = PCI_PC1_ENABLE_BIT; + *pci_offset = PCI_PC1_OFFSET; + break; + case PC6: + *enable_bit = PCI_PC6_ENABLE_BIT; + *pci_offset = PCI_PC6_OFFSET; + break; + case NBP1: + *enable_bit = PCI_NBP1_ENTERED_BIT; + *pci_offset = PCI_NBP1_STAT_OFFSET; + break; + default: + return -1; + } + return 0; +} + +static int amd_fam14h_init(cstate_t *state, unsigned int cpu) +{ + int enable_bit, pci_offset, ret; + uint32_t val; + + ret = amd_fam14h_get_pci_info(state, &pci_offset, &enable_bit, cpu); + if (ret) + return ret; + + /* NBP1 needs extra treating -> write 1 to D18F6x98 bit 1 for init */ + if (state->id == NBP1) { + val = pci_read_long(amd_fam14h_pci_dev, pci_offset); + val |= 1 << enable_bit; + val = pci_write_long(amd_fam14h_pci_dev, pci_offset, val); + return ret; + } + + /* Enable monitor */ + val = pci_read_long(amd_fam14h_pci_dev, PCI_MONITOR_ENABLE_REG); + dprint("Init %s: read at offset: 0x%x val: %u\n", state->name, + PCI_MONITOR_ENABLE_REG, (unsigned int) val); + val |= 1 << enable_bit; + pci_write_long(amd_fam14h_pci_dev, PCI_MONITOR_ENABLE_REG, val); + + dprint("Init %s: offset: 0x%x enable_bit: %d - val: %u (%u)\n", + state->name, PCI_MONITOR_ENABLE_REG, enable_bit, + (unsigned int) val, cpu); + + /* Set counter to zero */ + pci_write_long(amd_fam14h_pci_dev, pci_offset, 0); + previous_count[state->id][cpu] = 0; + + return 0; +} + +static int amd_fam14h_disable(cstate_t *state, unsigned int cpu) +{ + int enable_bit, pci_offset, ret; + uint32_t val; + + ret = amd_fam14h_get_pci_info(state, &pci_offset, &enable_bit, cpu); + if (ret) + return ret; + + val = pci_read_long(amd_fam14h_pci_dev, pci_offset); + dprint("%s: offset: 0x%x %u\n", state->name, pci_offset, val); + if (state->id == NBP1) { + /* was the bit whether NBP1 got entered set? */ + nbp1_entered = (val & (1 << PCI_NBP1_ACTIVE_BIT)) | + (val & (1 << PCI_NBP1_ENTERED_BIT)); + + dprint("NBP1 was %sentered - 0x%x - enable_bit: " + "%d - pci_offset: 0x%x\n", + nbp1_entered ? "" : "not ", + val, enable_bit, pci_offset); + return ret; + } + current_count[state->id][cpu] = val; + + dprint("%s: Current - %llu (%u)\n", state->name, + current_count[state->id][cpu], cpu); + dprint("%s: Previous - %llu (%u)\n", state->name, + previous_count[state->id][cpu], cpu); + + val = pci_read_long(amd_fam14h_pci_dev, PCI_MONITOR_ENABLE_REG); + val &= ~(1 << enable_bit); + pci_write_long(amd_fam14h_pci_dev, PCI_MONITOR_ENABLE_REG, val); + + return 0; +} + +static int fam14h_nbp1_count(unsigned int id, unsigned long long *count, + unsigned int cpu) +{ + if (id == NBP1) { + if (nbp1_entered) + *count = 1; + else + *count = 0; + return 0; + } + return -1; +} +static int fam14h_get_count_percent(unsigned int id, double *percent, + unsigned int cpu) +{ + unsigned long diff; + + if (id >= AMD_FAM14H_STATE_NUM) + return -1; + /* residency count in 80ns -> divide through 12.5 to get us residency */ + diff = current_count[id][cpu] - previous_count[id][cpu]; + + if (timediff == 0) + *percent = 0.0; + else + *percent = 100.0 * diff / timediff / 12.5; + + dprint("Timediff: %llu - res~: %lu us - percent: %.2f %%\n", + timediff, diff * 10 / 125, *percent); + + return 0; +} + +static int amd_fam14h_start(void) +{ + int num, cpu; + clock_gettime(CLOCK_REALTIME, &start_time); + for (num = 0; num < AMD_FAM14H_STATE_NUM; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) + amd_fam14h_init(&amd_fam14h_cstates[num], cpu); + } +#ifdef DEBUG + clock_gettime(CLOCK_REALTIME, &dbg_time); + dbg_timediff = timespec_diff_us(start_time, dbg_time); + dprint("Enabling counters took: %lu us\n", + dbg_timediff); +#endif + return 0; +} + +static int amd_fam14h_stop(void) +{ + int num, cpu; + struct timespec end_time; + + clock_gettime(CLOCK_REALTIME, &end_time); + + for (num = 0; num < AMD_FAM14H_STATE_NUM; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) + amd_fam14h_disable(&amd_fam14h_cstates[num], cpu); + } +#ifdef DEBUG + clock_gettime(CLOCK_REALTIME, &dbg_time); + dbg_timediff = timespec_diff_us(end_time, dbg_time); + dprint("Disabling counters took: %lu ns\n", dbg_timediff); +#endif + timediff = timespec_diff_us(start_time, end_time); + if (timediff / 1000 > OVERFLOW_MS) + print_overflow_err((unsigned int)timediff / 1000000, + OVERFLOW_MS / 1000); + + return 0; +} + +static int is_nbp1_capable(void) +{ + uint32_t val; + val = pci_read_long(amd_fam14h_pci_dev, PCI_NBP1_CAP_OFFSET); + return val & (1 << 31); +} + +struct cpuidle_monitor *amd_fam14h_register(void) +{ + int num; + + if (cpupower_cpu_info.vendor != X86_VENDOR_AMD) + return NULL; + + if (cpupower_cpu_info.family == 0x14) + strncpy(amd_fam14h_monitor.name, "Fam_14h", + MONITOR_NAME_LEN - 1); + else if (cpupower_cpu_info.family == 0x12) + strncpy(amd_fam14h_monitor.name, "Fam_12h", + MONITOR_NAME_LEN - 1); + else + return NULL; + + /* We do not alloc for nbp1 machine wide counter */ + for (num = 0; num < AMD_FAM14H_STATE_NUM - 1; num++) { + previous_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + current_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + } + + /* We need PCI device: Slot 18, Func 6, compare with BKDG + for fam 12h/14h */ + amd_fam14h_pci_dev = pci_slot_func_init(&pci_acc, 0x18, 6); + if (amd_fam14h_pci_dev == NULL || pci_acc == NULL) + return NULL; + + if (!is_nbp1_capable()) + amd_fam14h_monitor.hw_states_num = AMD_FAM14H_STATE_NUM - 1; + + amd_fam14h_monitor.name_len = strlen(amd_fam14h_monitor.name); + return &amd_fam14h_monitor; +} + +static void amd_fam14h_unregister(void) +{ + int num; + for (num = 0; num < AMD_FAM14H_STATE_NUM - 1; num++) { + free(previous_count[num]); + free(current_count[num]); + } + pci_cleanup(pci_acc); +} + +struct cpuidle_monitor amd_fam14h_monitor = { + .name = "", + .hw_states = amd_fam14h_cstates, + .hw_states_num = AMD_FAM14H_STATE_NUM, + .start = amd_fam14h_start, + .stop = amd_fam14h_stop, + .do_register = amd_fam14h_register, + .unregister = amd_fam14h_unregister, + .flags.needs_root = 1, + .overflow_s = OVERFLOW_MS / 1000, +}; +#endif /* #if defined(__i386__) || defined(__x86_64__) */ diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c new file mode 100644 index 000000000..8b42c2f0a --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc + */ + +#include +#include +#include +#include +#include +#include + +#include "helpers/helpers.h" +#include "idle_monitor/cpupower-monitor.h" + +#define CPUIDLE_STATES_MAX 10 +static cstate_t cpuidle_cstates[CPUIDLE_STATES_MAX]; +struct cpuidle_monitor cpuidle_sysfs_monitor; + +static unsigned long long **previous_count; +static unsigned long long **current_count; +static struct timespec start_time; +static unsigned long long timediff; + +static int cpuidle_get_count_percent(unsigned int id, double *percent, + unsigned int cpu) +{ + unsigned long long statediff = current_count[cpu][id] + - previous_count[cpu][id]; + dprint("%s: - diff: %llu - percent: %f (%u)\n", + cpuidle_cstates[id].name, timediff, *percent, cpu); + + if (timediff == 0) + *percent = 0.0; + else + *percent = ((100.0 * statediff) / timediff); + + dprint("%s: - timediff: %llu - statediff: %llu - percent: %f (%u)\n", + cpuidle_cstates[id].name, timediff, statediff, *percent, cpu); + + return 0; +} + +static int cpuidle_start(void) +{ + int cpu, state; + clock_gettime(CLOCK_REALTIME, &start_time); + for (cpu = 0; cpu < cpu_count; cpu++) { + for (state = 0; state < cpuidle_sysfs_monitor.hw_states_num; + state++) { + previous_count[cpu][state] = + cpuidle_state_time(cpu, state); + dprint("CPU %d - State: %d - Val: %llu\n", + cpu, state, previous_count[cpu][state]); + } + } + return 0; +} + +static int cpuidle_stop(void) +{ + int cpu, state; + struct timespec end_time; + clock_gettime(CLOCK_REALTIME, &end_time); + timediff = timespec_diff_us(start_time, end_time); + + for (cpu = 0; cpu < cpu_count; cpu++) { + for (state = 0; state < cpuidle_sysfs_monitor.hw_states_num; + state++) { + current_count[cpu][state] = + cpuidle_state_time(cpu, state); + dprint("CPU %d - State: %d - Val: %llu\n", + cpu, state, previous_count[cpu][state]); + } + } + return 0; +} + +void fix_up_intel_idle_driver_name(char *tmp, int num) +{ + /* fix up cpuidle name for intel idle driver */ + if (!strncmp(tmp, "NHM-", 4)) { + switch (num) { + case 1: + strcpy(tmp, "C1"); + break; + case 2: + strcpy(tmp, "C3"); + break; + case 3: + strcpy(tmp, "C6"); + break; + } + } else if (!strncmp(tmp, "SNB-", 4)) { + switch (num) { + case 1: + strcpy(tmp, "C1"); + break; + case 2: + strcpy(tmp, "C3"); + break; + case 3: + strcpy(tmp, "C6"); + break; + case 4: + strcpy(tmp, "C7"); + break; + } + } else if (!strncmp(tmp, "ATM-", 4)) { + switch (num) { + case 1: + strcpy(tmp, "C1"); + break; + case 2: + strcpy(tmp, "C2"); + break; + case 3: + strcpy(tmp, "C4"); + break; + case 4: + strcpy(tmp, "C6"); + break; + } + } +} + +#ifdef __powerpc__ +void map_power_idle_state_name(char *tmp) +{ + if (!strncmp(tmp, "stop0_lite", CSTATE_NAME_LEN)) + strcpy(tmp, "stop0L"); + else if (!strncmp(tmp, "stop1_lite", CSTATE_NAME_LEN)) + strcpy(tmp, "stop1L"); + else if (!strncmp(tmp, "stop2_lite", CSTATE_NAME_LEN)) + strcpy(tmp, "stop2L"); +} +#else +void map_power_idle_state_name(char *tmp) { } +#endif + +static struct cpuidle_monitor *cpuidle_register(void) +{ + int num; + char *tmp; + int this_cpu; + + this_cpu = sched_getcpu(); + + /* Assume idle state count is the same for all CPUs */ + cpuidle_sysfs_monitor.hw_states_num = cpuidle_state_count(this_cpu); + + if (cpuidle_sysfs_monitor.hw_states_num <= 0) + return NULL; + + for (num = 0; num < cpuidle_sysfs_monitor.hw_states_num; num++) { + tmp = cpuidle_state_name(this_cpu, num); + if (tmp == NULL) + continue; + + map_power_idle_state_name(tmp); + fix_up_intel_idle_driver_name(tmp, num); + strncpy(cpuidle_cstates[num].name, tmp, CSTATE_NAME_LEN - 1); + free(tmp); + + tmp = cpuidle_state_desc(this_cpu, num); + if (tmp == NULL) + continue; + strncpy(cpuidle_cstates[num].desc, tmp, CSTATE_DESC_LEN - 1); + free(tmp); + + cpuidle_cstates[num].range = RANGE_THREAD; + cpuidle_cstates[num].id = num; + cpuidle_cstates[num].get_count_percent = + cpuidle_get_count_percent; + } + + /* Free this at program termination */ + previous_count = malloc(sizeof(long long *) * cpu_count); + current_count = malloc(sizeof(long long *) * cpu_count); + for (num = 0; num < cpu_count; num++) { + previous_count[num] = malloc(sizeof(long long) * + cpuidle_sysfs_monitor.hw_states_num); + current_count[num] = malloc(sizeof(long long) * + cpuidle_sysfs_monitor.hw_states_num); + } + + cpuidle_sysfs_monitor.name_len = strlen(cpuidle_sysfs_monitor.name); + return &cpuidle_sysfs_monitor; +} + +void cpuidle_unregister(void) +{ + int num; + + for (num = 0; num < cpu_count; num++) { + free(previous_count[num]); + free(current_count[num]); + } + free(previous_count); + free(current_count); +} + +struct cpuidle_monitor cpuidle_sysfs_monitor = { + .name = "Idle_Stats", + .hw_states = cpuidle_cstates, + .start = cpuidle_start, + .stop = cpuidle_stop, + .do_register = cpuidle_register, + .unregister = cpuidle_unregister, + .flags.needs_root = 0, + .overflow_s = UINT_MAX, +}; diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c new file mode 100644 index 000000000..7c77045fe --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + * + * Output format inspired by Len Brown's turbostat tool. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "idle_monitor/cpupower-monitor.h" +#include "idle_monitor/idle_monitors.h" +#include "helpers/helpers.h" + +/* Define pointers to all monitors. */ +#define DEF(x) & x ## _monitor , +struct cpuidle_monitor *all_monitors[] = { +#include "idle_monitors.def" +0 +}; + +int cpu_count; + +static struct cpuidle_monitor *monitors[MONITORS_MAX]; +static unsigned int avail_monitors; + +static char *progname; + +enum operation_mode_e { list = 1, show, show_all }; +static int mode; +static int interval = 1; +static char *show_monitors_param; +static struct cpupower_topology cpu_top; +static unsigned int wake_cpus; + +/* ToDo: Document this in the manpage */ +static char range_abbr[RANGE_MAX] = { 'T', 'C', 'P', 'M', }; + +static void print_wrong_arg_exit(void) +{ + printf(_("invalid or unknown argument\n")); + exit(EXIT_FAILURE); +} + +long long timespec_diff_us(struct timespec start, struct timespec end) +{ + struct timespec temp; + if ((end.tv_nsec - start.tv_nsec) < 0) { + temp.tv_sec = end.tv_sec - start.tv_sec - 1; + temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec; + } else { + temp.tv_sec = end.tv_sec - start.tv_sec; + temp.tv_nsec = end.tv_nsec - start.tv_nsec; + } + return (temp.tv_sec * 1000000) + (temp.tv_nsec / 1000); +} + +void print_n_spaces(int n) +{ + int x; + for (x = 0; x < n; x++) + printf(" "); +} + +/*s is filled with left and right spaces + *to make its length atleast n+1 + */ +int fill_string_with_spaces(char *s, int n) +{ + char *temp; + int len = strlen(s); + + if (len >= n) + return -1; + + temp = malloc(sizeof(char) * (n+1)); + for (; len < n; len++) + s[len] = ' '; + s[len] = '\0'; + snprintf(temp, n+1, " %s", s); + strcpy(s, temp); + free(temp); + return 0; +} + +#define MAX_COL_WIDTH 6 +void print_header(int topology_depth) +{ + int unsigned mon; + int state, need_len; + cstate_t s; + char buf[128] = ""; + + fill_string_with_spaces(buf, topology_depth * 5 - 1); + printf("%s|", buf); + + for (mon = 0; mon < avail_monitors; mon++) { + need_len = monitors[mon]->hw_states_num * (MAX_COL_WIDTH + 1) + - 1; + if (mon != 0) + printf("||"); + sprintf(buf, "%s", monitors[mon]->name); + fill_string_with_spaces(buf, need_len); + printf("%s", buf); + } + printf("\n"); + + if (topology_depth > 2) + printf(" PKG|"); + if (topology_depth > 1) + printf("CORE|"); + if (topology_depth > 0) + printf(" CPU|"); + + for (mon = 0; mon < avail_monitors; mon++) { + if (mon != 0) + printf("||"); + for (state = 0; state < monitors[mon]->hw_states_num; state++) { + if (state != 0) + printf("|"); + s = monitors[mon]->hw_states[state]; + sprintf(buf, "%s", s.name); + fill_string_with_spaces(buf, MAX_COL_WIDTH); + printf("%s", buf); + } + printf(" "); + } + printf("\n"); +} + + +void print_results(int topology_depth, int cpu) +{ + unsigned int mon; + int state, ret; + double percent; + unsigned long long result; + cstate_t s; + + /* Be careful CPUs may got resorted for pkg value do not just use cpu */ + if (!bitmask_isbitset(cpus_chosen, cpu_top.core_info[cpu].cpu)) + return; + if (!cpu_top.core_info[cpu].is_online && + cpu_top.core_info[cpu].pkg == -1) + return; + + if (topology_depth > 2) + printf("%4d|", cpu_top.core_info[cpu].pkg); + if (topology_depth > 1) + printf("%4d|", cpu_top.core_info[cpu].core); + if (topology_depth > 0) + printf("%4d|", cpu_top.core_info[cpu].cpu); + + for (mon = 0; mon < avail_monitors; mon++) { + if (mon != 0) + printf("||"); + + for (state = 0; state < monitors[mon]->hw_states_num; state++) { + if (state != 0) + printf("|"); + + s = monitors[mon]->hw_states[state]; + + if (s.get_count_percent) { + ret = s.get_count_percent(s.id, &percent, + cpu_top.core_info[cpu].cpu); + if (ret) + printf("******"); + else if (percent >= 100.0) + printf("%6.1f", percent); + else + printf("%6.2f", percent); + } else if (s.get_count) { + ret = s.get_count(s.id, &result, + cpu_top.core_info[cpu].cpu); + if (ret) + printf("******"); + else + printf("%6llu", result); + } else { + printf(_("Monitor %s, Counter %s has no count " + "function. Implementation error\n"), + monitors[mon]->name, s.name); + exit(EXIT_FAILURE); + } + } + } + /* + * The monitor could still provide useful data, for example + * AMD HW counters partly sit in PCI config space. + * It's up to the monitor plug-in to check .is_online, this one + * is just for additional info. + */ + if (!cpu_top.core_info[cpu].is_online && + cpu_top.core_info[cpu].pkg != -1) { + printf(_(" *is offline\n")); + return; + } else + printf("\n"); +} + + +/* param: string passed by -m param (The list of monitors to show) + * + * Monitors must have been registered already, matching monitors + * are picked out and available monitors array is overridden + * with matching ones + * + * Monitors get sorted in the same order the user passes them +*/ + +static void parse_monitor_param(char *param) +{ + unsigned int num; + int mon, hits = 0; + char *tmp = param, *token; + struct cpuidle_monitor *tmp_mons[MONITORS_MAX]; + + + for (mon = 0; mon < MONITORS_MAX; mon++, tmp = NULL) { + token = strtok(tmp, ","); + if (token == NULL) + break; + if (strlen(token) >= MONITOR_NAME_LEN) { + printf(_("%s: max monitor name length" + " (%d) exceeded\n"), token, MONITOR_NAME_LEN); + continue; + } + + for (num = 0; num < avail_monitors; num++) { + if (!strcmp(monitors[num]->name, token)) { + dprint("Found requested monitor: %s\n", token); + tmp_mons[hits] = monitors[num]; + hits++; + } + } + } + if (hits == 0) { + printf(_("No matching monitor found in %s, " + "try -l option\n"), param); + exit(EXIT_FAILURE); + } + /* Override detected/registerd monitors array with requested one */ + memcpy(monitors, tmp_mons, + sizeof(struct cpuidle_monitor *) * MONITORS_MAX); + avail_monitors = hits; +} + +void list_monitors(void) +{ + unsigned int mon; + int state; + cstate_t s; + + for (mon = 0; mon < avail_monitors; mon++) { + printf(_("Monitor \"%s\" (%d states) - Might overflow after %u " + "s\n"), + monitors[mon]->name, monitors[mon]->hw_states_num, + monitors[mon]->overflow_s); + + for (state = 0; state < monitors[mon]->hw_states_num; state++) { + s = monitors[mon]->hw_states[state]; + /* + * ToDo show more state capabilities: + * percent, time (granlarity) + */ + printf("%s\t[%c] -> %s\n", s.name, range_abbr[s.range], + gettext(s.desc)); + } + } +} + +int fork_it(char **argv) +{ + int status; + unsigned int num; + unsigned long long timediff; + pid_t child_pid; + struct timespec start, end; + + child_pid = fork(); + clock_gettime(CLOCK_REALTIME, &start); + + for (num = 0; num < avail_monitors; num++) + monitors[num]->start(); + + if (!child_pid) { + /* child */ + execvp(argv[0], argv); + } else { + /* parent */ + if (child_pid == -1) { + perror("fork"); + exit(1); + } + + signal(SIGINT, SIG_IGN); + signal(SIGQUIT, SIG_IGN); + if (waitpid(child_pid, &status, 0) == -1) { + perror("wait"); + exit(1); + } + } + clock_gettime(CLOCK_REALTIME, &end); + for (num = 0; num < avail_monitors; num++) + monitors[num]->stop(); + + timediff = timespec_diff_us(start, end); + if (WIFEXITED(status)) + printf(_("%s took %.5f seconds and exited with status %d\n"), + argv[0], timediff / (1000.0 * 1000), + WEXITSTATUS(status)); + return 0; +} + +int do_interval_measure(int i) +{ + unsigned int num; + int cpu; + + if (wake_cpus) + for (cpu = 0; cpu < cpu_count; cpu++) + bind_cpu(cpu); + + for (num = 0; num < avail_monitors; num++) { + dprint("HW C-state residency monitor: %s - States: %d\n", + monitors[num]->name, monitors[num]->hw_states_num); + monitors[num]->start(); + } + + sleep(i); + + if (wake_cpus) + for (cpu = 0; cpu < cpu_count; cpu++) + bind_cpu(cpu); + + for (num = 0; num < avail_monitors; num++) + monitors[num]->stop(); + + + return 0; +} + +static void cmdline(int argc, char *argv[]) +{ + int opt; + progname = basename(argv[0]); + + while ((opt = getopt(argc, argv, "+lci:m:")) != -1) { + switch (opt) { + case 'l': + if (mode) + print_wrong_arg_exit(); + mode = list; + break; + case 'i': + /* only allow -i with -m or no option */ + if (mode && mode != show) + print_wrong_arg_exit(); + interval = atoi(optarg); + break; + case 'm': + if (mode) + print_wrong_arg_exit(); + mode = show; + show_monitors_param = optarg; + break; + case 'c': + wake_cpus = 1; + break; + default: + print_wrong_arg_exit(); + } + } + if (!mode) + mode = show_all; +} + +int cmd_monitor(int argc, char **argv) +{ + unsigned int num; + struct cpuidle_monitor *test_mon; + int cpu; + + cmdline(argc, argv); + cpu_count = get_cpu_topology(&cpu_top); + if (cpu_count < 0) { + printf(_("Cannot read number of available processors\n")); + return EXIT_FAILURE; + } + + if (!cpu_top.core_info[0].is_online) + printf("WARNING: at least one cpu is offline\n"); + + /* Default is: monitor all CPUs */ + if (bitmask_isallclear(cpus_chosen)) + bitmask_setall(cpus_chosen); + + dprint("System has up to %d CPU cores\n", cpu_count); + + for (num = 0; all_monitors[num]; num++) { + dprint("Try to register: %s\n", all_monitors[num]->name); + test_mon = all_monitors[num]->do_register(); + if (test_mon) { + if (test_mon->flags.needs_root && !run_as_root) { + fprintf(stderr, _("Available monitor %s needs " + "root access\n"), test_mon->name); + continue; + } + monitors[avail_monitors] = test_mon; + dprint("%s registered\n", all_monitors[num]->name); + avail_monitors++; + } + } + + if (avail_monitors == 0) { + printf(_("No HW Cstate monitors found\n")); + return 1; + } + + if (mode == list) { + list_monitors(); + exit(EXIT_SUCCESS); + } + + if (mode == show) + parse_monitor_param(show_monitors_param); + + dprint("Packages: %d - Cores: %d - CPUs: %d\n", + cpu_top.pkgs, cpu_top.cores, cpu_count); + + /* + * if any params left, it must be a command to fork + */ + if (argc - optind) + fork_it(argv + optind); + else + do_interval_measure(interval); + + /* ToDo: Topology parsing needs fixing first to do + this more generically */ + if (cpu_top.pkgs > 1) + print_header(3); + else + print_header(1); + + for (cpu = 0; cpu < cpu_count; cpu++) { + if (cpu_top.pkgs > 1) + print_results(3, cpu); + else + print_results(1, cpu); + } + + for (num = 0; num < avail_monitors; num++) + monitors[num]->unregister(); + + cpu_topology_release(cpu_top); + return 0; +} diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h new file mode 100644 index 000000000..c559d3115 --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + */ + +#ifndef __CPUIDLE_INFO_HW__ +#define __CPUIDLE_INFO_HW__ + +#include +#include + +#include "idle_monitor/idle_monitors.h" + +#define MONITORS_MAX 20 +#define MONITOR_NAME_LEN 20 + +/* CSTATE_NAME_LEN is limited by header field width defined + * in cpupower-monitor.c. Header field width is defined to be + * sum of percent width and two spaces for padding. + */ +#ifdef __powerpc__ +#define CSTATE_NAME_LEN 7 +#else +#define CSTATE_NAME_LEN 5 +#endif +#define CSTATE_DESC_LEN 60 + +extern int cpu_count; + +/* Hard to define the right names ...: */ +enum power_range_e { + RANGE_THREAD, /* Lowest in topology hierarcy, AMD: core, Intel: thread + kernel sysfs: cpu */ + RANGE_CORE, /* AMD: unit, Intel: core, kernel_sysfs: core_id */ + RANGE_PACKAGE, /* Package, processor socket */ + RANGE_MACHINE, /* Machine, platform wide */ + RANGE_MAX }; + +typedef struct cstate { + int id; + enum power_range_e range; + char name[CSTATE_NAME_LEN]; + char desc[CSTATE_DESC_LEN]; + + /* either provide a percentage or a general count */ + int (*get_count_percent)(unsigned int self_id, double *percent, + unsigned int cpu); + int (*get_count)(unsigned int self_id, unsigned long long *count, + unsigned int cpu); +} cstate_t; + +struct cpuidle_monitor { + /* Name must not contain whitespaces */ + char name[MONITOR_NAME_LEN]; + int name_len; + int hw_states_num; + cstate_t *hw_states; + int (*start) (void); + int (*stop) (void); + struct cpuidle_monitor* (*do_register) (void); + void (*unregister)(void); + unsigned int overflow_s; + struct { + unsigned int needs_root:1; + unsigned int per_cpu_schedule:1; + } flags; +}; + +extern long long timespec_diff_us(struct timespec start, struct timespec end); + +#define print_overflow_err(mes, ov) \ +{ \ + fprintf(stderr, gettext("Measure took %u seconds, but registers could " \ + "overflow at %u seconds, results " \ + "could be inaccurate\n"), mes, ov); \ +} + + +/* Taken over from x86info project sources -> return 0 on success */ +#include +#include +#include +static inline int bind_cpu(int cpu) +{ + cpu_set_t set; + + if (sched_getaffinity(getpid(), sizeof(set), &set) == 0) { + CPU_ZERO(&set); + CPU_SET(cpu, &set); + return sched_setaffinity(getpid(), sizeof(set), &set); + } + return 1; +} + +#endif /* __CPUIDLE_INFO_HW__ */ diff --git a/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c b/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c new file mode 100644 index 000000000..55e55b6b4 --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + * + * Based on SandyBridge monitor. Implements the new package C-states + * (PC8, PC9, PC10) coming with a specific Haswell (family 0x45) CPU. + */ + +#if defined(__i386__) || defined(__x86_64__) + +#include +#include +#include +#include + +#include "helpers/helpers.h" +#include "idle_monitor/cpupower-monitor.h" + +#define MSR_PKG_C8_RESIDENCY 0x00000630 +#define MSR_PKG_C9_RESIDENCY 0x00000631 +#define MSR_PKG_C10_RESIDENCY 0x00000632 + +#define MSR_TSC 0x10 + +enum intel_hsw_ext_id { PC8 = 0, PC9, PC10, HSW_EXT_CSTATE_COUNT, + TSC = 0xFFFF }; + +static int hsw_ext_get_count_percent(unsigned int self_id, double *percent, + unsigned int cpu); + +static cstate_t hsw_ext_cstates[HSW_EXT_CSTATE_COUNT] = { + { + .name = "PC8", + .desc = N_("Processor Package C8"), + .id = PC8, + .range = RANGE_PACKAGE, + .get_count_percent = hsw_ext_get_count_percent, + }, + { + .name = "PC9", + .desc = N_("Processor Package C9"), + .id = PC9, + .range = RANGE_PACKAGE, + .get_count_percent = hsw_ext_get_count_percent, + }, + { + .name = "PC10", + .desc = N_("Processor Package C10"), + .id = PC10, + .range = RANGE_PACKAGE, + .get_count_percent = hsw_ext_get_count_percent, + }, +}; + +static unsigned long long tsc_at_measure_start; +static unsigned long long tsc_at_measure_end; +static unsigned long long *previous_count[HSW_EXT_CSTATE_COUNT]; +static unsigned long long *current_count[HSW_EXT_CSTATE_COUNT]; +/* valid flag for all CPUs. If a MSR read failed it will be zero */ +static int *is_valid; + +static int hsw_ext_get_count(enum intel_hsw_ext_id id, unsigned long long *val, + unsigned int cpu) +{ + int msr; + + switch (id) { + case PC8: + msr = MSR_PKG_C8_RESIDENCY; + break; + case PC9: + msr = MSR_PKG_C9_RESIDENCY; + break; + case PC10: + msr = MSR_PKG_C10_RESIDENCY; + break; + case TSC: + msr = MSR_TSC; + break; + default: + return -1; + } + if (read_msr(cpu, msr, val)) + return -1; + return 0; +} + +static int hsw_ext_get_count_percent(unsigned int id, double *percent, + unsigned int cpu) +{ + *percent = 0.0; + + if (!is_valid[cpu]) + return -1; + + *percent = (100.0 * + (current_count[id][cpu] - previous_count[id][cpu])) / + (tsc_at_measure_end - tsc_at_measure_start); + + dprint("%s: previous: %llu - current: %llu - (%u)\n", + hsw_ext_cstates[id].name, previous_count[id][cpu], + current_count[id][cpu], cpu); + + dprint("%s: tsc_diff: %llu - count_diff: %llu - percent: %2.f (%u)\n", + hsw_ext_cstates[id].name, + (unsigned long long) tsc_at_measure_end - tsc_at_measure_start, + current_count[id][cpu] - previous_count[id][cpu], + *percent, cpu); + + return 0; +} + +static int hsw_ext_start(void) +{ + int num, cpu; + unsigned long long val; + + for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) { + hsw_ext_get_count(num, &val, cpu); + previous_count[num][cpu] = val; + } + } + hsw_ext_get_count(TSC, &tsc_at_measure_start, base_cpu); + return 0; +} + +static int hsw_ext_stop(void) +{ + unsigned long long val; + int num, cpu; + + hsw_ext_get_count(TSC, &tsc_at_measure_end, base_cpu); + + for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) { + is_valid[cpu] = !hsw_ext_get_count(num, &val, cpu); + current_count[num][cpu] = val; + } + } + return 0; +} + +struct cpuidle_monitor intel_hsw_ext_monitor; + +static struct cpuidle_monitor *hsw_ext_register(void) +{ + int num; + + if (cpupower_cpu_info.vendor != X86_VENDOR_INTEL + || cpupower_cpu_info.family != 6) + return NULL; + + switch (cpupower_cpu_info.model) { + case 0x45: /* HSW */ + break; + default: + return NULL; + } + + is_valid = calloc(cpu_count, sizeof(int)); + for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { + previous_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + current_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + } + intel_hsw_ext_monitor.name_len = strlen(intel_hsw_ext_monitor.name); + return &intel_hsw_ext_monitor; +} + +void hsw_ext_unregister(void) +{ + int num; + free(is_valid); + for (num = 0; num < HSW_EXT_CSTATE_COUNT; num++) { + free(previous_count[num]); + free(current_count[num]); + } +} + +struct cpuidle_monitor intel_hsw_ext_monitor = { + .name = "HaswellExtended", + .hw_states = hsw_ext_cstates, + .hw_states_num = HSW_EXT_CSTATE_COUNT, + .start = hsw_ext_start, + .stop = hsw_ext_stop, + .do_register = hsw_ext_register, + .unregister = hsw_ext_unregister, + .flags.needs_root = 1, + .overflow_s = 922000000 /* 922337203 seconds TSC overflow + at 20GHz */ +}; +#endif /* defined(__i386__) || defined(__x86_64__) */ diff --git a/tools/power/cpupower/utils/idle_monitor/idle_monitors.def b/tools/power/cpupower/utils/idle_monitor/idle_monitors.def new file mode 100644 index 000000000..0d6ba4dbb --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/idle_monitors.def @@ -0,0 +1,8 @@ +#if defined(__i386__) || defined(__x86_64__) +DEF(amd_fam14h) +DEF(intel_nhm) +DEF(intel_snb) +DEF(intel_hsw_ext) +DEF(mperf) +#endif +DEF(cpuidle_sysfs) diff --git a/tools/power/cpupower/utils/idle_monitor/idle_monitors.h b/tools/power/cpupower/utils/idle_monitor/idle_monitors.h new file mode 100644 index 000000000..e9e567ec8 --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/idle_monitors.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + * + * Based on the idea from Michael Matz + */ + +#ifndef _CPUIDLE_IDLE_MONITORS_H_ +#define _CPUIDLE_IDLE_MONITORS_H_ + +#define DEF(x) extern struct cpuidle_monitor x ##_monitor; +#include "idle_monitors.def" +#undef DEF +extern struct cpuidle_monitor *all_monitors[]; + +#endif /* _CPUIDLE_IDLE_MONITORS_H_ */ diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c new file mode 100644 index 000000000..ae6af354a --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + */ + +#if defined(__i386__) || defined(__x86_64__) + +#include +#include +#include +#include +#include + +#include + +#include "helpers/helpers.h" +#include "idle_monitor/cpupower-monitor.h" + +#define MSR_APERF 0xE8 +#define MSR_MPERF 0xE7 + +#define RDPRU ".byte 0x0f, 0x01, 0xfd" +#define RDPRU_ECX_MPERF 0 +#define RDPRU_ECX_APERF 1 + +#define MSR_TSC 0x10 + +#define MSR_AMD_HWCR 0xc0010015 + +enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT }; + +static int mperf_get_count_percent(unsigned int self_id, double *percent, + unsigned int cpu); +static int mperf_get_count_freq(unsigned int id, unsigned long long *count, + unsigned int cpu); +static struct timespec time_start, time_end; + +static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { + { + .name = "C0", + .desc = N_("Processor Core not idle"), + .id = C0, + .range = RANGE_THREAD, + .get_count_percent = mperf_get_count_percent, + }, + { + .name = "Cx", + .desc = N_("Processor Core in an idle state"), + .id = Cx, + .range = RANGE_THREAD, + .get_count_percent = mperf_get_count_percent, + }, + + { + .name = "Freq", + .desc = N_("Average Frequency (including boost) in MHz"), + .id = AVG_FREQ, + .range = RANGE_THREAD, + .get_count = mperf_get_count_freq, + }, +}; + +enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF }; +static int max_freq_mode; +/* + * The max frequency mperf is ticking at (in C0), either retrieved via: + * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency + * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time + * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen) + */ +static unsigned long max_frequency; + +static unsigned long long *tsc_at_measure_start; +static unsigned long long *tsc_at_measure_end; +static unsigned long long *mperf_previous_count; +static unsigned long long *aperf_previous_count; +static unsigned long long *mperf_current_count; +static unsigned long long *aperf_current_count; + +/* valid flag for all CPUs. If a MSR read failed it will be zero */ +static int *is_valid; + +static int mperf_get_tsc(unsigned long long *tsc) +{ + int ret; + + ret = read_msr(base_cpu, MSR_TSC, tsc); + if (ret) + dprint("Reading TSC MSR failed, returning %llu\n", *tsc); + return ret; +} + +static int get_aperf_mperf(int cpu, unsigned long long *aval, + unsigned long long *mval) +{ + unsigned long low_a, high_a; + unsigned long low_m, high_m; + int ret; + + /* + * Running on the cpu from which we read the registers will + * prevent APERF/MPERF from going out of sync because of IPI + * latency introduced by read_msr()s. + */ + if (mperf_monitor.flags.per_cpu_schedule) { + if (bind_cpu(cpu)) + return 1; + } + + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_RDPRU) { + asm volatile(RDPRU + : "=a" (low_a), "=d" (high_a) + : "c" (RDPRU_ECX_APERF)); + asm volatile(RDPRU + : "=a" (low_m), "=d" (high_m) + : "c" (RDPRU_ECX_MPERF)); + + *aval = ((low_a) | (high_a) << 32); + *mval = ((low_m) | (high_m) << 32); + + return 0; + } + + ret = read_msr(cpu, MSR_APERF, aval); + ret |= read_msr(cpu, MSR_MPERF, mval); + + return ret; +} + +static int mperf_init_stats(unsigned int cpu) +{ + unsigned long long aval, mval; + int ret; + + ret = get_aperf_mperf(cpu, &aval, &mval); + aperf_previous_count[cpu] = aval; + mperf_previous_count[cpu] = mval; + is_valid[cpu] = !ret; + + return 0; +} + +static int mperf_measure_stats(unsigned int cpu) +{ + unsigned long long aval, mval; + int ret; + + ret = get_aperf_mperf(cpu, &aval, &mval); + aperf_current_count[cpu] = aval; + mperf_current_count[cpu] = mval; + is_valid[cpu] = !ret; + + return 0; +} + +static int mperf_get_count_percent(unsigned int id, double *percent, + unsigned int cpu) +{ + unsigned long long aperf_diff, mperf_diff, tsc_diff; + unsigned long long timediff; + + if (!is_valid[cpu]) + return -1; + + if (id != C0 && id != Cx) + return -1; + + mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; + aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; + + if (max_freq_mode == MAX_FREQ_TSC_REF) { + tsc_diff = tsc_at_measure_end[cpu] - tsc_at_measure_start[cpu]; + *percent = 100.0 * mperf_diff / tsc_diff; + dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", + mperf_cstates[id].name, mperf_diff, tsc_diff); + } else if (max_freq_mode == MAX_FREQ_SYSFS) { + timediff = max_frequency * timespec_diff_us(time_start, time_end); + *percent = 100.0 * mperf_diff / timediff; + dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", + mperf_cstates[id].name, mperf_diff, timediff); + } else + return -1; + + if (id == Cx) + *percent = 100.0 - *percent; + + dprint("%s: previous: %llu - current: %llu - (%u)\n", + mperf_cstates[id].name, mperf_diff, aperf_diff, cpu); + dprint("%s: %f\n", mperf_cstates[id].name, *percent); + return 0; +} + +static int mperf_get_count_freq(unsigned int id, unsigned long long *count, + unsigned int cpu) +{ + unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; + + if (id != AVG_FREQ) + return 1; + + if (!is_valid[cpu]) + return -1; + + mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; + aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; + + if (max_freq_mode == MAX_FREQ_TSC_REF) { + /* Calculate max_freq from TSC count */ + tsc_diff = tsc_at_measure_end[cpu] - tsc_at_measure_start[cpu]; + time_diff = timespec_diff_us(time_start, time_end); + max_frequency = tsc_diff / time_diff; + } + + *count = max_frequency * ((double)aperf_diff / mperf_diff); + dprint("%s: Average freq based on %s maximum frequency:\n", + mperf_cstates[id].name, + (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read"); + dprint("max_frequency: %lu\n", max_frequency); + dprint("aperf_diff: %llu\n", aperf_diff); + dprint("mperf_diff: %llu\n", mperf_diff); + dprint("avg freq: %llu\n", *count); + return 0; +} + +static int mperf_start(void) +{ + int cpu; + + clock_gettime(CLOCK_REALTIME, &time_start); + + for (cpu = 0; cpu < cpu_count; cpu++) { + mperf_get_tsc(&tsc_at_measure_start[cpu]); + mperf_init_stats(cpu); + } + + return 0; +} + +static int mperf_stop(void) +{ + int cpu; + + for (cpu = 0; cpu < cpu_count; cpu++) { + mperf_measure_stats(cpu); + mperf_get_tsc(&tsc_at_measure_end[cpu]); + } + + clock_gettime(CLOCK_REALTIME, &time_end); + return 0; +} + +/* + * Mperf register is defined to tick at P0 (maximum) frequency + * + * Instead of reading out P0 which can be tricky to read out from HW, + * we use TSC counter if it reliably ticks at P0/mperf frequency. + * + * Still try to fall back to: + * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq + * on older Intel HW without invariant TSC feature. + * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but + * it's still double checked (MSR_AMD_HWCR)). + * + * On these machines the user would still get useful mperf + * stats when acpi-cpufreq driver is loaded. + */ +static int init_maxfreq_mode(void) +{ + int ret; + unsigned long long hwcr; + unsigned long min; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) + goto use_sysfs; + + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD || + cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { + /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf + * freq. + * A test whether hwcr is accessable/available would be: + * (cpupower_cpu_info.family > 0x10 || + * cpupower_cpu_info.family == 0x10 && + * cpupower_cpu_info.model >= 0x2)) + * This should be the case for all aperf/mperf + * capable AMD machines and is therefore safe to test here. + * Compare with Linus kernel git commit: acf01734b1747b1ec4 + */ + ret = read_msr(0, MSR_AMD_HWCR, &hwcr); + /* + * If the MSR read failed, assume a Xen system that did + * not explicitly provide access to it and assume TSC works + */ + if (ret != 0) { + dprint("TSC read 0x%x failed - assume TSC working\n", + MSR_AMD_HWCR); + return 0; + } else if (1 & (hwcr >> 24)) { + max_freq_mode = MAX_FREQ_TSC_REF; + return 0; + } else { /* Use sysfs max frequency if available */ } + } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { + /* + * On Intel we assume mperf (in C0) is ticking at same + * rate than TSC + */ + max_freq_mode = MAX_FREQ_TSC_REF; + return 0; + } +use_sysfs: + if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { + dprint("Cannot retrieve max freq from cpufreq kernel " + "subsystem\n"); + return -1; + } + max_freq_mode = MAX_FREQ_SYSFS; + max_frequency /= 1000; /* Default automatically to MHz value */ + return 0; +} + +/* + * This monitor provides: + * + * 1) Average frequency a CPU resided in + * This always works if the CPU has aperf/mperf capabilities + * + * 2) C0 and Cx (any sleep state) time a CPU resided in + * Works if mperf timer stops ticking in sleep states which + * seem to be the case on all current HW. + * Both is directly retrieved from HW registers and is independent + * from kernel statistics. + */ +struct cpuidle_monitor mperf_monitor; +struct cpuidle_monitor *mperf_register(void) +{ + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) + return NULL; + + if (init_maxfreq_mode()) + return NULL; + + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) + mperf_monitor.flags.per_cpu_schedule = 1; + + /* Free this at program termination */ + is_valid = calloc(cpu_count, sizeof(int)); + mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); + aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long)); + mperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); + aperf_current_count = calloc(cpu_count, sizeof(unsigned long long)); + tsc_at_measure_start = calloc(cpu_count, sizeof(unsigned long long)); + tsc_at_measure_end = calloc(cpu_count, sizeof(unsigned long long)); + mperf_monitor.name_len = strlen(mperf_monitor.name); + return &mperf_monitor; +} + +void mperf_unregister(void) +{ + free(mperf_previous_count); + free(aperf_previous_count); + free(mperf_current_count); + free(aperf_current_count); + free(tsc_at_measure_start); + free(tsc_at_measure_end); + free(is_valid); +} + +struct cpuidle_monitor mperf_monitor = { + .name = "Mperf", + .hw_states_num = MPERF_CSTATE_COUNT, + .hw_states = mperf_cstates, + .start = mperf_start, + .stop = mperf_stop, + .do_register = mperf_register, + .unregister = mperf_unregister, + .flags.needs_root = 1, + .overflow_s = 922000000 /* 922337203 seconds TSC overflow + at 20GHz */ +}; +#endif /* #if defined(__i386__) || defined(__x86_64__) */ diff --git a/tools/power/cpupower/utils/idle_monitor/nhm_idle.c b/tools/power/cpupower/utils/idle_monitor/nhm_idle.c new file mode 100644 index 000000000..16eaf006f --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/nhm_idle.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + * + * Based on Len Brown's turbostat tool. + */ + +#if defined(__i386__) || defined(__x86_64__) + +#include +#include +#include +#include + +#include "helpers/helpers.h" +#include "idle_monitor/cpupower-monitor.h" + +#define MSR_PKG_C3_RESIDENCY 0x3F8 +#define MSR_PKG_C6_RESIDENCY 0x3F9 +#define MSR_CORE_C3_RESIDENCY 0x3FC +#define MSR_CORE_C6_RESIDENCY 0x3FD + +#define MSR_TSC 0x10 + +#define NHM_CSTATE_COUNT 4 + +enum intel_nhm_id { C3 = 0, C6, PC3, PC6, TSC = 0xFFFF }; + +static int nhm_get_count_percent(unsigned int self_id, double *percent, + unsigned int cpu); + +static cstate_t nhm_cstates[NHM_CSTATE_COUNT] = { + { + .name = "C3", + .desc = N_("Processor Core C3"), + .id = C3, + .range = RANGE_CORE, + .get_count_percent = nhm_get_count_percent, + }, + { + .name = "C6", + .desc = N_("Processor Core C6"), + .id = C6, + .range = RANGE_CORE, + .get_count_percent = nhm_get_count_percent, + }, + + { + .name = "PC3", + .desc = N_("Processor Package C3"), + .id = PC3, + .range = RANGE_PACKAGE, + .get_count_percent = nhm_get_count_percent, + }, + { + .name = "PC6", + .desc = N_("Processor Package C6"), + .id = PC6, + .range = RANGE_PACKAGE, + .get_count_percent = nhm_get_count_percent, + }, +}; + +static unsigned long long tsc_at_measure_start; +static unsigned long long tsc_at_measure_end; +static unsigned long long *previous_count[NHM_CSTATE_COUNT]; +static unsigned long long *current_count[NHM_CSTATE_COUNT]; +/* valid flag for all CPUs. If a MSR read failed it will be zero */ +static int *is_valid; + +static int nhm_get_count(enum intel_nhm_id id, unsigned long long *val, + unsigned int cpu) +{ + int msr; + + switch (id) { + case C3: + msr = MSR_CORE_C3_RESIDENCY; + break; + case C6: + msr = MSR_CORE_C6_RESIDENCY; + break; + case PC3: + msr = MSR_PKG_C3_RESIDENCY; + break; + case PC6: + msr = MSR_PKG_C6_RESIDENCY; + break; + case TSC: + msr = MSR_TSC; + break; + default: + return -1; + } + if (read_msr(cpu, msr, val)) + return -1; + + return 0; +} + +static int nhm_get_count_percent(unsigned int id, double *percent, + unsigned int cpu) +{ + *percent = 0.0; + + if (!is_valid[cpu]) + return -1; + + *percent = (100.0 * + (current_count[id][cpu] - previous_count[id][cpu])) / + (tsc_at_measure_end - tsc_at_measure_start); + + dprint("%s: previous: %llu - current: %llu - (%u)\n", + nhm_cstates[id].name, previous_count[id][cpu], + current_count[id][cpu], cpu); + + dprint("%s: tsc_diff: %llu - count_diff: %llu - percent: %2.f (%u)\n", + nhm_cstates[id].name, + (unsigned long long) tsc_at_measure_end - tsc_at_measure_start, + current_count[id][cpu] - previous_count[id][cpu], + *percent, cpu); + + return 0; +} + +static int nhm_start(void) +{ + int num, cpu; + unsigned long long dbg, val; + + nhm_get_count(TSC, &tsc_at_measure_start, base_cpu); + + for (num = 0; num < NHM_CSTATE_COUNT; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) { + is_valid[cpu] = !nhm_get_count(num, &val, cpu); + previous_count[num][cpu] = val; + } + } + nhm_get_count(TSC, &dbg, base_cpu); + dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start); + return 0; +} + +static int nhm_stop(void) +{ + unsigned long long val; + unsigned long long dbg; + int num, cpu; + + nhm_get_count(TSC, &tsc_at_measure_end, base_cpu); + + for (num = 0; num < NHM_CSTATE_COUNT; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) { + is_valid[cpu] = !nhm_get_count(num, &val, cpu); + current_count[num][cpu] = val; + } + } + nhm_get_count(TSC, &dbg, base_cpu); + dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); + + return 0; +} + +struct cpuidle_monitor intel_nhm_monitor; + +struct cpuidle_monitor *intel_nhm_register(void) +{ + int num; + + if (cpupower_cpu_info.vendor != X86_VENDOR_INTEL) + return NULL; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC)) + return NULL; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) + return NULL; + + /* Free this at program termination */ + is_valid = calloc(cpu_count, sizeof(int)); + for (num = 0; num < NHM_CSTATE_COUNT; num++) { + previous_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + current_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + } + + intel_nhm_monitor.name_len = strlen(intel_nhm_monitor.name); + return &intel_nhm_monitor; +} + +void intel_nhm_unregister(void) +{ + int num; + + for (num = 0; num < NHM_CSTATE_COUNT; num++) { + free(previous_count[num]); + free(current_count[num]); + } + free(is_valid); +} + +struct cpuidle_monitor intel_nhm_monitor = { + .name = "Nehalem", + .hw_states_num = NHM_CSTATE_COUNT, + .hw_states = nhm_cstates, + .start = nhm_start, + .stop = nhm_stop, + .do_register = intel_nhm_register, + .unregister = intel_nhm_unregister, + .flags.needs_root = 1, + .overflow_s = 922000000 /* 922337203 seconds TSC overflow + at 20GHz */ +}; +#endif diff --git a/tools/power/cpupower/utils/idle_monitor/snb_idle.c b/tools/power/cpupower/utils/idle_monitor/snb_idle.c new file mode 100644 index 000000000..811d63ab1 --- /dev/null +++ b/tools/power/cpupower/utils/idle_monitor/snb_idle.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 2010,2011 Thomas Renninger , Novell Inc. + * + * Based on Len Brown's turbostat tool. + */ + +#if defined(__i386__) || defined(__x86_64__) + +#include +#include +#include +#include + +#include "helpers/helpers.h" +#include "idle_monitor/cpupower-monitor.h" + +#define MSR_PKG_C2_RESIDENCY 0x60D +#define MSR_PKG_C7_RESIDENCY 0x3FA +#define MSR_CORE_C7_RESIDENCY 0x3FE + +#define MSR_TSC 0x10 + +enum intel_snb_id { C7 = 0, PC2, PC7, SNB_CSTATE_COUNT, TSC = 0xFFFF }; + +static int snb_get_count_percent(unsigned int self_id, double *percent, + unsigned int cpu); + +static cstate_t snb_cstates[SNB_CSTATE_COUNT] = { + { + .name = "C7", + .desc = N_("Processor Core C7"), + .id = C7, + .range = RANGE_CORE, + .get_count_percent = snb_get_count_percent, + }, + { + .name = "PC2", + .desc = N_("Processor Package C2"), + .id = PC2, + .range = RANGE_PACKAGE, + .get_count_percent = snb_get_count_percent, + }, + { + .name = "PC7", + .desc = N_("Processor Package C7"), + .id = PC7, + .range = RANGE_PACKAGE, + .get_count_percent = snb_get_count_percent, + }, +}; + +static unsigned long long tsc_at_measure_start; +static unsigned long long tsc_at_measure_end; +static unsigned long long *previous_count[SNB_CSTATE_COUNT]; +static unsigned long long *current_count[SNB_CSTATE_COUNT]; +/* valid flag for all CPUs. If a MSR read failed it will be zero */ +static int *is_valid; + +static int snb_get_count(enum intel_snb_id id, unsigned long long *val, + unsigned int cpu) +{ + int msr; + + switch (id) { + case C7: + msr = MSR_CORE_C7_RESIDENCY; + break; + case PC2: + msr = MSR_PKG_C2_RESIDENCY; + break; + case PC7: + msr = MSR_PKG_C7_RESIDENCY; + break; + case TSC: + msr = MSR_TSC; + break; + default: + return -1; + } + if (read_msr(cpu, msr, val)) + return -1; + return 0; +} + +static int snb_get_count_percent(unsigned int id, double *percent, + unsigned int cpu) +{ + *percent = 0.0; + + if (!is_valid[cpu]) + return -1; + + *percent = (100.0 * + (current_count[id][cpu] - previous_count[id][cpu])) / + (tsc_at_measure_end - tsc_at_measure_start); + + dprint("%s: previous: %llu - current: %llu - (%u)\n", + snb_cstates[id].name, previous_count[id][cpu], + current_count[id][cpu], cpu); + + dprint("%s: tsc_diff: %llu - count_diff: %llu - percent: %2.f (%u)\n", + snb_cstates[id].name, + (unsigned long long) tsc_at_measure_end - tsc_at_measure_start, + current_count[id][cpu] - previous_count[id][cpu], + *percent, cpu); + + return 0; +} + +static int snb_start(void) +{ + int num, cpu; + unsigned long long val; + + for (num = 0; num < SNB_CSTATE_COUNT; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) { + snb_get_count(num, &val, cpu); + previous_count[num][cpu] = val; + } + } + snb_get_count(TSC, &tsc_at_measure_start, base_cpu); + return 0; +} + +static int snb_stop(void) +{ + unsigned long long val; + int num, cpu; + + snb_get_count(TSC, &tsc_at_measure_end, base_cpu); + + for (num = 0; num < SNB_CSTATE_COUNT; num++) { + for (cpu = 0; cpu < cpu_count; cpu++) { + is_valid[cpu] = !snb_get_count(num, &val, cpu); + current_count[num][cpu] = val; + } + } + return 0; +} + +struct cpuidle_monitor intel_snb_monitor; + +static struct cpuidle_monitor *snb_register(void) +{ + int num; + + if (cpupower_cpu_info.vendor != X86_VENDOR_INTEL + || cpupower_cpu_info.family != 6) + return NULL; + + switch (cpupower_cpu_info.model) { + case 0x2A: /* SNB */ + case 0x2D: /* SNB Xeon */ + case 0x3A: /* IVB */ + case 0x3E: /* IVB Xeon */ + case 0x3C: /* HSW */ + case 0x3F: /* HSW */ + case 0x45: /* HSW */ + case 0x46: /* HSW */ + break; + default: + return NULL; + } + + is_valid = calloc(cpu_count, sizeof(int)); + for (num = 0; num < SNB_CSTATE_COUNT; num++) { + previous_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + current_count[num] = calloc(cpu_count, + sizeof(unsigned long long)); + } + intel_snb_monitor.name_len = strlen(intel_snb_monitor.name); + return &intel_snb_monitor; +} + +void snb_unregister(void) +{ + int num; + free(is_valid); + for (num = 0; num < SNB_CSTATE_COUNT; num++) { + free(previous_count[num]); + free(current_count[num]); + } +} + +struct cpuidle_monitor intel_snb_monitor = { + .name = "SandyBridge", + .hw_states = snb_cstates, + .hw_states_num = SNB_CSTATE_COUNT, + .start = snb_start, + .stop = snb_stop, + .do_register = snb_register, + .unregister = snb_unregister, + .flags.needs_root = 1, + .overflow_s = 922000000 /* 922337203 seconds TSC overflow + at 20GHz */ +}; +#endif /* defined(__i386__) || defined(__x86_64__) */ -- cgit v1.2.3