diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /arch/x86/events/amd | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/events/amd')
-rw-r--r-- | arch/x86/events/amd/Makefile | 10 | ||||
-rw-r--r-- | arch/x86/events/amd/brs.c | 434 | ||||
-rw-r--r-- | arch/x86/events/amd/core.c | 1521 | ||||
-rw-r--r-- | arch/x86/events/amd/ibs.c | 1541 | ||||
-rw-r--r-- | arch/x86/events/amd/iommu.c | 489 | ||||
-rw-r--r-- | arch/x86/events/amd/iommu.h | 24 | ||||
-rw-r--r-- | arch/x86/events/amd/lbr.c | 439 | ||||
-rw-r--r-- | arch/x86/events/amd/power.c | 305 | ||||
-rw-r--r-- | arch/x86/events/amd/uncore.c | 789 |
9 files changed, 5552 insertions, 0 deletions
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile new file mode 100644 index 000000000..527d947eb --- /dev/null +++ b/arch/x86/events/amd/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o +obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o +amd-uncore-objs := uncore.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += iommu.o +endif diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000..f1bff153d --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian <eranian@google.com> + */ +#include <linux/kernel.h> +#include <linux/jump_label.h> +#include <asm/msr.h> +#include <asm/cpufeature.h> + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. + */ +static int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + return 0; +} + +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + +int amd_brs_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + + return ret; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return; + + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. + * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c new file mode 100644 index 000000000..04f4b96de --- /dev/null +++ b/arch/x86/events/amd/core.c @@ -0,0 +1,1521 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/perf_event.h> +#include <linux/jump_label.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/jiffies.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/nmi.h> + +#include "../perf_event.h" + +static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp); +static unsigned long perf_nmi_window; + +/* AMD Event 0xFFF: Merge. Used with Large Increment per Cycle events */ +#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) +#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) + +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + +static __initconst const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 amd_hw_cache_event_ids_f17h + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */ + [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */ + [C(RESULT_MISS)] = 0, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */ + [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */ + [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */ + [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */ + [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +/* + * AMD Performance Monitor K7 and later, up to and including Family 16h: + */ +static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + + return amd_perfmon_event_map[hw_event]; +} + +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) +{ + int offset; + + if (!index) + return index; + + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; + + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + +static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) +{ + if (!(x86_pmu.flags & PMU_FL_PAIR)) + return false; + + switch (amd_get_event_code(hwc)) { + case 0x003: return true; /* Retired SSE/AVX FLOPs */ + default: return false; + } +} + +DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); + +static int amd_core_hw_config(struct perf_event *event) +{ + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD64_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD64_EVENTSEL_HOSTONLY; + + if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) + event->hw.flags |= PERF_X86_EVENT_PAIR; + + if (has_branch_stack(event)) + return static_call(amd_pmu_branch_hw_config)(event); + + return 0; +} + +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return forward_event_to_ibs(event); + + if (has_branch_stack(event) && !x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + if (cmpxchg(nb->owners + i, event, NULL) == event) + break; + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling __amd_put_nb_event_constraints() + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + struct event_constraint *c) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old; + int idx, new = -1; + + if (!c) + c = &unconstrained; + + if (cpuc->is_fake) + return c; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + if (new == -1 || hwc->idx == idx) + /* assign free slot, prefer hwc->idx */ + old = cmpxchg(nb->owners + idx, NULL, event); + else if (nb->owners[idx] == event) + /* event already present */ + old = event; + else + continue; + + if (old && old != event) + continue; + + /* reassign to this slot */ + if (new != -1) + cmpxchg(nb->owners + new, event, NULL); + new = idx; + + /* already present, reuse */ + if (old == event) + break; + } + + if (new == -1) + return &emptyconstraint; + + return &nb->event_constraints[new]; +} + +static struct amd_nb *amd_alloc_nb(int cpu) +{ + struct amd_nb *nb; + int i; + + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); + if (!nb) + return NULL; + + nb->nb_id = -1; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + __set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +typedef void (amd_pmu_branch_reset_t)(void); +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); + +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.lbr_nr) + static_call(amd_pmu_branch_reset)(); + + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* + * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze + * and PerfCntrGLobalStatus.PerfCntrOvfl + */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); +} + +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, + cpu_to_node(cpu)); + if (!cpuc->lbr_sel) + return -ENOMEM; + + WARN_ON_ONCE(cpuc->amd_nb); + + if (!x86_pmu.amd_nb_constraints) + return 0; + + cpuc->amd_nb = amd_alloc_nb(cpu); + if (cpuc->amd_nb) + return 0; + + kfree(cpuc->lbr_sel); + cpuc->lbr_sel = NULL; + + return -ENOMEM; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; + struct amd_nb *nb; + int i, nb_id; + + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + amd_pmu_cpu_reset(cpu); + + if (!x86_pmu.amd_nb_constraints) + return; + + nb_id = topology_die_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + *onln = cpuc->amd_nb; + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + kfree(cpuhw->lbr_sel); + cpuhw->lbr_sel = NULL; + amd_pmu_cpu_reset(cpu); + + if (!x86_pmu.amd_nb_constraints) + return; + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } +} + +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + +/* + * When a PMC counter overflows, an NMI is used to process the event and + * reset the counter. NMI latency can result in the counter being updated + * before the NMI can run, which can result in what appear to be spurious + * NMIs. This function is intended to wait for the NMI to run and reset + * the counter to avoid possible unhandled NMI messages. + */ +#define OVERFLOW_WAIT_COUNT 50 + +static void amd_pmu_wait_on_overflow(int idx) +{ + unsigned int i; + + /* + * Wait for the counter to be reset if it has overflowed. This loop + * should exit very, very quickly, but just in case, don't wait + * forever... + */ + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { + if (!static_call(amd_pmu_test_overflow)(idx)) + break; + + /* Might be in IRQ context, so can't sleep */ + udelay(1); + } +} + +static void amd_pmu_check_overflow(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + /* + * This shouldn't be called from NMI context, but add a safeguard here + * to return, since if we're in NMI context we can't wait for an NMI + * to reset an overflowed counter value. + */ + if (in_nmi()) + return; + + /* + * Check each counter for overflow and wait for it to be reset by the + * NMI if it has overflowed. This relies on the fact that all active + * counters are always enabled when this function is called and + * ARCH_PERFMON_EVENTSEL_INT is always set. + */ + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_wait_on_overflow(idx); + } +} + +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. + */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static __always_inline void amd_pmu_core_enable_all(void) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_lbr_enable_all(); + amd_pmu_core_enable_all(); +} + +static void amd_pmu_disable_event(struct perf_event *event) +{ + x86_pmu_disable_event(event); + + /* + * This can be called from NMI context (via x86_pmu_stop). The counter + * may have overflowed, but either way, we'll never see it get reset + * by the NMI if we're already in the NMI. And the NMI latency support + * below will take care of any pending NMI that might have been + * generated by the overflow. + */ + if (in_nmi()) + return; + + amd_pmu_wait_on_overflow(event->hw.idx); +} + +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static __always_inline void amd_pmu_core_disable_all(void) +{ + amd_pmu_set_global_ctl(0); +} + +static void amd_pmu_v2_disable_all(void) +{ + amd_pmu_core_disable_all(); + amd_pmu_lbr_disable_all(); + amd_pmu_check_overflow(); +} + +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); + +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + static_call(amd_pmu_branch_add)(event); +} + +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + static_call(amd_pmu_branch_del)(event); +} + +/* + * Because of NMI latency, if multiple PMC counters are active or other sources + * of NMIs are received, the perf NMI handler can handle one or more overflowed + * PMC counters outside of the NMI associated with the PMC overflow. If the NMI + * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel + * back-to-back NMI support won't be active. This PMC handler needs to take into + * account that this can occur, otherwise this could result in unknown NMI + * messages being issued. Examples of this is PMC overflow while in the NMI + * handler when multiple PMCs are active or PMC overflow while handling some + * other source of an NMI. + * + * Attempt to mitigate this by creating an NMI window in which un-handled NMIs + * received during this window will be claimed. This prevents extending the + * window past when it is possible that latent NMIs should be received. The + * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has + * handled a counter. When an un-handled NMI is received, it will be claimed + * only if arriving within that window. + */ +static inline int amd_pmu_adjust_nmi_window(int handled) +{ + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. + */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int handled; + int pmu_enabled; + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + amd_brs_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_brs_enable_all(); + + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 reserved, status, mask; + bool pmu_enabled; + + /* + * Save the PMU state as it needs to be restored when leaving the + * handler + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* Stop counting but do not disable LBR */ + amd_pmu_core_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + /* Read branch records before unfreezing */ + if (status & GLOBAL_STATUS_LBRS_FROZEN) { + amd_pmu_lbr_read(); + status &= ~GLOBAL_STATUS_LBRS_FROZEN; + } + + reserved = status & ~amd_pmu_global_cntr_mask; + if (reserved) + pr_warn_once("Reserved PerfCntrGlobalStatus bits are set (0x%llx), please consider updating microcode\n", + reserved); + + /* Clear any reserved bits set by buggy microcode */ + status &= amd_pmu_global_cntr_mask; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + status &= ~mask; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (has_branch_stack(event)) { + data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); + + /* Clear overflow and freeze bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_core_enable_all(); + + return amd_pmu_adjust_nmi_window(handled); +} + +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; + + return __amd_get_nb_event_constraints(cpuc, event, NULL); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* moved to uncore.c */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static struct event_constraint pair_constraint; + +static struct event_constraint * +amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (amd_is_pair_event_code(hwc)) + return &pair_constraint; + + return &unconstrained; +} + +static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (is_counter_pair(hwc)) + --cpuc->n_pair; +} + +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + +static void amd_pmu_limit_period(struct perf_event *event, s64 *left) +{ + /* + * Decrease period by the depth of the BRS feature to get the last N + * taken branches and approximate the desired period + */ + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) + *left -= x86_pmu.lbr_nr; +} + +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .addr_offset = amd_pmu_addr_offset, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, + + .amd_nb_constraints = 1, +}; + +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_branches_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_branches = { + .name = "caps", + .attrs = amd_pmu_branches_attrs, + .is_visible = amd_branches_is_visible, +}; + +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? + attr->mode : 0; +} + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + +#endif /* CONFIG_PERF_EVENTS_AMD_BRS */ + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_branches, +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + &group_events_amd_brs, +#endif + NULL, +}; + +static int __init amd_core_pmu_init(void) +{ + union cpuid_0x80000022_ebx ebx; + u64 even_ctr_mask = 0ULL; + int i; + + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + return 0; + + /* Avoid calculating the value each time in the NMI handler */ + perf_nmi_window = msecs_to_jiffies(100); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * amd_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); + } + + /* + * AMD Core perfctr has separate MSRs for the NB events, see + * the amd/uncore.c driver. + */ + x86_pmu.amd_nb_constraints = 0; + + if (boot_cpu_data.x86 == 0x15) { + pr_cont("Fam15h "); + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + } + if (boot_cpu_data.x86 >= 0x17) { + pr_cont("Fam17h+ "); + /* + * Family 17h and compatibles have constraints for Large + * Increment per Cycle events: they may only be assigned an + * even numbered counter that has a consecutive adjacent odd + * numbered counter following it. + */ + for (i = 0; i < x86_pmu.num_counters - 1; i += 2) + even_ctr_mask |= BIT_ULL(i); + + pair_constraint = (struct event_constraint) + __EVENT_CONSTRAINT(0, even_ctr_mask, 0, + x86_pmu.num_counters / 2, 0, + PERF_X86_EVENT_PAIR); + + x86_pmu.get_event_constraints = amd_get_event_constraints_f17h; + x86_pmu.put_event_constraints = amd_put_event_constraints_f17h; + x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE; + x86_pmu.flags |= PMU_FL_PAIR; + } + + /* LBR and BRS are mutually exclusive features */ + if (!amd_pmu_lbr_init()) { + /* LBR requires flushing on context switch */ + x86_pmu.sched_task = amd_pmu_lbr_sched_task; + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + } else if (!amd_brs_init()) { + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_brs_sched_task; + x86_pmu.limit_period = amd_pmu_limit_period; + + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); + static_call_update(amd_pmu_branch_reset, amd_brs_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); + + /* + * put_event_constraints callback same as Fam17h, set above + */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); + } + + x86_pmu.attr_update = amd_attr_update; + + pr_cont("core perfctr, "); + return 0; +} + +__init int amd_pmu_init(void) +{ + int ret; + + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + ret = amd_core_pmu_init(); + if (ret) + return ret; + + if (num_possible_cpus() == 1) { + /* + * No point in allocating data structures to serialize + * against other CPUs, when there is only the one CPU. + */ + x86_pmu.amd_nb_constraints = 0; + } + + if (boot_cpu_data.x86 >= 0x17) + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids)); + else + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + + return 0; +} + +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + +void amd_pmu_enable_virt(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + amd_pmu_reload_virt(); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. + */ + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + /* Reload all events */ + amd_pmu_reload_virt(); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c new file mode 100644 index 000000000..37cbbc5c6 --- /dev/null +++ b/arch/x86/events/amd/ibs.c @@ -0,0 +1,1541 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/pci.h> +#include <linux/ptrace.h> +#include <linux/syscore_ops.h> +#include <linux/sched/clock.h> + +#include <asm/apic.h> + +#include "../perf_event.h" + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> +#include <asm/amd-ibs.h> + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + + +/* + * IBS states: + * + * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken + * and any further add()s must fail. + * + * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are + * complicated by the fact that the IBS hardware can send late NMIs (ie. after + * we've cleared the EN bit). + * + * In order to consume these late NMIs we have the STOPPED state, any NMI that + * happens after we've cleared the EN state will clear this bit and report the + * NMI handled (this is fundamentally racy in the face or multiple NMI sources, + * someone else can consume our BIT and our NMI will go unhandled). + * + * And since we cannot set/clear this separate bit together with the EN bit, + * there are races; if we cleared STARTED early, an NMI could land in + * between clearing STARTED and clearing the EN bit (in fact multiple NMIs + * could happen if the period is small enough), and consume our STOPPED bit + * and trigger streams of unhandled NMIs. + * + * If, however, we clear STARTED late, an NMI can hit between clearing the + * EN bit and clearing STARTED, still see STARTED set and process the event. + * If this event will have the VALID bit clear, we bail properly, but this + * is not a given. With VALID set we can end up calling pmu::stop() again + * (the throttle logic) and trigger the WARNs in there. + * + * So what we do is set STOPPING before clearing EN to avoid the pmu::stop() + * nesting, and clear STARTED late, so that we have a well defined state over + * the clearing of the EN bit. + * + * XXX: we could probably be using !atomic bitops for all this. + */ + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + IBS_STOPPED = 3, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + unsigned int fetch_count_reset_broken : 1; + unsigned int fetch_ignore_if_zero_rip : 1; + struct cpu_perf_ibs __percpu *pcpu; + + u64 (*get_count)(u64 config); +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * core pmu config -> IBS config + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + */ +static int core_pmu_ibs_config(struct perf_event *event, u64 *config) +{ + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + +/* + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + */ +int forward_event_to_ibs(struct perf_event *event) +{ + u64 config = 0; + + if (!event->attr.precise_ip || event->attr.precise_ip > 2) + return -EOPNOTSUPP; + + if (!core_pmu_ibs_config(event, &config)) { + event->attr.type = perf_ibs_op.pmu.type; + event->attr.config = config; + } + return -ENOENT; +} + +static int perf_ibs_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (!perf_ibs) + return -ENOENT; + + config = event->attr.config; + + if (event->pmu != &perf_ibs->pmu) + return -ENOENT; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + + return 0; +} + +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config; + + return fetch_ctl.fetch_cnt << 4; +} + +static u64 get_ibs_op_count(u64 config) +{ + union ibs_op_ctl op_ctl = (union ibs_op_ctl)config; + u64 count = 0; + + /* + * If the internal 27-bit counter rolled over, the count is MaxCnt + * and the lower 7 bits of CurCnt are randomized. + * Otherwise CurCnt has the full 27-bit current counter value. + */ + if (op_ctl.op_val) { + count = op_ctl.opmaxcnt << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + count += op_ctl.opmaxcnt_ext << 20; + } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { + count = op_ctl.opcurcnt; + } + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + u64 tmp = hwc->config | config; + + if (perf_ibs->fetch_count_reset_broken) + wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask); + + wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + if (boot_cpu_data.x86 == 0x10) + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period, config = 0; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { + config |= period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + config |= period >> 4; + + /* + * Set STARTED before enabling the hardware, such that a subsequent NMI + * must observe it. + */ + set_bit(IBS_STARTED, pcpu->state); + clear_bit(IBS_STOPPING, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, config); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + if (test_and_set_bit(IBS_STOPPING, pcpu->state)) + return; + + stopping = test_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + /* + * Set STOPPED before disabling the hardware, such that it + * must be visible to NMIs the moment we clear the EN bit, + * at which point we can generate an !VALID sample which + * we need to consume. + */ + set_bit(IBS_STOPPED, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + /* + * Clear STARTED after disabling the hardware; if it were + * cleared before an NMI hitting after the clear but before + * clearing the EN bit might think it a spurious NMI and not + * handle it. + * + * Clearing it after, however, creates the problem of the NMI + * handler seeing STARTED but not having a valid sample. + */ + clear_bit(IBS_STARTED, pcpu->state); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); +} + +static void perf_ibs_read(struct perf_event *event) { } + +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + &empty_caps_group, + NULL, +}; + +PMU_FORMAT_ATTR(rand_en, "config:57"); +PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; +} + +static struct attribute *rand_en_attrs[] = { + &format_attr_rand_en.attr, + NULL, +}; + +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + &empty_caps_group, + NULL, +}; + +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + &group_zen4_ibs_extensions, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, + &group_op_l3missonly, + &group_zen4_ibs_extensions, + NULL, +}; + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + + .get_count = get_ibs_fetch_count, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | + IBS_OP_CUR_CNT_RAND, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + + .get_count = get_ibs_op_count, +}; + +static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_op = PERF_MEM_OP_NA; + + if (op_data3->ld_op) + data_src->mem_op = PERF_MEM_OP_LOAD; + else if (op_data3->st_op) + data_src->mem_op = PERF_MEM_OP_STORE; +} + +/* + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has + * more fine granular DataSrc encodings. Others have coarse. + */ +static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) +{ + if (ibs_caps & IBS_CAPS_ZEN4) + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; + + return op_data2->data_src_lo; +} + +static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src = perf_ibs_data_src(op_data2); + + data_src->mem_lvl = 0; + + /* + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached + * memory accesses. So, check DcUcMemAcc bit early. + */ + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; + return; + } + + /* L1 Hit */ + if (op_data3->dc_miss == 0) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + return; + } + + /* L2 Hit */ + if (op_data3->l2_miss == 0) { + /* Erratum #1293 */ + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* + * OP_DATA2 is valid only for load ops. Skip all checks which + * uses OP_DATA2[DataSrc]. + */ + if (data_src->mem_op != PERF_MEM_OP_LOAD) + goto check_mab; + + /* L3 Hit */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | + PERF_MEM_LVL_HIT; + return; + } + } + + /* A peer cache in a near CCX */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + return; + } + + /* A peer cache in a far CCX */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* DRAM */ + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { + if (op_data2->rmt_node == 0) + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + else + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + return; + } + + /* PMEM */ + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* Extension Memory */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* IO */ + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_IO; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + +check_mab: + /* + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding + * DC misses. However, such data may come from any level in mem + * hierarchy. IBS provides detail about both MAB as well as actual + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set + * MAB only when IBS fails to provide DataSrc. + */ + if (op_data3->dc_miss_no_mab_alloc) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + return; + } + + data_src->mem_lvl = PERF_MEM_LVL_NA; +} + +static bool perf_ibs_cache_hit_st_valid(void) +{ + /* 0: Uninitialized, 1: Valid, -1: Invalid */ + static int cache_hit_st_valid; + + if (unlikely(!cache_hit_st_valid)) { + if (boot_cpu_data.x86 == 0x19 && + (boot_cpu_data.x86_model <= 0xF || + (boot_cpu_data.x86_model >= 0x20 && + boot_cpu_data.x86_model <= 0x5F))) { + cache_hit_st_valid = -1; + } else { + cache_hit_st_valid = 1; + } + } + + return cache_hit_st_valid == 1; +} + +static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src; + + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + + if (!perf_ibs_cache_hit_st_valid() || + data_src->mem_op != PERF_MEM_OP_LOAD || + data_src->mem_lvl & PERF_MEM_LVL_L1 || + data_src->mem_lvl & PERF_MEM_LVL_L2 || + op_data2->cache_hit_st) + return; + + ibs_data_src = perf_ibs_data_src(op_data2); + + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } +} + +static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_dtlb = PERF_MEM_TLB_NA; + + if (!op_data3->dc_lin_addr_valid) + return; + + if (!op_data3->dc_l1tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; + return; + } + + if (!op_data3->dc_l2tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; + return; + } + + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; +} + +static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_lock = PERF_MEM_LOCK_NA; + + if (op_data3->dc_locked_op) + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; +} + +#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) + +static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, + struct perf_sample_data *data, + union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3) +{ + perf_ibs_get_mem_lvl(op_data2, op_data3, data); + perf_ibs_get_mem_snoop(op_data2, data); + perf_ibs_get_tlb_lvl(op_data3, data); + perf_ibs_get_mem_lock(op_data3, data); +} + +static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, + union ibs_op_data3 *op_data3) +{ + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; + + /* Erratum #1293 */ + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + /* + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. + * DataSrc=0 is 'No valid status' and RmtNode is invalid when + * DataSrc=0. + */ + val = 0; + } + return val; +} + +static void perf_ibs_parse_ld_st_data(__u64 sample_type, + struct perf_ibs_data *ibs_data, + struct perf_sample_data *data) +{ + union ibs_op_data3 op_data3; + union ibs_op_data2 op_data2; + union ibs_op_data op_data; + + data->data_src.val = PERF_MEM_NA; + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + + perf_ibs_get_mem_op(&op_data3, data); + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && + data->data_src.mem_op != PERF_MEM_OP_STORE) + return; + + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && + data->data_src.mem_op == PERF_MEM_OP_LOAD) { + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { + data->weight.var1_dw = op_data3.dc_miss_lat; + data->weight.var2_w = op_data.tag_to_ret_ctr; + } else if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = op_data3.dc_miss_lat; + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, + int check_rip) +{ + if (sample_type & PERF_SAMPLE_RAW || + (perf_ibs == &perf_ibs_op && + (sample_type & PERF_SAMPLE_DATA_SRC || + sample_type & PERF_SAMPLE_WEIGHT_TYPE || + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) + return perf_ibs->offset_max; + else if (check_rip) + return 3; + return 1; +} + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period, new_config = 0; + + if (!test_bit(IBS_STARTED, pcpu->state)) { +fail: + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incoming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + if (test_and_clear_bit(IBS_STOPPED, pcpu->state)) + return 1; + + return 0; + } + + if (WARN_ON_ONCE(!event)) + goto fail; + + hwc = &event->hw; + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + goto fail; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + /* + * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } + if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) { + rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++); + size++; + } + } + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + /* Workaround for erratum #1197 */ + if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) + goto out; + + set_linear_ip(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw = (struct perf_raw_record){ + .frag = { + .size = sizeof(u32) + ibs_data.size, + .data = ibs_data.data, + }, + }; + data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; + } + + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) { + perf_ibs_stop(event, 0); + } else { + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + new_config = period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) + new_config |= *config & IBS_OP_CUR_CNT_RAND; + } + new_config |= period >> 4; + + perf_ibs_enable_event(perf_ibs, hwc, new_config); + } + + perf_event_update_userpage(event); + + return 1; +} + +static int +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + u64 stamp = sched_clock(); + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + perf_sample_event_took(sched_clock() - stamp); + + return handled; +} +NOKPROBE_SYMBOL(perf_ibs_nmi_handler); + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + +static __init int perf_ibs_fetch_init(void) +{ + /* + * Some chips fail to reset the fetch count when it is written; instead + * they need a 0-1 transition of IbsFetchEn. + */ + if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) + perf_ibs_fetch.fetch_count_reset_broken = 1; + + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) + perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; + + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} + +static __init int perf_ibs_op_init(void) +{ + if (ibs_caps & IBS_CAPS_OPCNT) + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + } + + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; + + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return ret; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ + +static __init int perf_event_ibs_init(void) +{ + return 0; +} + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", + value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + pr_debug("No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static void force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + pr_debug("No EILVT entry available\n"); + return; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) + goto out; + + pr_info("LVT offset %d assigned\n", offset); + + return; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return; +} + +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu) +{ + setup_APIC_ibs(); + return 0; +} + +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + +static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu) +{ + clear_APIC_ibs(); + return 0; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + return -EINVAL; + + perf_ibs_pm_init(); + + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + /* + * x86_pmu_amd_ibs_starting_cpu will be called from core on + * all online cpus. + */ + cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, + "perf/x86/amd/ibs:starting", + x86_pmu_amd_ibs_starting_cpu, + x86_pmu_amd_ibs_dying_cpu); + + return perf_event_ibs_init(); +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c new file mode 100644 index 000000000..b15f7b950 --- /dev/null +++ b/arch/x86/events/amd/iommu.c @@ -0,0 +1,489 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + * + * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation + */ + +#define pr_fmt(fmt) "perf/amd_iommu: " fmt + +#include <linux/perf_event.h> +#include <linux/init.h> +#include <linux/cpumask.h> +#include <linux/slab.h> +#include <linux/amd-iommu.h> + +#include "../perf_event.h" +#include "iommu.h" + +/* iommu pmu conf masks */ +#define GET_CSOURCE(x) ((x)->conf & 0xFFULL) +#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) +#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL) +#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL) + +/* iommu pmu conf1 masks */ +#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL) +#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) +#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) + +#define IOMMU_NAME_SIZE 16 + +struct perf_amd_iommu { + struct list_head list; + struct pmu pmu; + struct amd_iommu *iommu; + char name[IOMMU_NAME_SIZE]; + u8 max_banks; + u8 max_counters; + u64 cntr_assign_mask; + raw_spinlock_t lock; +}; + +static LIST_HEAD(perf_amd_iommu_list); + +/*--------------------------------------------- + * sysfs format attributes + *---------------------------------------------*/ +PMU_FORMAT_ATTR(csource, "config:0-7"); +PMU_FORMAT_ATTR(devid, "config:8-23"); +PMU_FORMAT_ATTR(domid, "config:24-39"); +PMU_FORMAT_ATTR(pasid, "config:40-59"); +PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); +PMU_FORMAT_ATTR(domid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(pasid_mask, "config1:32-51"); + +static struct attribute *iommu_format_attrs[] = { + &format_attr_csource.attr, + &format_attr_devid.attr, + &format_attr_pasid.attr, + &format_attr_domid.attr, + &format_attr_devid_mask.attr, + &format_attr_pasid_mask.attr, + &format_attr_domid_mask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_format_group = { + .name = "format", + .attrs = iommu_format_attrs, +}; + +/*--------------------------------------------- + * sysfs events attributes + *---------------------------------------------*/ +static struct attribute_group amd_iommu_events_group = { + .name = "events", +}; + +struct amd_iommu_event_desc { + struct device_attribute attr; + const char *event; +}; + +static ssize_t _iommu_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct amd_iommu_event_desc *event = + container_of(attr, struct amd_iommu_event_desc, attr); + return sprintf(buf, "%s\n", event->event); +} + +#define AMD_IOMMU_EVENT_DESC(_name, _event) \ +{ \ + .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ + .event = _event, \ +} + +static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { + AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), + AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), + AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), + AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), + AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), + AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), + AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), + AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), + AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), + AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), + AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), + AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"), + AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"), + AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"), + AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"), + AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"), + { /* end: all zeroes */ }, +}; + +/*--------------------------------------------- + * sysfs cpumask attributes + *---------------------------------------------*/ +static cpumask_t iommu_cpumask; + +static ssize_t _iommu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); +} +static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); + +static struct attribute *iommu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_cpumask_group = { + .attrs = iommu_cpumask_attrs, +}; + +/*---------------------------------------------*/ + +static int get_next_avail_iommu_bnk_cntr(struct perf_event *event) +{ + struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu); + int max_cntrs = piommu->max_counters; + int max_banks = piommu->max_banks; + u32 shift, bank, cntr; + unsigned long flags; + int retval; + + raw_spin_lock_irqsave(&piommu->lock, flags); + + for (bank = 0; bank < max_banks; bank++) { + for (cntr = 0; cntr < max_cntrs; cntr++) { + shift = bank + (bank*3) + cntr; + if (piommu->cntr_assign_mask & BIT_ULL(shift)) { + continue; + } else { + piommu->cntr_assign_mask |= BIT_ULL(shift); + event->hw.iommu_bank = bank; + event->hw.iommu_cntr = cntr; + retval = 0; + goto out; + } + } + } + retval = -ENOSPC; +out: + raw_spin_unlock_irqrestore(&piommu->lock, flags); + return retval; +} + +static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, + u8 bank, u8 cntr) +{ + unsigned long flags; + int max_banks, max_cntrs; + int shift = 0; + + max_banks = perf_iommu->max_banks; + max_cntrs = perf_iommu->max_counters; + + if ((bank > max_banks) || (cntr > max_cntrs)) + return -EINVAL; + + shift = bank + cntr + (bank*3); + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + perf_iommu->cntr_assign_mask &= ~(1ULL<<shift); + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + + return 0; +} + +static int perf_iommu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* test the event attr type check for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * IOMMU counters are shared across all cores. + * Therefore, it does not support per-process mode. + * Also, it does not support event sampling mode. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + /* update the hw_perf_event struct with the iommu config data */ + hwc->conf = event->attr.config; + hwc->conf1 = event->attr.config1; + + return 0; +} + +static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev) +{ + return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu; +} + +static void perf_iommu_enable_event(struct perf_event *ev) +{ + struct amd_iommu *iommu = perf_event_2_iommu(ev); + struct hw_perf_event *hwc = &ev->hw; + u8 bank = hwc->iommu_bank; + u8 cntr = hwc->iommu_cntr; + u64 reg = 0ULL; + + reg = GET_CSOURCE(hwc); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, ®); + + reg = GET_DEVID_MASK(hwc); + reg = GET_DEVID(hwc) | (reg << 32); + if (reg) + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, ®); + + reg = GET_PASID_MASK(hwc); + reg = GET_PASID(hwc) | (reg << 32); + if (reg) + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, ®); + + reg = GET_DOMID_MASK(hwc); + reg = GET_DOMID(hwc) | (reg << 32); + if (reg) + reg |= BIT(31); + amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, ®); +} + +static void perf_iommu_disable_event(struct perf_event *event) +{ + struct amd_iommu *iommu = perf_event_2_iommu(event); + struct hw_perf_event *hwc = &event->hw; + u64 reg = 0ULL; + + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_SRC_REG, ®); +} + +static void perf_iommu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + /* + * To account for power-gating, which prevents write to + * the counter, we need to enable the counter + * before setting up counter register. + */ + perf_iommu_enable_event(event); + + if (flags & PERF_EF_RELOAD) { + u64 count = 0; + struct amd_iommu *iommu = perf_event_2_iommu(event); + + /* + * Since the IOMMU PMU only support counting mode, + * the counter always start with value zero. + */ + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_REG, &count); + } + + perf_event_update_userpage(event); +} + +static void perf_iommu_read(struct perf_event *event) +{ + u64 count; + struct hw_perf_event *hwc = &event->hw; + struct amd_iommu *iommu = perf_event_2_iommu(event); + + if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, + IOMMU_PC_COUNTER_REG, &count)) + return; + + /* IOMMU pc counter register is only 48 bits */ + count &= GENMASK_ULL(47, 0); + + /* + * Since the counter always start with value zero, + * simply just accumulate the count for the event. + */ + local64_add(count, &event->count); +} + +static void perf_iommu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * To account for power-gating, in which reading the counter would + * return zero, we need to read the register before disabling. + */ + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; + + perf_iommu_disable_event(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; +} + +static int perf_iommu_add(struct perf_event *event, int flags) +{ + int retval; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* request an iommu bank/counter */ + retval = get_next_avail_iommu_bnk_cntr(event); + if (retval) + return retval; + + if (flags & PERF_EF_START) + perf_iommu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_iommu_del(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + perf_iommu_stop(event, PERF_EF_UPDATE); + + /* clear the assigned iommu bank/counter */ + clear_avail_iommu_bnk_cntr(perf_iommu, + hwc->iommu_bank, hwc->iommu_cntr); + + perf_event_update_userpage(event); +} + +static __init int _init_events_attrs(void) +{ + int i = 0, j; + struct attribute **attrs; + + while (amd_iommu_v2_event_descs[i].attr.attr.name) + i++; + + attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + for (j = 0; j < i; j++) + attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; + + amd_iommu_events_group.attrs = attrs; + return 0; +} + +static const struct attribute_group *amd_iommu_attr_groups[] = { + &amd_iommu_format_group, + &amd_iommu_cpumask_group, + &amd_iommu_events_group, + NULL, +}; + +static const struct pmu iommu_pmu __initconst = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_iommu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, +}; + +static __init int init_one_iommu(unsigned int idx) +{ + struct perf_amd_iommu *perf_iommu; + int ret; + + perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL); + if (!perf_iommu) + return -ENOMEM; + + raw_spin_lock_init(&perf_iommu->lock); + + perf_iommu->pmu = iommu_pmu; + perf_iommu->iommu = get_amd_iommu(idx); + perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx); + perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx); + + if (!perf_iommu->iommu || + !perf_iommu->max_banks || + !perf_iommu->max_counters) { + kfree(perf_iommu); + return -EINVAL; + } + + snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx); + + ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1); + if (!ret) { + pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n", + idx, perf_iommu->max_banks, perf_iommu->max_counters); + list_add_tail(&perf_iommu->list, &perf_amd_iommu_list); + } else { + pr_warn("Error initializing IOMMU %d.\n", idx); + kfree(perf_iommu); + } + return ret; +} + +static __init int amd_iommu_pc_init(void) +{ + unsigned int i, cnt = 0; + int ret; + + /* Make sure the IOMMU PC resource is available */ + if (!amd_iommu_pc_supported()) + return -ENODEV; + + ret = _init_events_attrs(); + if (ret) + return ret; + + /* + * An IOMMU PMU is specific to an IOMMU, and can function independently. + * So we go through all IOMMUs and ignore the one that fails init + * unless all IOMMU are failing. + */ + for (i = 0; i < amd_iommu_get_num_iommus(); i++) { + ret = init_one_iommu(i); + if (!ret) + cnt++; + } + + if (!cnt) { + kfree(amd_iommu_events_group.attrs); + return -ENODEV; + } + + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); + return 0; +} + +device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h new file mode 100644 index 000000000..e6310c635 --- /dev/null +++ b/arch/x86/events/amd/iommu.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + */ + +#ifndef _PERF_EVENT_AMD_IOMMU_H_ +#define _PERF_EVENT_AMD_IOMMU_H_ + +/* iommu pc mmio region register indexes */ +#define IOMMU_PC_COUNTER_REG 0x00 +#define IOMMU_PC_COUNTER_SRC_REG 0x08 +#define IOMMU_PC_PASID_MATCH_REG 0x10 +#define IOMMU_PC_DOMID_MATCH_REG 0x18 +#define IOMMU_PC_DEVID_MATCH_REG 0x20 +#define IOMMU_PC_COUNTER_REPORT_REG 0x28 + +/* maximum specified bank/counters */ +#define PC_MAX_SPEC_BNKS 64 +#define PC_MAX_SPEC_CNTRS 16 + +#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c new file mode 100644 index 000000000..38a75216c --- /dev/null +++ b/arch/x86/events/amd/lbr.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/perf_event.h> +#include <asm/perf_event.h> + +#include "../perf_event.h" + +/* LBR Branch Select valid bits */ +#define LBR_SELECT_MASK 0x1ff + +/* + * LBR Branch Select filter bits which when set, ensures that the + * corresponding type of branches are not recorded + */ +#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ +#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ +#define LBR_SELECT_JCC 2 /* Conditional branches */ +#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ +#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ +#define LBR_SELECT_RET_NEAR 5 /* Near returns */ +#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ +#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */ +#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ + +#define LBR_KERNEL BIT(LBR_SELECT_KERNEL) +#define LBR_USER BIT(LBR_SELECT_USER) +#define LBR_JCC BIT(LBR_SELECT_JCC) +#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) +#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) +#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) +#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) +#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) +#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) +#define LBR_NOT_SUPP -1 /* unsupported filter */ +#define LBR_IGNORE 0 + +#define LBR_ANY \ + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) + +struct branch_entry { + union { + struct { + u64 ip:58; + u64 ip_sign_ext:5; + u64 mispredict:1; + } split; + u64 full; + } from; + + union { + struct { + u64 ip:58; + u64 ip_sign_ext:3; + u64 reserved:1; + u64 spec:1; + u64 valid:1; + } split; + u64 full; + } to; +}; + +static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); +} + +static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); +} + +static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + + return val; +} + +static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + + return val; +} + +static __always_inline u64 sign_ext_branch_ip(u64 ip) +{ + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + return (u64)(((s64)ip << shift) >> shift); +} + +static void amd_pmu_lbr_filter(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int br_sel = cpuc->br_sel, offset, type, i, j; + bool compress = false; + bool fused_only = false; + u64 from, to; + + /* If sampling all branches, there is nothing to filter */ + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) + fused_only = true; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + type = branch_type_fused(from, to, 0, &offset); + + /* + * Adjust the branch from address in case of instruction + * fusion where it points to an instruction preceding the + * actual branch + */ + if (offset) { + cpuc->lbr_entries[i].from += offset; + if (fused_only) + continue; + } + + /* If type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; /* mark invalid */ + compress = true; + } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); + } + + if (!compress) + return; + + /* Remove all invalid entries */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { + PERF_BR_SPEC_NA, + PERF_BR_SPEC_WRONG_PATH, + PERF_BR_NON_SPEC_CORRECT_PATH, + PERF_BR_SPEC_CORRECT_PATH, +}; + +void amd_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_branch_entry *br = cpuc->lbr_entries; + struct branch_entry entry; + int out = 0, idx, i; + + if (!cpuc->lbr_users) + return; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + entry.from.full = amd_pmu_lbr_get_from(i); + entry.to.full = amd_pmu_lbr_get_to(i); + + /* + * Check if a branch has been logged; if valid = 0, spec = 0 + * then no branch was recorded + */ + if (!entry.to.split.valid && !entry.to.split.spec) + continue; + + perf_clear_branch_entry_bitfields(br + out); + + br[out].from = sign_ext_branch_ip(entry.from.split.ip); + br[out].to = sign_ext_branch_ip(entry.to.split.ip); + br[out].mispred = entry.from.split.mispredict; + br[out].predicted = !br[out].mispred; + + /* + * Set branch speculation information using the status of + * the valid and spec bits. + * + * When valid = 0, spec = 0, no branch was recorded and the + * entry is discarded as seen above. + * + * When valid = 0, spec = 1, the recorded branch was + * speculative but took the wrong path. + * + * When valid = 1, spec = 0, the recorded branch was + * non-speculative but took the correct path. + * + * When valid = 1, spec = 1, the recorded branch was + * speculative and took the correct path + */ + idx = (entry.to.split.valid << 1) | entry.to.split.spec; + br[out].spec = lbr_spec_map[idx]; + out++; + } + + cpuc->lbr_stack.nr = out; + + /* + * Internal register renaming always ensures that LBR From[0] and + * LBR To[0] always represent the TOS + */ + cpuc->lbr_stack.hw_idx = 0; + + /* Perform further software filtering */ + amd_pmu_lbr_filter(); +} + +static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, + + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, + + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, +}; + +static int amd_pmu_lbr_setup_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + + /* No LBR support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* Ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + + reg->reg = mask; + mask = 0; + + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & BIT_ULL(i))) + continue; + + v = lbr_select_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGNORE) + mask |= v; + } + + /* Filter bits operate in suppress mode */ + reg->config = mask ^ LBR_SELECT_MASK; + + return 0; +} + +int amd_pmu_lbr_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* LBR is not recommended in counting mode */ + if (!is_sampling_event(event)) + return -EINVAL; + + ret = amd_pmu_lbr_setup_filter(event); + if (!ret) + event->attach_state |= PERF_ATTACH_SCHED_CB; + + return ret; +} + +void amd_pmu_lbr_reset(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + if (!x86_pmu.lbr_nr) + return; + + /* Reset all branch records individually */ + for (i = 0; i < x86_pmu.lbr_nr; i++) { + amd_pmu_lbr_set_from(i, 0); + amd_pmu_lbr_set_to(i, 0); + } + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; + wrmsrl(MSR_AMD64_LBR_SELECT, 0); +} + +void amd_pmu_lbr_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + + if (!x86_pmu.lbr_nr) + return; + + if (has_branch_stack(event)) { + cpuc->lbr_select = 1; + cpuc->lbr_sel->config = reg->config; + cpuc->br_sel = reg->reg; + } + + perf_sched_cb_inc(event->ctx->pmu); + + if (!cpuc->lbr_users++ && !event->total_time_running) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + if (has_branch_stack(event)) + cpuc->lbr_select = 0; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * A context switch can flip the address space and LBR entries are + * not tagged with an identifier. Hence, branches cannot be resolved + * from the old address space and the LBR records should be wiped. + */ + if (cpuc->lbr_users && sched_in) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 lbr_select, dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + /* Set hardware branch filter */ + if (cpuc->lbr_select) { + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + } + + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); +} + +void amd_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); +} + +__init int amd_pmu_lbr_init(void) +{ + union cpuid_0x80000022_ebx ebx; + + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) + return -EOPNOTSUPP; + + /* Set number of entries */ + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz; + + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + + return 0; +} diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c new file mode 100644 index 000000000..37d5b3805 --- /dev/null +++ b/arch/x86/events/amd/power.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Performance events - AMD Processor Power Reporting Mechanism + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Huang Rui <ray.huang@amd.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "../perf_event.h" + +/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ +#define AMD_POWER_EVENT_MASK 0xFFULL + +/* + * Accumulated power status counters. + */ +#define AMD_POWER_EVENTSEL_PKG 1 + +/* + * The ratio of compute unit power accumulator sample period to the + * PTSC period. + */ +static unsigned int cpu_pwr_sample_ratio; + +/* Maximum accumulated power of a compute unit. */ +static u64 max_cu_acc_power; + +static struct pmu pmu_class; + +/* + * Accumulated power represents the sum of each compute unit's (CU) power + * consumption. On any core of each CU we read the total accumulated power from + * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores + * which are picked to measure the power for the CUs they belong to. + */ +static cpumask_t cpu_mask; + +static void event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc; + u64 delta, tdelta; + + prev_pwr_acc = hwc->pwr_acc; + prev_ptsc = hwc->ptsc; + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc); + rdmsrl(MSR_F15H_PTSC, new_ptsc); + + /* + * Calculate the CU power consumption over a time period, the unit of + * final value (delta) is micro-Watts. Then add it to the event count. + */ + if (new_pwr_acc < prev_pwr_acc) { + delta = max_cu_acc_power + new_pwr_acc; + delta -= prev_pwr_acc; + } else + delta = new_pwr_acc - prev_pwr_acc; + + delta *= cpu_pwr_sample_ratio * 1000; + tdelta = new_ptsc - prev_ptsc; + + do_div(delta, tdelta); + local64_add(delta, &event->count); +} + +static void __pmu_event_start(struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + rdmsrl(MSR_F15H_PTSC, event->hw.ptsc); + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc); +} + +static void pmu_event_start(struct perf_event *event, int mode) +{ + __pmu_event_start(event); +} + +static void pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + /* Mark event as deactivated and stopped. */ + if (!(hwc->state & PERF_HES_STOPPED)) + hwc->state |= PERF_HES_STOPPED; + + /* Check if software counter update is necessary. */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of an event + * that we are disabling: + */ + event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int pmu_event_add(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __pmu_event_start(event); + + return 0; +} + +static void pmu_event_del(struct perf_event *event, int flags) +{ + pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK; + + /* Only look at AMD power events. */ + if (event->attr.type != pmu_class.type) + return -ENOENT; + + /* Unsupported modes and filters. */ + if (event->attr.sample_period) + return -EINVAL; + + if (cfg != AMD_POWER_EVENTSEL_PKG) + return -EINVAL; + + return 0; +} + +static void pmu_event_read(struct perf_event *event) +{ + event_update(event); +} + +static ssize_t +get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL); + +static struct attribute *pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group pmu_attr_group = { + .attrs = pmu_attrs, +}; + +/* + * Currently it only supports to report the power of each + * processor/package. + */ +EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01"); + +EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts"); + +/* Convert the count from micro-Watts to milli-Watts. */ +EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3"); + +static struct attribute *events_attr[] = { + EVENT_PTR(power_pkg), + EVENT_PTR(power_pkg_unit), + EVENT_PTR(power_pkg_scale), + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = formats_attr, +}; + +static const struct attribute_group *attr_groups[] = { + &pmu_attr_group, + &pmu_format_group, + &pmu_events_group, + NULL, +}; + +static struct pmu pmu_class = { + .attr_groups = attr_groups, + /* system-wide only */ + .task_ctx_nr = perf_invalid_context, + .event_init = pmu_event_init, + .add = pmu_event_add, + .del = pmu_event_del, + .start = pmu_event_start, + .stop = pmu_event_stop, + .read = pmu_event_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .module = THIS_MODULE, +}; + +static int power_cpu_exit(unsigned int cpu) +{ + int target; + + if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask)) + return 0; + + /* + * Find a new CPU on the same compute unit, if was set in cpumask + * and still some CPUs on compute unit. Then migrate event and + * context to new CPU. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target < nr_cpumask_bits) { + cpumask_set_cpu(target, &cpu_mask); + perf_pmu_migrate_context(&pmu_class, cpu, target); + } + return 0; +} + +static int power_cpu_init(unsigned int cpu) +{ + int target; + + /* + * 1) If any CPU is set at cpu_mask in the same compute unit, do + * nothing. + * 2) If no CPU is set at cpu_mask in the same compute unit, + * set current ONLINE CPU. + * + * Note: if there is a CPU aside of the new one already in the + * sibling mask, then it is also in cpu_mask. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target >= nr_cpumask_bits) + cpumask_set_cpu(cpu, &cpu_mask); + return 0; +} + +static const struct x86_cpu_id cpu_match[] = { + X86_MATCH_VENDOR_FAM(AMD, 0x15, NULL), + {}, +}; + +static int __init amd_power_pmu_init(void) +{ + int ret; + + if (!x86_match_cpu(cpu_match)) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) + return -ENODEV; + + cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); + + if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) { + pr_err("Failed to read max compute unit power accumulator MSR\n"); + return -ENODEV; + } + + + cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, + "perf/x86/amd/power:online", + power_cpu_init, power_cpu_exit); + + ret = perf_pmu_register(&pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_warn("AMD Power PMU registration failed\n"); + return ret; + } + + pr_info("AMD Power PMU detected\n"); + return ret; +} +module_init(amd_power_pmu_init); + +static void __exit amd_power_pmu_exit(void) +{ + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE); + perf_pmu_unregister(&pmu_class); +} +module_exit(amd_power_pmu_exit); + +MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>"); +MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c new file mode 100644 index 000000000..83f15fe41 --- /dev/null +++ b/arch/x86/events/amd/uncore.c @@ -0,0 +1,789 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin <jacob.shin@amd.com> + */ + +#include <linux/perf_event.h> +#include <linux/percpu.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpufeature.h> +#include <linux/smp.h> + +#include <asm/perf_event.h> +#include <asm/msr.h> + +#define NUM_COUNTERS_NB 4 +#define NUM_COUNTERS_L2 4 +#define NUM_COUNTERS_L3 6 + +#define RDPMC_BASE_NB 6 +#define RDPMC_BASE_LLC 10 + +#define COUNTER_SHIFT 16 + +#undef pr_fmt +#define pr_fmt(fmt) "amd_uncore: " fmt + +static int pmu_version; +static int num_counters_llc; +static int num_counters_nb; +static bool l3_mask; + +static HLIST_HEAD(uncore_unused_list); + +struct amd_uncore { + int id; + int refcnt; + int cpu; + int num_counters; + int rdpmc_base; + u32 msr_base; + cpumask_t *active_mask; + struct pmu *pmu; + struct perf_event **events; + struct hlist_node node; +}; + +static struct amd_uncore * __percpu *amd_uncore_nb; +static struct amd_uncore * __percpu *amd_uncore_llc; + +static struct pmu amd_nb_pmu; +static struct pmu amd_llc_pmu; + +static cpumask_t amd_nb_active_mask; +static cpumask_t amd_llc_active_mask; + +static bool is_nb_event(struct perf_event *event) +{ + return event->pmu->type == amd_nb_pmu.type; +} + +static bool is_llc_event(struct perf_event *event) +{ + return event->pmu->type == amd_llc_pmu.type; +} + +static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +{ + if (is_nb_event(event) && amd_uncore_nb) + return *per_cpu_ptr(amd_uncore_nb, event->cpu); + else if (is_llc_event(event) && amd_uncore_llc) + return *per_cpu_ptr(amd_uncore_llc, event->cpu); + + return NULL; +} + +static void amd_uncore_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new; + s64 delta; + + /* + * since we do not enable counter overflow interrupts, + * we do not have to worry about prev_count changing on us + */ + + prev = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); + delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); +} + +static void amd_uncore_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + perf_event_update_userpage(event); +} + +static void amd_uncore_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); + hwc->state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + amd_uncore_read(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int amd_uncore_add(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + /* are we already assigned? */ + if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + goto out; + + for (i = 0; i < uncore->num_counters; i++) { + if (uncore->events[i] == event) { + hwc->idx = i; + goto out; + } + } + + /* if not, take the first available counter */ + hwc->idx = -1; + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + hwc->idx = i; + break; + } + } + +out: + if (hwc->idx == -1) + return -EBUSY; + + hwc->config_base = uncore->msr_base + (2 * hwc->idx); + hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* + * The first four DF counters are accessible via RDPMC index 6 to 9 + * followed by the L3 counters from index 10 to 15. For processors + * with more than four DF counters, the DF RDPMC assignments become + * discontiguous as the additional counters are accessible starting + * from index 16. + */ + if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB) + hwc->event_base_rdpmc += NUM_COUNTERS_L3; + + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void amd_uncore_del(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + amd_uncore_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], event, NULL) == event) + break; + } + + hwc->idx = -1; +} + +/* + * Return a full thread and slice mask unless user + * has provided them + */ +static u64 l3_thread_slice_mask(u64 config) +{ + if (boot_cpu_data.x86 <= 0x18) + return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) | + ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK); + + /* + * If the user doesn't specify a threadmask, they're not trying to + * count core 0, so we enable all cores & threads. + * We'll also assume that they want to count slice 0 if they specify + * a threadmask and leave sliceid and enallslices unpopulated. + */ + if (!(config & AMD64_L3_F19H_THREAD_MASK)) + return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_EN_ALL_CORES; + + return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK | + AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_COREID_MASK); +} + +static int amd_uncore_event_init(struct perf_event *event) +{ + struct amd_uncore *uncore; + struct hw_perf_event *hwc = &event->hw; + u64 event_mask = AMD64_RAW_EVENT_MASK_NB; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (pmu_version >= 2 && is_nb_event(event)) + event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB; + + /* + * NB and Last level cache counters (MSRs) are shared across all cores + * that share the same NB / Last level cache. On family 16h and below, + * Interrupts can be directed to a single target core, however, event + * counts generated by processes running on other cores cannot be masked + * out. So we do not support sampling and per-thread events via + * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: + */ + hwc->config = event->attr.config & event_mask; + hwc->idx = -1; + + if (event->cpu < 0) + return -EINVAL; + + /* + * SliceMask and ThreadMask need to be set for certain L3 events. + * For other events, the two fields do not affect the count. + */ + if (l3_mask && is_llc_event(event)) + hwc->config |= l3_thread_slice_mask(event->attr.config); + + uncore = event_to_amd_uncore(event); + if (!uncore) + return -ENODEV; + + /* + * since request can come in to any of the shared cores, we will remap + * to a single common cpu. + */ + event->cpu = uncore->cpu; + + return 0; +} + +static umode_t +amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ? + attr->mode : 0; +} + +static umode_t +amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0; +} + +static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + cpumask_t *active_mask; + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu->type == amd_nb_pmu.type) + active_mask = &amd_nb_active_mask; + else if (pmu->type == amd_llc_pmu.type) + active_mask = &amd_llc_active_mask; + else + return 0; + + return cpumap_print_to_pagebuf(true, buf, active_mask); +} +static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); + +static struct attribute *amd_uncore_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_attr_group = { + .attrs = amd_uncore_attrs, +}; + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct device_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + +DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); +DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ +DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */ +DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ +DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */ +DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ + +/* Common DF and NB attributes */ +static struct attribute *amd_uncore_df_format_attr[] = { + &format_attr_event12.attr, /* event */ + &format_attr_umask8.attr, /* umask */ + NULL, +}; + +/* Common L2 and L3 attributes */ +static struct attribute *amd_uncore_l3_format_attr[] = { + &format_attr_event12.attr, /* event */ + &format_attr_umask8.attr, /* umask */ + NULL, /* threadmask */ + NULL, +}; + +/* F17h unique L3 attributes */ +static struct attribute *amd_f17h_uncore_l3_format_attr[] = { + &format_attr_slicemask.attr, /* slicemask */ + NULL, +}; + +/* F19h unique L3 attributes */ +static struct attribute *amd_f19h_uncore_l3_format_attr[] = { + &format_attr_coreid.attr, /* coreid */ + &format_attr_enallslices.attr, /* enallslices */ + &format_attr_enallcores.attr, /* enallcores */ + &format_attr_sliceid.attr, /* sliceid */ + NULL, +}; + +static struct attribute_group amd_uncore_df_format_group = { + .name = "format", + .attrs = amd_uncore_df_format_attr, +}; + +static struct attribute_group amd_uncore_l3_format_group = { + .name = "format", + .attrs = amd_uncore_l3_format_attr, +}; + +static struct attribute_group amd_f17h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f17h_uncore_l3_format_attr, + .is_visible = amd_f17h_uncore_is_visible, +}; + +static struct attribute_group amd_f19h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f19h_uncore_l3_format_attr, + .is_visible = amd_f19h_uncore_is_visible, +}; + +static const struct attribute_group *amd_uncore_df_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_df_format_group, + NULL, +}; + +static const struct attribute_group *amd_uncore_l3_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_l3_format_group, + NULL, +}; + +static const struct attribute_group *amd_uncore_l3_attr_update[] = { + &amd_f17h_uncore_l3_format_group, + &amd_f19h_uncore_l3_format_group, + NULL, +}; + +static struct pmu amd_nb_pmu = { + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_df_attr_groups, + .name = "amd_nb", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, +}; + +static struct pmu amd_llc_pmu = { + .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_l3_attr_groups, + .attr_update = amd_uncore_l3_attr_update, + .name = "amd_l2", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, +}; + +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) +{ + return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, + cpu_to_node(cpu)); +} + +static inline struct perf_event ** +amd_uncore_events_alloc(unsigned int num, unsigned int cpu) +{ + return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL, + cpu_to_node(cpu)); +} + +static int amd_uncore_cpu_up_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL; + + if (amd_uncore_nb) { + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; + uncore_nb = amd_uncore_alloc(cpu); + if (!uncore_nb) + goto fail; + uncore_nb->cpu = cpu; + uncore_nb->num_counters = num_counters_nb; + uncore_nb->rdpmc_base = RDPMC_BASE_NB; + uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; + uncore_nb->active_mask = &amd_nb_active_mask; + uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu); + if (!uncore_nb->events) + goto fail; + uncore_nb->id = -1; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; + } + + if (amd_uncore_llc) { + *per_cpu_ptr(amd_uncore_llc, cpu) = NULL; + uncore_llc = amd_uncore_alloc(cpu); + if (!uncore_llc) + goto fail; + uncore_llc->cpu = cpu; + uncore_llc->num_counters = num_counters_llc; + uncore_llc->rdpmc_base = RDPMC_BASE_LLC; + uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_llc->active_mask = &amd_llc_active_mask; + uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu); + if (!uncore_llc->events) + goto fail; + uncore_llc->id = -1; + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; + } + + return 0; + +fail: + if (uncore_nb) { + kfree(uncore_nb->events); + kfree(uncore_nb); + } + + if (uncore_llc) { + kfree(uncore_llc->events); + kfree(uncore_llc); + } + + return -ENOMEM; +} + +static struct amd_uncore * +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) +{ + unsigned int cpu; + struct amd_uncore *that; + + for_each_online_cpu(cpu) { + that = *per_cpu_ptr(uncores, cpu); + + if (!that) + continue; + + if (this == that) + continue; + + if (this->id == that->id) { + hlist_add_head(&this->node, &uncore_unused_list); + this = that; + break; + } + } + + this->refcnt++; + return this; +} + +static int amd_uncore_cpu_starting(unsigned int cpu) +{ + unsigned int eax, ebx, ecx, edx; + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = *per_cpu_ptr(amd_uncore_nb, cpu); + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + uncore->id = ecx & 0xff; + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb); + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_llc) { + uncore = *per_cpu_ptr(amd_uncore_llc, cpu); + uncore->id = get_llc_id(cpu); + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); + *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; + } + + return 0; +} + +static void uncore_clean_online(void) +{ + struct amd_uncore *uncore; + struct hlist_node *n; + + hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { + hlist_del(&uncore->node); + kfree(uncore->events); + kfree(uncore); + } +} + +static void uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + uncore_clean_online(); + + if (cpu == uncore->cpu) + cpumask_set_cpu(cpu, uncore->active_mask); +} + +static int amd_uncore_cpu_online(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_online(cpu, amd_uncore_nb); + + if (amd_uncore_llc) + uncore_online(cpu, amd_uncore_llc); + + return 0; +} + +static void uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + unsigned int i; + struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); + + if (this->cpu != cpu) + return; + + /* this cpu is going down, migrate to a shared sibling if possible */ + for_each_online_cpu(i) { + struct amd_uncore *that = *per_cpu_ptr(uncores, i); + + if (cpu == i) + continue; + + if (this == that) { + perf_pmu_migrate_context(this->pmu, cpu, i); + cpumask_clear_cpu(cpu, that->active_mask); + cpumask_set_cpu(i, that->active_mask); + that->cpu = i; + break; + } + } +} + +static int amd_uncore_cpu_down_prepare(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_down_prepare(cpu, amd_uncore_nb); + + if (amd_uncore_llc) + uncore_down_prepare(cpu, amd_uncore_llc); + + return 0; +} + +static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + if (cpu == uncore->cpu) + cpumask_clear_cpu(cpu, uncore->active_mask); + + if (!--uncore->refcnt) { + kfree(uncore->events); + kfree(uncore); + } + + *per_cpu_ptr(uncores, cpu) = NULL; +} + +static int amd_uncore_cpu_dead(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_dead(cpu, amd_uncore_nb); + + if (amd_uncore_llc) + uncore_dead(cpu, amd_uncore_llc); + + return 0; +} + +static int __init amd_uncore_init(void) +{ + struct attribute **df_attr = amd_uncore_df_format_attr; + struct attribute **l3_attr = amd_uncore_l3_format_attr; + union cpuid_0x80000022_ebx ebx; + int ret = -ENODEV; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + return -ENODEV; + + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) + pmu_version = 2; + + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + if (boot_cpu_data.x86 >= 0x17) { + /* + * For F17h and above, the Northbridge counters are + * repurposed as Data Fabric counters. Also, L3 + * counters are supported too. The PMUs are exported + * based on family as either L2 or L3 and NB or DF. + */ + num_counters_llc = NUM_COUNTERS_L3; + amd_nb_pmu.name = "amd_df"; + amd_llc_pmu.name = "amd_l3"; + l3_mask = true; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + if (pmu_version >= 2) { + *df_attr++ = &format_attr_event14v2.attr; + *df_attr++ = &format_attr_umask12.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *df_attr = &format_attr_event14.attr; + } + + amd_uncore_nb = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_nb) { + ret = -ENOMEM; + goto fail_nb; + } + ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (ret) + goto fail_nb; + + if (pmu_version >= 2) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + num_counters_nb = ebx.split.num_df_pmc; + } + + pr_info("%d %s %s counters detected\n", num_counters_nb, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_nb_pmu.name); + + ret = 0; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + if (boot_cpu_data.x86 >= 0x19) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask8.attr; + *l3_attr++ = &format_attr_threadmask2.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask8.attr; + *l3_attr++ = &format_attr_threadmask8.attr; + } + + amd_uncore_llc = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_llc) { + ret = -ENOMEM; + goto fail_llc; + } + ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1); + if (ret) + goto fail_llc; + + pr_info("%d %s %s counters detected\n", num_counters_llc, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_llc_pmu.name); + ret = 0; + } + + /* + * Install callbacks. Core will call them for each online cpu. + */ + if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, + "perf/x86/amd/uncore:prepare", + amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) + goto fail_llc; + + if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, + "perf/x86/amd/uncore:starting", + amd_uncore_cpu_starting, NULL)) + goto fail_prep; + if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + "perf/x86/amd/uncore:online", + amd_uncore_cpu_online, + amd_uncore_cpu_down_prepare)) + goto fail_start; + return 0; + +fail_start: + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); +fail_prep: + cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); +fail_llc: + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) + perf_pmu_unregister(&amd_nb_pmu); + free_percpu(amd_uncore_llc); +fail_nb: + free_percpu(amd_uncore_nb); + + return ret; +} + +static void __exit amd_uncore_exit(void) +{ + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE); + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); + cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); + + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + perf_pmu_unregister(&amd_llc_pmu); + free_percpu(amd_uncore_llc); + amd_uncore_llc = NULL; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + perf_pmu_unregister(&amd_nb_pmu); + free_percpu(amd_uncore_nb); + amd_uncore_nb = NULL; + } +} + +module_init(amd_uncore_init); +module_exit(amd_uncore_exit); + +MODULE_DESCRIPTION("AMD Uncore Driver"); +MODULE_LICENSE("GPL v2"); |