diff options
Diffstat (limited to 'arch/riscv/kvm/vcpu_pmu.c')
-rw-r--r-- | arch/riscv/kvm/vcpu_pmu.c | 633 |
1 files changed, 633 insertions, 0 deletions
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c new file mode 100644 index 0000000000..86391a5061 --- /dev/null +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -0,0 +1,633 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Rivos Inc + * + * Authors: + * Atish Patra <atishp@rivosinc.com> + */ + +#define pr_fmt(fmt) "riscv-kvm-pmu: " fmt +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/perf/riscv_pmu.h> +#include <asm/csr.h> +#include <asm/kvm_vcpu_sbi.h> +#include <asm/kvm_vcpu_pmu.h> +#include <linux/bitops.h> + +#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs) +#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16) +#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK) + +static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = { + [SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES, + [SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS, + [SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES, + [SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES, + [SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + [SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES, + [SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES, + [SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, + [SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND, + [SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES, +}; + +static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc) +{ + u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0); + u64 sample_period; + + if (!pmc->counter_val) + sample_period = counter_val_mask + 1; + else + sample_period = (-pmc->counter_val) & counter_val_mask; + + return sample_period; +} + +static u32 kvm_pmu_get_perf_event_type(unsigned long eidx) +{ + enum sbi_pmu_event_type etype = get_event_type(eidx); + u32 type = PERF_TYPE_MAX; + + switch (etype) { + case SBI_PMU_EVENT_TYPE_HW: + type = PERF_TYPE_HARDWARE; + break; + case SBI_PMU_EVENT_TYPE_CACHE: + type = PERF_TYPE_HW_CACHE; + break; + case SBI_PMU_EVENT_TYPE_RAW: + case SBI_PMU_EVENT_TYPE_FW: + type = PERF_TYPE_RAW; + break; + default: + break; + } + + return type; +} + +static bool kvm_pmu_is_fw_event(unsigned long eidx) +{ + return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW; +} + +static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_disable(pmc->perf_event); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code) +{ + return hw_event_perf_map[sbi_event_code]; +} + +static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code) +{ + u64 config = U64_MAX; + unsigned int cache_type, cache_op, cache_result; + + /* All the cache event masks lie within 0xFF. No separate masking is necessary */ + cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> + SBI_PMU_EVENT_CACHE_ID_SHIFT; + cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> + SBI_PMU_EVENT_CACHE_OP_SHIFT; + cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK; + + if (cache_type >= PERF_COUNT_HW_CACHE_MAX || + cache_op >= PERF_COUNT_HW_CACHE_OP_MAX || + cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return config; + + config = cache_type | (cache_op << 8) | (cache_result << 16); + + return config; +} + +static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data) +{ + enum sbi_pmu_event_type etype = get_event_type(eidx); + u32 ecode = get_event_code(eidx); + u64 config = U64_MAX; + + switch (etype) { + case SBI_PMU_EVENT_TYPE_HW: + if (ecode < SBI_PMU_HW_GENERAL_MAX) + config = kvm_pmu_get_perf_event_hw_config(ecode); + break; + case SBI_PMU_EVENT_TYPE_CACHE: + config = kvm_pmu_get_perf_event_cache_config(ecode); + break; + case SBI_PMU_EVENT_TYPE_RAW: + config = evt_data & RISCV_PMU_RAW_EVENT_MASK; + break; + case SBI_PMU_EVENT_TYPE_FW: + if (ecode < SBI_PMU_FW_MAX) + config = (1ULL << 63) | ecode; + break; + default: + break; + } + + return config; +} + +static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx) +{ + u32 etype = kvm_pmu_get_perf_event_type(eidx); + u32 ecode = get_event_code(eidx); + + if (etype != SBI_PMU_EVENT_TYPE_HW) + return -EINVAL; + + if (ecode == SBI_PMU_HW_CPU_CYCLES) + return 0; + else if (ecode == SBI_PMU_HW_INSTRUCTIONS) + return 2; + else + return -EINVAL; +} + +static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx, + unsigned long cbase, unsigned long cmask) +{ + int ctr_idx = -1; + int i, pmc_idx; + int min, max; + + if (kvm_pmu_is_fw_event(eidx)) { + /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */ + min = kvpmu->num_hw_ctrs; + max = min + kvpmu->num_fw_ctrs; + } else { + /* First 3 counters are reserved for fixed counters */ + min = 3; + max = kvpmu->num_hw_ctrs; + } + + for_each_set_bit(i, &cmask, BITS_PER_LONG) { + pmc_idx = i + cbase; + if ((pmc_idx >= min && pmc_idx < max) && + !test_bit(pmc_idx, kvpmu->pmc_in_use)) { + ctr_idx = pmc_idx; + break; + } + } + + return ctr_idx; +} + +static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx, + unsigned long cbase, unsigned long cmask) +{ + int ret; + + /* Fixed counters need to be have fixed mapping as they have different width */ + ret = kvm_pmu_get_fixed_pmc_index(eidx); + if (ret >= 0) + return ret; + + return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask); +} + +static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + unsigned long *out_val) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u64 enabled, running; + int fevent_code; + + pmc = &kvpmu->pmc[cidx]; + + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + pmc->counter_val = kvpmu->fw_event[fevent_code].value; + } else if (pmc->perf_event) { + pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running); + } else { + return -EINVAL; + } + *out_val = pmc->counter_val; + + return 0; +} + +static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base, + unsigned long ctr_mask) +{ + /* Make sure the we have a valid counter mask requested from the caller */ + if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu))) + return -EINVAL; + + return 0; +} + +static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr, + unsigned long flags, unsigned long eidx, unsigned long evtdata) +{ + struct perf_event *event; + + kvm_pmu_release_perf_event(pmc); + attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata); + if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) { + //TODO: Do we really want to clear the value in hardware counter + pmc->counter_val = 0; + } + + /* + * Set the default sample_period for now. The guest specified value + * will be updated in the start call. + */ + attr->sample_period = kvm_pmu_get_sample_period(pmc); + + event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc); + if (IS_ERR(event)) { + pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event)); + return PTR_ERR(event); + } + + pmc->perf_event = event; + if (flags & SBI_PMU_CFG_FLAG_AUTO_START) + perf_event_enable(pmc->perf_event); + + return 0; +} + +int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_fw_event *fevent; + + if (!kvpmu || fid >= SBI_PMU_FW_MAX) + return -EINVAL; + + fevent = &kvpmu->fw_event[fid]; + if (fevent->started) + fevent->value++; + + return 0; +} + +int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC; + + if (!kvpmu || !kvpmu->init_done) { + /* + * In absence of sscofpmf in the platform, the guest OS may use + * the legacy PMU driver to read cycle/instret. In that case, + * just return 0 to avoid any illegal trap. However, any other + * hpmcounter access should result in illegal trap as they must + * be access through SBI PMU only. + */ + if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) { + *val = 0; + return ret; + } else { + return KVM_INSN_ILLEGAL_TRAP; + } + } + + /* The counter CSR are read only. Thus, any write should result in illegal traps */ + if (wr_mask) + return KVM_INSN_ILLEGAL_TRAP; + + cidx = csr_num - CSR_CYCLE; + + if (pmu_ctr_read(vcpu, cidx, val) < 0) + return KVM_INSN_ILLEGAL_TRAP; + + return ret; +} + +int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + retdata->out_val = kvm_pmu_num_counters(kvpmu); + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + + if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) { + retdata->err_val = SBI_ERR_INVALID_PARAM; + return 0; + } + + retdata->out_val = kvpmu->pmc[cidx].cinfo.value; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, u64 ival, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int i, pmc_index, sbiret = 0; + struct kvm_pmc *pmc; + int fevent_code; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Start the counters that have been configured and requested by the guest */ + for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { + pmc_index = i + ctr_base; + if (!test_bit(pmc_index, kvpmu->pmc_in_use)) + continue; + pmc = &kvpmu->pmc[pmc_index]; + if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) + pmc->counter_val = ival; + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + if (fevent_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Check if the counter was already started for some reason */ + if (kvpmu->fw_event[fevent_code].started) { + sbiret = SBI_ERR_ALREADY_STARTED; + continue; + } + + kvpmu->fw_event[fevent_code].started = true; + kvpmu->fw_event[fevent_code].value = pmc->counter_val; + } else if (pmc->perf_event) { + if (unlikely(pmc->started)) { + sbiret = SBI_ERR_ALREADY_STARTED; + continue; + } + perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc)); + perf_event_enable(pmc->perf_event); + pmc->started = true; + } else { + sbiret = SBI_ERR_INVALID_PARAM; + } + } + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + struct kvm_vcpu_sbi_return *retdata) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + int i, pmc_index, sbiret = 0; + u64 enabled, running; + struct kvm_pmc *pmc; + int fevent_code; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + /* Stop the counters that have been configured and requested by the guest */ + for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) { + pmc_index = i + ctr_base; + if (!test_bit(pmc_index, kvpmu->pmc_in_use)) + continue; + pmc = &kvpmu->pmc[pmc_index]; + if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + fevent_code = get_event_code(pmc->event_idx); + if (fevent_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + if (!kvpmu->fw_event[fevent_code].started) + sbiret = SBI_ERR_ALREADY_STOPPED; + + kvpmu->fw_event[fevent_code].started = false; + } else if (pmc->perf_event) { + if (pmc->started) { + /* Stop counting the counter */ + perf_event_disable(pmc->perf_event); + pmc->started = false; + } else { + sbiret = SBI_ERR_ALREADY_STOPPED; + } + + if (flags & SBI_PMU_STOP_FLAG_RESET) { + /* Relase the counter if this is a reset request */ + pmc->counter_val += perf_event_read_value(pmc->perf_event, + &enabled, &running); + kvm_pmu_release_perf_event(pmc); + } + } else { + sbiret = SBI_ERR_INVALID_PARAM; + } + if (flags & SBI_PMU_STOP_FLAG_RESET) { + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + clear_bit(pmc_index, kvpmu->pmc_in_use); + } + } + +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base, + unsigned long ctr_mask, unsigned long flags, + unsigned long eidx, u64 evtdata, + struct kvm_vcpu_sbi_return *retdata) +{ + int ctr_idx, ret, sbiret = 0; + bool is_fevent; + unsigned long event_code; + u32 etype = kvm_pmu_get_perf_event_type(eidx); + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + struct perf_event_attr attr = { + .type = etype, + .size = sizeof(struct perf_event_attr), + .pinned = true, + /* + * It should never reach here if the platform doesn't support the sscofpmf + * extension as mode filtering won't work without it. + */ + .exclude_host = true, + .exclude_hv = true, + .exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH), + .exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH), + .config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS, + }; + + if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) { + sbiret = SBI_ERR_INVALID_PARAM; + goto out; + } + + event_code = get_event_code(eidx); + is_fevent = kvm_pmu_is_fw_event(eidx); + if (is_fevent && event_code >= SBI_PMU_FW_MAX) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + + /* + * SKIP_MATCH flag indicates the caller is aware of the assigned counter + * for this event. Just do a sanity check if it already marked used. + */ + if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) { + if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) { + sbiret = SBI_ERR_FAILURE; + goto out; + } + ctr_idx = ctr_base + __ffs(ctr_mask); + } else { + ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask); + if (ctr_idx < 0) { + sbiret = SBI_ERR_NOT_SUPPORTED; + goto out; + } + } + + pmc = &kvpmu->pmc[ctr_idx]; + pmc->idx = ctr_idx; + + if (is_fevent) { + if (flags & SBI_PMU_CFG_FLAG_AUTO_START) + kvpmu->fw_event[event_code].started = true; + } else { + ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata); + if (ret) + return ret; + } + + set_bit(ctr_idx, kvpmu->pmc_in_use); + pmc->event_idx = eidx; + retdata->out_val = ctr_idx; +out: + retdata->err_val = sbiret; + + return 0; +} + +int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, + struct kvm_vcpu_sbi_return *retdata) +{ + int ret; + + ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val); + if (ret == -EINVAL) + retdata->err_val = SBI_ERR_INVALID_PARAM; + + return 0; +} + +void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) +{ + int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0; + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + /* + * PMU functionality should be only available to guests if privilege mode + * filtering is available in the host. Otherwise, guest will always count + * events while the execution is in hypervisor mode. + */ + if (!riscv_isa_extension_available(NULL, SSCOFPMF)) + return; + + ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs); + if (ret < 0 || !hpm_width || !num_hw_ctrs) + return; + + /* + * Increase the number of hardware counters to offset the time counter. + */ + kvpmu->num_hw_ctrs = num_hw_ctrs + 1; + kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX; + memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); + + if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) { + pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA"); + kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS; + } + + /* + * There is no correlation between the logical hardware counter and virtual counters. + * However, we need to encode a hpmcounter CSR in the counter info field so that + * KVM can trap n emulate the read. This works well in the migration use case as + * KVM doesn't care if the actual hpmcounter is available in the hardware or not. + */ + for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) { + /* TIME CSR shouldn't be read from perf interface */ + if (i == 1) + continue; + pmc = &kvpmu->pmc[i]; + pmc->idx = i; + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + if (i < kvpmu->num_hw_ctrs) { + pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW; + if (i < 3) + /* CY, IR counters */ + pmc->cinfo.width = 63; + else + pmc->cinfo.width = hpm_width; + /* + * The CSR number doesn't have any relation with the logical + * hardware counters. The CSR numbers are encoded sequentially + * to avoid maintaining a map between the virtual counter + * and CSR number. + */ + pmc->cinfo.csr = CSR_CYCLE + i; + } else { + pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW; + pmc->cinfo.width = BITS_PER_LONG - 1; + } + } + + kvpmu->init_done = true; +} + +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + if (!kvpmu) + return; + + for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) { + pmc = &kvpmu->pmc[i]; + pmc->counter_val = 0; + kvm_pmu_release_perf_event(pmc); + pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID; + } + bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS); + memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event)); +} + +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu) +{ + kvm_riscv_vcpu_pmu_deinit(vcpu); +} |