diff options
Diffstat (limited to 'src/spdk/dpdk/lib/librte_power')
17 files changed, 4071 insertions, 0 deletions
diff --git a/src/spdk/dpdk/lib/librte_power/Makefile b/src/spdk/dpdk/lib/librte_power/Makefile new file mode 100644 index 000000000..087d643ee --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_power.a + +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing +LDLIBS += -lrte_eal -lrte_timer + +EXPORT_MAP := rte_power_version.map + +# all source are stored in SRCS-y +SRCS-$(CONFIG_RTE_LIBRTE_POWER) := rte_power.c power_acpi_cpufreq.c +SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_kvm_vm.c guest_channel.c +SRCS-$(CONFIG_RTE_LIBRTE_POWER) += rte_power_empty_poll.c +SRCS-$(CONFIG_RTE_LIBRTE_POWER) += power_pstate_cpufreq.c + +# install this header file +SYMLINK-$(CONFIG_RTE_LIBRTE_POWER)-include := rte_power.h rte_power_empty_poll.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/spdk/dpdk/lib/librte_power/channel_commands.h b/src/spdk/dpdk/lib/librte_power/channel_commands.h new file mode 100644 index 000000000..adc8e5ca2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/channel_commands.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef CHANNEL_COMMANDS_H_ +#define CHANNEL_COMMANDS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <stdbool.h> + +/* --- Incoming messages --- */ + +/* Valid Commands */ +#define CPU_POWER 1 +#define CPU_POWER_CONNECT 2 +#define PKT_POLICY 3 +#define PKT_POLICY_REMOVE 4 + +/* CPU Power Command Scaling */ +#define CPU_POWER_SCALE_UP 1 +#define CPU_POWER_SCALE_DOWN 2 +#define CPU_POWER_SCALE_MAX 3 +#define CPU_POWER_SCALE_MIN 4 +#define CPU_POWER_ENABLE_TURBO 5 +#define CPU_POWER_DISABLE_TURBO 6 + +/* CPU Power Queries */ +#define CPU_POWER_QUERY_FREQ_LIST 7 +#define CPU_POWER_QUERY_FREQ 8 +#define CPU_POWER_QUERY_CAPS_LIST 9 +#define 
CPU_POWER_QUERY_CAPS 10 + +/* --- Outgoing messages --- */ + +/* Generic Power Command Response */ +#define CPU_POWER_CMD_ACK 1 +#define CPU_POWER_CMD_NACK 2 + +/* CPU Power Query Responses */ +#define CPU_POWER_FREQ_LIST 3 +#define CPU_POWER_CAPS_LIST 4 + +#define HOURS 24 + +#define MAX_VFS 10 +#define VM_MAX_NAME_SZ 32 + +#define MAX_VCPU_PER_VM 8 + +struct t_boost_status { + bool tbEnabled; +}; + +struct timer_profile { + int busy_hours[HOURS]; + int quiet_hours[HOURS]; + int hours_to_use_traffic_profile[HOURS]; +}; + +enum workload {HIGH, MEDIUM, LOW}; +enum policy_to_use { + TRAFFIC, + TIME, + WORKLOAD, + BRANCH_RATIO +}; + +struct traffic { + uint32_t min_packet_thresh; + uint32_t avg_max_packet_thresh; + uint32_t max_max_packet_thresh; +}; + +#define CORE_TYPE_VIRTUAL 0 +#define CORE_TYPE_PHYSICAL 1 + +struct channel_packet { + uint64_t resource_id; /**< core_num, device */ + uint32_t unit; /**< scale down/up/min/max */ + uint32_t command; /**< Power, IO, etc */ + char vm_name[VM_MAX_NAME_SZ]; + + uint64_t vfid[MAX_VFS]; + int nb_mac_to_monitor; + struct traffic traffic_policy; + uint8_t vcpu_to_control[MAX_VCPU_PER_VM]; + uint8_t num_vcpu; + struct timer_profile timer_policy; + bool core_type; + enum workload workload; + enum policy_to_use policy_to_use; + struct t_boost_status t_boost_status; +}; + +struct channel_packet_freq_list { + uint64_t resource_id; /**< core_num, device */ + uint32_t unit; /**< scale down/up/min/max */ + uint32_t command; /**< Power, IO, etc */ + char vm_name[VM_MAX_NAME_SZ]; + + uint32_t freq_list[MAX_VCPU_PER_VM]; + uint8_t num_vcpu; +}; + +struct channel_packet_caps_list { + uint64_t resource_id; /**< core_num, device */ + uint32_t unit; /**< scale down/up/min/max */ + uint32_t command; /**< Power, IO, etc */ + char vm_name[VM_MAX_NAME_SZ]; + + uint64_t turbo[MAX_VCPU_PER_VM]; + uint64_t priority[MAX_VCPU_PER_VM]; + uint8_t num_vcpu; +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* CHANNEL_COMMANDS_H_ */ diff --git 
a/src/spdk/dpdk/lib/librte_power/guest_channel.c b/src/spdk/dpdk/lib/librte_power/guest_channel.c new file mode 100644 index 000000000..b984d55bc --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/guest_channel.c @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <limits.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <poll.h> + + +#include <rte_log.h> + +#include "guest_channel.h" +#include "channel_commands.h" + +#define RTE_LOGTYPE_GUEST_CHANNEL RTE_LOGTYPE_USER1 + +/* Timeout for incoming message in milliseconds. */ +#define TIMEOUT 10 + +static int global_fds[RTE_MAX_LCORE] = { [0 ... RTE_MAX_LCORE-1] = -1 }; + +int +guest_channel_host_connect(const char *path, unsigned int lcore_id) +{ + int flags, ret; + struct channel_packet pkt; + char fd_path[PATH_MAX]; + int fd = -1; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + /* check if path is already open */ + if (global_fds[lcore_id] != -1) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is already open with fd %d\n", + lcore_id, global_fds[lcore_id]); + return -1; + } + + snprintf(fd_path, PATH_MAX, "%s.%u", path, lcore_id); + RTE_LOG(INFO, GUEST_CHANNEL, "Opening channel '%s' for lcore %u\n", + fd_path, lcore_id); + fd = open(fd_path, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Unable to to connect to '%s' with error " + "%s\n", fd_path, strerror(errno)); + return -1; + } + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Failed on fcntl get flags for file %s\n", + fd_path); + goto error; + } + + flags |= O_NONBLOCK; + if (fcntl(fd, F_SETFL, flags) < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Failed on setting non-blocking mode for " + "file %s", fd_path); + goto error; + } + /* QEMU needs a 
delay after connection */ + sleep(1); + + /* Send a test packet, this command is ignored by the host, but a successful + * send indicates that the host endpoint is monitoring. + */ + pkt.command = CPU_POWER_CONNECT; + global_fds[lcore_id] = fd; + ret = guest_channel_send_msg(&pkt, lcore_id); + if (ret != 0) { + RTE_LOG(ERR, GUEST_CHANNEL, + "Error on channel '%s' communications test: %s\n", + fd_path, ret > 0 ? strerror(ret) : + "channel not connected"); + goto error; + } + RTE_LOG(INFO, GUEST_CHANNEL, "Channel '%s' is now connected\n", fd_path); + return 0; +error: + close(fd); + global_fds[lcore_id] = -1; + return -1; +} + +int +guest_channel_send_msg(struct channel_packet *pkt, unsigned int lcore_id) +{ + int ret, buffer_len = sizeof(*pkt); + void *buffer = pkt; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + + if (global_fds[lcore_id] < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel is not connected\n"); + return -1; + } + while (buffer_len > 0) { + ret = write(global_fds[lcore_id], buffer, buffer_len); + if (ret == buffer_len) + return 0; + if (ret == -1) { + if (errno == EINTR) + continue; + return errno; + } + buffer = (char *)buffer + ret; + buffer_len -= ret; + } + return 0; +} + +int rte_power_guest_channel_send_msg(struct channel_packet *pkt, + unsigned int lcore_id) +{ + return guest_channel_send_msg(pkt, lcore_id); +} + +int power_guest_channel_read_msg(void *pkt, + size_t pkt_len, + unsigned int lcore_id) +{ + int ret; + struct pollfd fds; + + if (pkt_len == 0 || pkt == NULL) + return -1; + + fds.fd = global_fds[lcore_id]; + fds.events = POLLIN; + + ret = poll(&fds, 1, TIMEOUT); + if (ret == 0) { + RTE_LOG(DEBUG, GUEST_CHANNEL, "Timeout occurred during poll function.\n"); + return -1; + } else if (ret < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Error occurred during poll function: %s\n", + strerror(errno)); + return -1; + } + + if (lcore_id >= 
RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + + if (global_fds[lcore_id] < 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel is not connected\n"); + return -1; + } + + while (pkt_len > 0) { + ret = read(global_fds[lcore_id], + pkt, pkt_len); + + if (ret < 0) { + if (errno == EINTR) + continue; + return -1; + } + + if (ret == 0) { + RTE_LOG(ERR, GUEST_CHANNEL, "Expected more data, but connection has been closed.\n"); + return -1; + } + pkt = (char *)pkt + ret; + pkt_len -= ret; + } + + return 0; +} + +int rte_power_guest_channel_receive_msg(void *pkt, + size_t pkt_len, + unsigned int lcore_id) +{ + return power_guest_channel_read_msg(pkt, pkt_len, lcore_id); +} + +void +guest_channel_host_disconnect(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, GUEST_CHANNEL, "Channel(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return; + } + if (global_fds[lcore_id] < 0) + return; + close(global_fds[lcore_id]); + global_fds[lcore_id] = -1; +} diff --git a/src/spdk/dpdk/lib/librte_power/guest_channel.h b/src/spdk/dpdk/lib/librte_power/guest_channel.h new file mode 100644 index 000000000..025961606 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/guest_channel.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#ifndef _GUEST_CHANNEL_H +#define _GUEST_CHANNEL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <channel_commands.h> + +/** + * Connect to the Virtio-Serial VM end-point located in path. It is + * thread safe for unique lcore_ids. This function must be only called once from + * each lcore. + * + * @param path + * The path to the serial device on the filesystem + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on error. 
+ */ +int guest_channel_host_connect(const char *path, unsigned int lcore_id); + +/** + * Disconnect from an already connected Virtio-Serial Endpoint. + * + * + * @param lcore_id + * lcore_id. + * + */ +void guest_channel_host_disconnect(unsigned int lcore_id); + +/** + * Send a message contained in pkt over the Virtio-Serial to the host endpoint. + * + * @param pkt + * Pointer to a populated struct guest_agent_pkt + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on channel not connected. + * - errno on write to channel error. + */ +int guest_channel_send_msg(struct channel_packet *pkt, unsigned int lcore_id); + +/** + * Send a message contained in pkt over the Virtio-Serial to the host endpoint. + * + * @param pkt + * Pointer to a populated struct channel_packet + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_guest_channel_send_msg(struct channel_packet *pkt, + unsigned int lcore_id); + +/** + * Read a message contained in pkt over the Virtio-Serial + * from the host endpoint. + * + * @param pkt + * Pointer to channel_packet or + * channel_packet_freq_list struct. + * + * @param pkt_len + * Size of expected data packet. + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_guest_channel_read_msg(void *pkt, + size_t pkt_len, + unsigned int lcore_id); + +/** + * Receive a message contained in pkt over the Virtio-Serial + * from the host endpoint. + * + * @param pkt + * Pointer to channel_packet or + * channel_packet_freq_list struct. + * + * @param pkt_len + * Size of expected data packet. + * + * @param lcore_id + * lcore_id. + * + * @return + * - 0 on success. + * - Negative on error. 
+ */ +__rte_experimental +int +rte_power_guest_channel_receive_msg(void *pkt, + size_t pkt_len, + unsigned int lcore_id); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/meson.build b/src/spdk/dpdk/lib/librte_power/meson.build new file mode 100644 index 000000000..1cdba7a05 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/meson.build @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +if not is_linux + build = false + reason = 'only supported on linux' +endif +sources = files('rte_power.c', 'power_acpi_cpufreq.c', + 'power_kvm_vm.c', 'guest_channel.c', + 'rte_power_empty_poll.c', + 'power_pstate_cpufreq.c') +headers = files('rte_power.h','rte_power_empty_poll.h') +deps += ['timer'] +build = false +reason = 'not needed by SPDK' diff --git a/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c new file mode 100644 index 000000000..f443fce69 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.c @@ -0,0 +1,660 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <limits.h> + +#include <rte_atomic.h> +#include <rte_memcpy.h> +#include <rte_memory.h> +#include <rte_string_fns.h> + +#include "power_acpi_cpufreq.h" +#include "power_common.h" + +#ifdef RTE_LIBRTE_POWER_DEBUG +#define POWER_DEBUG_TRACE(fmt, args...) do { \ + RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \ +} while (0) +#else +#define POWER_DEBUG_TRACE(fmt, args...) 
+#endif + +#define FOPEN_OR_ERR_RET(f, retval) do { \ + if ((f) == NULL) { \ + RTE_LOG(ERR, POWER, "File not opened\n"); \ + return retval; \ + } \ +} while (0) + +#define FOPS_OR_NULL_GOTO(ret, label) do { \ + if ((ret) == NULL) { \ + RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \ + goto label; \ + } \ +} while (0) + +#define FOPS_OR_ERR_GOTO(ret, label) do { \ + if ((ret) < 0) { \ + RTE_LOG(ERR, POWER, "File operations failed\n"); \ + goto label; \ + } \ +} while (0) + +#define STR_SIZE 1024 +#define POWER_CONVERT_TO_DECIMAL 10 + +#define POWER_GOVERNOR_USERSPACE "userspace" +#define POWER_SYSFILE_GOVERNOR \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor" +#define POWER_SYSFILE_AVAIL_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_available_frequencies" +#define POWER_SYSFILE_SETSPEED \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_setspeed" + +/* + * MSR related + */ +#define PLATFORM_INFO 0x0CE +#define TURBO_RATIO_LIMIT 0x1AD +#define IA32_PERF_CTL 0x199 +#define CORE_TURBO_DISABLE_BIT ((uint64_t)1<<32) + +enum power_state { + POWER_IDLE = 0, + POWER_ONGOING, + POWER_USED, + POWER_UNKNOWN +}; + +/** + * Power info per lcore. + */ +struct rte_power_info { + unsigned int lcore_id; /**< Logical core id */ + uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */ + uint32_t nb_freqs; /**< number of available freqs */ + FILE *f; /**< FD of scaling_setspeed */ + char governor_ori[32]; /**< Original governor name */ + uint32_t curr_idx; /**< Freq index in freqs array */ + volatile uint32_t state; /**< Power in use state */ + uint16_t turbo_available; /**< Turbo Boost available */ + uint16_t turbo_enable; /**< Turbo Boost enable/disable */ +} __rte_cache_aligned; + +static struct rte_power_info lcore_power_info[RTE_MAX_LCORE]; + +/** + * It is to set specific freq for specific logical core, according to the index + * of supported frequencies. 
+ */ +static int +set_freq_internal(struct rte_power_info *pi, uint32_t idx) +{ + if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Invalid frequency index %u, which " + "should be less than %u\n", idx, pi->nb_freqs); + return -1; + } + + /* Check if it is the same as current */ + if (idx == pi->curr_idx) + return 0; + + POWER_DEBUG_TRACE("Frequency[%u] %u to be set for lcore %u\n", + idx, pi->freqs[idx], pi->lcore_id); + if (fseek(pi->f, 0, SEEK_SET) < 0) { + RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 " + "for setting frequency for lcore %u\n", pi->lcore_id); + return -1; + } + if (fprintf(pi->f, "%u", pi->freqs[idx]) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + fflush(pi->f); + pi->curr_idx = idx; + + return 1; +} + +/** + * It is to check the current scaling governor by reading sys file, and then + * set it into 'userspace' if it is not by writing the sys file. The original + * governor will be saved for rolling back. 
+ */ +static int +power_set_governor_userspace(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + /* Strip off terminating '\n' */ + strtok(buf, "\n"); + + /* Check if current governor is userspace */ + if (strncmp(buf, POWER_GOVERNOR_USERSPACE, + sizeof(POWER_GOVERNOR_USERSPACE)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u is " + "already userspace\n", pi->lcore_id); + goto out; + } + /* Save the original governor */ + strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori)); + + /* Write 'userspace' to the governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(POWER_GOVERNOR_USERSPACE, f); + FOPS_OR_ERR_GOTO(val, out); + + /* We need to flush to see if the fputs succeeds */ + val = fflush(f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been " + "set to user space successfully\n", pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to get the available frequencies of the specific lcore by reading the + * sys file. 
+ */ +static int +power_get_available_freqs(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1, i, count; + char *p; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *freqs[RTE_MAX_LCORE_FREQS]; + char *s; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_AVAIL_FREQ, + pi->lcore_id); + f = fopen(fullpath, "r"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Strip the line break if there is */ + p = strchr(buf, '\n'); + if (p != NULL) + *p = 0; + + /* Split string into at most RTE_MAX_LCORE_FREQS frequencies */ + count = rte_strsplit(buf, sizeof(buf), freqs, + RTE_MAX_LCORE_FREQS, ' '); + if (count <= 0) { + RTE_LOG(ERR, POWER, "No available frequency in " + ""POWER_SYSFILE_AVAIL_FREQ"\n", pi->lcore_id); + goto out; + } + if (count >= RTE_MAX_LCORE_FREQS) { + RTE_LOG(ERR, POWER, "Too many available frequencies : %d\n", + count); + goto out; + } + + /* Store the available frequncies into power context */ + for (i = 0, pi->nb_freqs = 0; i < count; i++) { + POWER_DEBUG_TRACE("Lcore %u frequency[%d]: %s\n", pi->lcore_id, + i, freqs[i]); + pi->freqs[pi->nb_freqs++] = strtoul(freqs[i], &p, + POWER_CONVERT_TO_DECIMAL); + } + + if ((pi->freqs[0]-1000) == pi->freqs[1]) { + pi->turbo_available = 1; + pi->turbo_enable = 1; + POWER_DEBUG_TRACE("Lcore %u Can do Turbo Boost\n", + pi->lcore_id); + } else { + pi->turbo_available = 0; + pi->turbo_enable = 0; + POWER_DEBUG_TRACE("Turbo Boost not available on Lcore %u\n", + pi->lcore_id); + } + + ret = 0; + POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n", + count, pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to fopen the sys file for the future setting the lcore frequency. 
+ */ +static int +power_init_for_setting_freq(struct rte_power_info *pi) +{ + FILE *f; + char fullpath[PATH_MAX]; + char buf[BUFSIZ]; + uint32_t i, freq; + char *s; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_SETSPEED, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, -1); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + freq = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL); + for (i = 0; i < pi->nb_freqs; i++) { + if (freq == pi->freqs[i]) { + pi->curr_idx = i; + pi->f = f; + return 0; + } + } + +out: + fclose(f); + + return -1; +} + +int +power_acpi_cpufreq_init(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "in use\n", lcore_id); + return -1; + } + + pi->lcore_id = lcore_id; + /* Check and set the governor */ + if (power_set_governor_userspace(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to " + "userspace\n", lcore_id); + goto fail; + } + + /* Get the available frequencies */ + if (power_get_available_freqs(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot get available frequencies of " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Init for setting lcore frequency */ + if (power_init_for_setting_freq(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot init for setting frequency for " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Set freq to max by default */ + if (power_acpi_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u " + "to max\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u " + "power management\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED); + + return 0; 
+ +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +/** + * It is to check the governor and then set the original governor back if + * needed by writing the sys file. + */ +static int +power_set_governor_original(struct rte_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Check if the governor to be set is the same as current */ + if (strncmp(buf, pi->governor_ori, sizeof(pi->governor_ori)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u " + "has already been set to %s\n", + pi->lcore_id, pi->governor_ori); + goto out; + } + + /* Write back the original governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(pi->governor_ori, f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u " + "has been set back to %s successfully\n", + pi->lcore_id, pi->governor_ori); +out: + fclose(f); + + return ret; +} + +int +power_acpi_cpufreq_exit(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_USED, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "not used\n", lcore_id); + return -1; + } + + /* Close FD of setting freq */ + fclose(pi->f); + pi->f = NULL; + + /* Set the governor back to the original */ + if (power_set_governor_original(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set the governor of %u back " + "to the original\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Power 
management of lcore %u has exited from " + "'userspace' mode and been set back to the " + "original\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +uint32_t +power_acpi_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return 0; + } + + if (freqs == NULL) { + RTE_LOG(ERR, POWER, "NULL buffer supplied\n"); + return 0; + } + + pi = &lcore_power_info[lcore_id]; + if (num < pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Buffer size is not enough\n"); + return 0; + } + rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t)); + + return pi->nb_freqs; +} + +uint32_t +power_acpi_cpufreq_get_freq(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return RTE_POWER_INVALID_FREQ_INDEX; + } + + return lcore_power_info[lcore_id].curr_idx; +} + +int +power_acpi_cpufreq_set_freq(unsigned int lcore_id, uint32_t index) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + return set_freq_internal(&(lcore_power_info[lcore_id]), index); +} + +int +power_acpi_cpufreq_freq_down(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx + 1 == pi->nb_freqs) + return 0; + + /* Frequencies in the array are from high to low. 
*/ + return set_freq_internal(pi, pi->curr_idx + 1); +} + +int +power_acpi_cpufreq_freq_up(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx == 0 || + (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable)) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx - 1); +} + +int +power_acpi_cpufreq_freq_max(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + /* Frequencies in the array are from high to low. */ + if (lcore_power_info[lcore_id].turbo_available) { + if (lcore_power_info[lcore_id].turbo_enable) + /* Set to Turbo */ + return set_freq_internal( + &lcore_power_info[lcore_id], 0); + else + /* Set to max non-turbo */ + return set_freq_internal( + &lcore_power_info[lcore_id], 1); + } else + return set_freq_internal(&lcore_power_info[lcore_id], 0); +} + +int +power_acpi_cpufreq_freq_min(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + /* Frequencies in the array are from high to low. 
*/ + return set_freq_internal(pi, pi->nb_freqs - 1); +} + + +int +power_acpi_turbo_status(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + return pi->turbo_enable; +} + + +int +power_acpi_enable_turbo(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + if (pi->turbo_available) + pi->turbo_enable = 1; + else { + pi->turbo_enable = 0; + RTE_LOG(ERR, POWER, + "Failed to enable turbo on lcore %u\n", + lcore_id); + return -1; + } + + /* Max may have changed, so call to max function */ + if (power_acpi_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, + "Failed to set frequency of lcore %u to max\n", + lcore_id); + return -1; + } + + return 0; +} + +int +power_acpi_disable_turbo(unsigned int lcore_id) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + pi->turbo_enable = 0; + + if ((pi->turbo_available) && (pi->curr_idx <= 1)) { + /* Try to set freq to max by default coming out of turbo */ + if (power_acpi_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, + "Failed to set frequency of lcore %u to max\n", + lcore_id); + return -1; + } + } + + return 0; +} + +int power_acpi_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps) +{ + struct rte_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + if (caps == NULL) { + RTE_LOG(ERR, POWER, "Invalid argument\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + caps->capabilities = 0; + caps->turbo = !!(pi->turbo_available); + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h 
b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h new file mode 100644 index 000000000..1af741607 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_acpi_cpufreq.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _POWER_ACPI_CPUFREQ_H +#define _POWER_ACPI_CPUFREQ_H + +/** + * @file + * RTE Power Management via userspace ACPI cpufreq + */ + +#include <rte_common.h> +#include <rte_byteorder.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include "rte_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize power management for a specific lcore. It will check and set the + * governor to userspace for the lcore, get the available frequencies, and + * prepare to set new lcore frequency. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_acpi_cpufreq_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. It will set the governor to which + * is before initialized. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_acpi_cpufreq_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. The return value will be + * the minimal one of the total number of available frequencies and the number + * of buffer. The index of available frequencies used in other interfaces + * should be in the range of 0 to this return value. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * The number of available frequencies. + */ +uint32_t power_acpi_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +/** + * Return the current index of available frequencies of a specific lcore. 
It + * will return 'RTE_POWER_INVALID_FREQ_INDEX = (~0)' if error. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * The current index of available frequencies. + */ +uint32_t power_acpi_cpufreq_get_freq(unsigned int lcore_id); + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_set_freq(unsigned int lcore_id, uint32_t index); + +/** + * Scale up the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_up(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_down(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. 
+ */ +int power_acpi_cpufreq_freq_max(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_acpi_cpufreq_freq_min(unsigned int lcore_id); + +/** + * Get the turbo status of a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 Turbo Boost is enabled on this lcore. + * - 0 Turbo Boost is disabled on this lcore. + * - Negative on error. + */ +int power_acpi_turbo_status(unsigned int lcore_id); + +/** + * Enable Turbo Boost on a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 Turbo Boost is enabled successfully on this lcore. + * - Negative on error. + */ +int power_acpi_enable_turbo(unsigned int lcore_id); + +/** + * Disable Turbo Boost on a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 Turbo Boost disabled successfully on this lcore. + * - Negative on error. + */ +int power_acpi_disable_turbo(unsigned int lcore_id); + +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. 
+ */ +int power_acpi_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/power_common.h b/src/spdk/dpdk/lib/librte_power/power_common.h new file mode 100644 index 000000000..feeb5777b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_common.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _POWER_COMMON_H_ +#define _POWER_COMMON_H_ + +#define RTE_POWER_INVALID_FREQ_INDEX (~0) + +#endif /* _POWER_COMMON_H_ */ diff --git a/src/spdk/dpdk/lib/librte_power/power_kvm_vm.c b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.c new file mode 100644 index 000000000..2bb17beb1 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <errno.h> +#include <string.h> + +#include <rte_log.h> + +#include "guest_channel.h" +#include "channel_commands.h" +#include "power_kvm_vm.h" +#include "power_common.h" + +#define FD_PATH "/dev/virtio-ports/virtio.serial.port.poweragent" + +static struct channel_packet pkt[RTE_MAX_LCORE]; + + +int +power_kvm_vm_init(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Core(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + pkt[lcore_id].command = CPU_POWER; + pkt[lcore_id].resource_id = lcore_id; + return guest_channel_host_connect(FD_PATH, lcore_id); +} + +int +power_kvm_vm_exit(unsigned int lcore_id) +{ + guest_channel_host_disconnect(lcore_id); + return 0; +} + +uint32_t +power_kvm_vm_freqs(__rte_unused unsigned int lcore_id, + __rte_unused uint32_t *freqs, + __rte_unused uint32_t num) +{ + RTE_LOG(ERR, POWER, "rte_power_freqs is not implemented " + "for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +uint32_t +power_kvm_vm_get_freq(__rte_unused unsigned 
int lcore_id) +{ + RTE_LOG(ERR, POWER, "rte_power_get_freq is not implemented " + "for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +int +power_kvm_vm_set_freq(__rte_unused unsigned int lcore_id, + __rte_unused uint32_t index) +{ + RTE_LOG(ERR, POWER, "rte_power_set_freq is not implemented " + "for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +static inline int +send_msg(unsigned int lcore_id, uint32_t scale_direction) +{ + int ret; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Core(%u) is out of range 0...%d\n", + lcore_id, RTE_MAX_LCORE-1); + return -1; + } + pkt[lcore_id].unit = scale_direction; + ret = guest_channel_send_msg(&pkt[lcore_id], lcore_id); + if (ret == 0) + return 1; + RTE_LOG(DEBUG, POWER, "Error sending message: %s\n", + ret > 0 ? strerror(ret) : "channel not connected"); + return -1; +} + +int +power_kvm_vm_freq_up(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_UP); +} + +int +power_kvm_vm_freq_down(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_DOWN); +} + +int +power_kvm_vm_freq_max(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_MAX); +} + +int +power_kvm_vm_freq_min(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_SCALE_MIN); +} + +int +power_kvm_vm_turbo_status(__rte_unused unsigned int lcore_id) +{ + RTE_LOG(ERR, POWER, "rte_power_turbo_status is not implemented for Virtual Machine Power Management\n"); + return -ENOTSUP; +} + +int +power_kvm_vm_enable_turbo(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_ENABLE_TURBO); +} + +int +power_kvm_vm_disable_turbo(unsigned int lcore_id) +{ + return send_msg(lcore_id, CPU_POWER_DISABLE_TURBO); +} + +struct rte_power_core_capabilities; +int power_kvm_vm_get_capabilities(__rte_unused unsigned int lcore_id, + __rte_unused struct rte_power_core_capabilities *caps) +{ + RTE_LOG(ERR, POWER, "rte_power_get_capabilities is not implemented for Virtual Machine 
Power Management\n"); + return -ENOTSUP; +} diff --git a/src/spdk/dpdk/lib/librte_power/power_kvm_vm.h b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.h new file mode 100644 index 000000000..94d4aa121 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_kvm_vm.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _POWER_KVM_VM_H +#define _POWER_KVM_VM_H + +/** + * @file + * RTE Power Management KVM VM + */ + +#include <rte_common.h> +#include <rte_byteorder.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include "rte_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize power management for a specific lcore. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_kvm_vm_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_kvm_vm_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. + * It is not currently supported for VM Power Management. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * -ENOTSUP + */ +uint32_t power_kvm_vm_freqs(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +/** + * Return the current index of available frequencies of a specific lcore. + * It is not currently supported for VM Power Management. + * + * @param lcore_id + * lcore id. + * + * @return + * -ENOTSUP + */ +uint32_t power_kvm_vm_get_freq(unsigned int lcore_id); + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * It is not currently supported for VM Power Management. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. 
+ * + * @return + * -ENOTSUP + */ +int power_kvm_vm_set_freq(unsigned int lcore_id, uint32_t index); + +/** + * Scale up the frequency of a specific lcore. This request is forwarded to the + * host monitor. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_up(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_down(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_max(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_freq_min(unsigned int lcore_id); + +/** + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * -ENOTSUP + */ +int power_kvm_vm_turbo_status(unsigned int lcore_id); + +/** + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_enable_turbo(unsigned int lcore_id); + +/** + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. 
+ * + * @return + * - 1 on success. + * - Negative on error. + */ +int power_kvm_vm_disable_turbo(unsigned int lcore_id); + +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_kvm_vm_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/spdk/dpdk/lib/librte_power/power_pstate_cpufreq.c b/src/spdk/dpdk/lib/librte_power/power_pstate_cpufreq.c new file mode 100644 index 000000000..2d8a9499d --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_pstate_cpufreq.c @@ -0,0 +1,854 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <limits.h> +#include <errno.h> +#include <inttypes.h> + +#include <rte_atomic.h> +#include <rte_memcpy.h> +#include <rte_memory.h> +#include <rte_string_fns.h> + +#include "power_pstate_cpufreq.h" +#include "power_common.h" + + +#ifdef RTE_LIBRTE_POWER_DEBUG +#define POWER_DEBUG_TRACE(fmt, args...) do { \ + RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \ +} while (0) +#else +#define POWER_DEBUG_TRACE(fmt, args...) 
+#endif + +#define FOPEN_OR_ERR_RET(f, retval) do { \ + if ((f) == NULL) { \ + RTE_LOG(ERR, POWER, "File not opened\n"); \ + return retval; \ + } \ +} while (0) + +#define FOPS_OR_NULL_GOTO(ret, label) do { \ + if ((ret) == NULL) { \ + RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \ + goto label; \ + } \ +} while (0) + +#define FOPS_OR_ERR_GOTO(ret, label) do { \ + if ((ret) < 0) { \ + RTE_LOG(ERR, POWER, "File operations failed\n"); \ + goto label; \ + } \ +} while (0) + + +#define POWER_CONVERT_TO_DECIMAL 10 +#define BUS_FREQ 100000 + +#define POWER_GOVERNOR_PERF "performance" +#define POWER_SYSFILE_GOVERNOR \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor" +#define POWER_SYSFILE_MAX_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq" +#define POWER_SYSFILE_MIN_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_min_freq" +#define POWER_SYSFILE_CUR_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq" +#define POWER_SYSFILE_BASE_MAX_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_max_freq" +#define POWER_SYSFILE_BASE_MIN_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq" +#define POWER_SYSFILE_BASE_FREQ \ + "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency" +#define POWER_MSR_PATH "/dev/cpu/%u/msr" + +/* + * MSR related + */ +#define PLATFORM_INFO 0x0CE +#define NON_TURBO_MASK 0xFF00 +#define NON_TURBO_OFFSET 0x8 + + +enum power_state { + POWER_IDLE = 0, + POWER_ONGOING, + POWER_USED, + POWER_UNKNOWN +}; + +struct pstate_power_info { + unsigned int lcore_id; /**< Logical core id */ + uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */ + uint32_t nb_freqs; /**< number of available freqs */ + FILE *f_cur_min; /**< FD of scaling_min */ + FILE *f_cur_max; /**< FD of scaling_max */ + char governor_ori[32]; /**< Original governor name */ + uint32_t curr_idx; /**< Freq index in freqs array */ + uint32_t non_turbo_max_ratio; /**< Non Turbo Max ratio */ + uint32_t sys_max_freq; /**< system wide 
max freq */ + uint32_t core_base_freq; /**< core base freq */ + volatile uint32_t state; /**< Power in use state */ + uint16_t turbo_available; /**< Turbo Boost available */ + uint16_t turbo_enable; /**< Turbo Boost enable/disable */ + uint16_t priority_core; /**< High Performance core */ +} __rte_cache_aligned; + + +static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE]; + +/** + * It is to read the specific MSR. + */ + +static int32_t +power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id) +{ + int fd, ret; + char fullpath[PATH_MAX]; + + snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id); + + fd = open(fullpath, O_RDONLY); + + if (fd < 0) { + RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath, + strerror(errno)); + return fd; + } + + ret = pread(fd, val, sizeof(uint64_t), msr); + + if (ret < 0) { + RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath, + strerror(errno)); + goto out; + } + + POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n", + fullpath, msr, lcore_id); + + POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val); + +out: close(fd); + return ret; +} + +/** + * It is to fopen the sys file for the future setting the lcore frequency. 
+ */ +static int +power_init_for_setting_freq(struct pstate_power_info *pi) +{ + FILE *f_min, *f_max, *f_base; + char fullpath_min[PATH_MAX]; + char fullpath_max[PATH_MAX]; + char fullpath_base[PATH_MAX]; + char buf_base[BUFSIZ]; + char *s_base; + uint32_t base_ratio = 0; + uint64_t max_non_turbo = 0; + int ret_val = 0; + + snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ, + pi->lcore_id); + + f_min = fopen(fullpath_min, "rw+"); + FOPEN_OR_ERR_RET(f_min, -1); + + snprintf(fullpath_max, sizeof(fullpath_max), POWER_SYSFILE_MAX_FREQ, + pi->lcore_id); + + f_max = fopen(fullpath_max, "rw+"); + if (f_max == NULL) + fclose(f_min); + + FOPEN_OR_ERR_RET(f_max, -1); + + pi->f_cur_min = f_min; + pi->f_cur_max = f_max; + + snprintf(fullpath_base, sizeof(fullpath_base), POWER_SYSFILE_BASE_FREQ, + pi->lcore_id); + + f_base = fopen(fullpath_base, "r"); + if (f_base == NULL) { + /* No sysfs base_frequency, that's OK, continue without */ + base_ratio = 0; + } else { + s_base = fgets(buf_base, sizeof(buf_base), f_base); + FOPS_OR_NULL_GOTO(s_base, out); + + buf_base[BUFSIZ-1] = '\0'; + if (strlen(buf_base)) + /* Strip off terminating '\n' */ + strtok(buf_base, "\n"); + + base_ratio = strtoul(buf_base, NULL, POWER_CONVERT_TO_DECIMAL) + / BUS_FREQ; + } + + /* Add MSR read to detect turbo status */ + + if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0) { + ret_val = -1; + goto out; + } + + max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET; + + POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo); + + pi->non_turbo_max_ratio = max_non_turbo; + + /* + * If base_frequency is reported as greater than the maximum + * non-turbo frequency, then mark it as a high priority core. 
+ */ + if (base_ratio > max_non_turbo) + pi->priority_core = 1; + else + pi->priority_core = 0; + pi->core_base_freq = base_ratio * BUS_FREQ; + +out: + if (f_base != NULL) + fclose(f_base); + return ret_val; +} + +static int +set_freq_internal(struct pstate_power_info *pi, uint32_t idx) +{ + uint32_t target_freq = 0; + + if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Invalid frequency index %u, which " + "should be less than %u\n", idx, pi->nb_freqs); + return -1; + } + + /* Check if it is the same as current */ + if (idx == pi->curr_idx) + return 0; + + /* Because Intel Pstate Driver only allow user change min/max hint + * User need change the min/max as same value. + */ + if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) { + RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 " + "for setting frequency for lcore %u\n", + pi->lcore_id); + return -1; + } + + if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) { + RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 " + "for setting frequency for lcore %u\n", + pi->lcore_id); + return -1; + } + + /* Turbo is available and enabled, first freq bucket is sys max freq */ + if (pi->turbo_available && idx == 0) { + if (pi->turbo_enable) + target_freq = pi->sys_max_freq; + else { + RTE_LOG(ERR, POWER, "Turbo is off, frequency can't be scaled up more %u\n", + pi->lcore_id); + return -1; + } + } else + target_freq = pi->freqs[idx]; + + /* Decrease freq, the min freq should be updated first */ + if (idx > pi->curr_idx) { + + if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + + if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + + POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n", + target_freq, pi->lcore_id); + + fflush(pi->f_cur_min); + fflush(pi->f_cur_max); + + } + 
+ /* Increase freq, the max freq should be updated first */ + if (idx < pi->curr_idx) { + + if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + + if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) { + RTE_LOG(ERR, POWER, "Fail to write new frequency for " + "lcore %u\n", pi->lcore_id); + return -1; + } + + POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n", + target_freq, pi->lcore_id); + + fflush(pi->f_cur_max); + fflush(pi->f_cur_min); + } + + pi->curr_idx = idx; + + return 1; +} + +/** + * It is to check the current scaling governor by reading sys file, and then + * set it into 'performance' if it is not by writing the sys file. The original + * governor will be saved for rolling back. + */ +static int +power_set_governor_performance(struct pstate_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + /* Strip off terminating '\n' */ + strtok(buf, "\n"); + + /* Check if current governor is performance */ + if (strncmp(buf, POWER_GOVERNOR_PERF, + sizeof(POWER_GOVERNOR_PERF)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u is " + "already performance\n", pi->lcore_id); + goto out; + } + /* Save the original governor */ + strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori)); + + /* Write 'performance' to the governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(POWER_GOVERNOR_PERF, f); + FOPS_OR_ERR_GOTO(val, out); + + /* We need to flush to see if the fputs succeeds */ + val = fflush(f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been " + "set to 
performance successfully\n", pi->lcore_id); +out: + fclose(f); + + return ret; +} + +/** + * It is to check the governor and then set the original governor back if + * needed by writing the sys file. + */ +static int +power_set_governor_original(struct pstate_power_info *pi) +{ + FILE *f; + int ret = -1; + char buf[BUFSIZ]; + char fullpath[PATH_MAX]; + char *s; + int val; + + snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, + pi->lcore_id); + f = fopen(fullpath, "rw+"); + FOPEN_OR_ERR_RET(f, ret); + + s = fgets(buf, sizeof(buf), f); + FOPS_OR_NULL_GOTO(s, out); + + /* Check if the governor to be set is the same as current */ + if (strncmp(buf, pi->governor_ori, sizeof(pi->governor_ori)) == 0) { + ret = 0; + POWER_DEBUG_TRACE("Power management governor of lcore %u " + "has already been set to %s\n", + pi->lcore_id, pi->governor_ori); + goto out; + } + + /* Write back the original governor */ + val = fseek(f, 0, SEEK_SET); + FOPS_OR_ERR_GOTO(val, out); + + val = fputs(pi->governor_ori, f); + FOPS_OR_ERR_GOTO(val, out); + + ret = 0; + RTE_LOG(INFO, POWER, "Power management governor of lcore %u " + "has been set back to %s successfully\n", + pi->lcore_id, pi->governor_ori); +out: + fclose(f); + + return ret; +} + +/** + * It is to get the available frequencies of the specific lcore by reading the + * sys file. 
+ */ +static int +power_get_available_freqs(struct pstate_power_info *pi) +{ + FILE *f_min, *f_max; + int ret = -1; + char *p_min, *p_max; + char buf_min[BUFSIZ]; + char buf_max[BUFSIZ]; + char fullpath_min[PATH_MAX]; + char fullpath_max[PATH_MAX]; + char *s_min, *s_max; + uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0; + uint32_t i, num_freqs = 0; + + snprintf(fullpath_max, sizeof(fullpath_max), + POWER_SYSFILE_BASE_MAX_FREQ, + pi->lcore_id); + snprintf(fullpath_min, sizeof(fullpath_min), + POWER_SYSFILE_BASE_MIN_FREQ, + pi->lcore_id); + + f_min = fopen(fullpath_min, "r"); + FOPEN_OR_ERR_RET(f_min, ret); + + f_max = fopen(fullpath_max, "r"); + if (f_max == NULL) + fclose(f_min); + + FOPEN_OR_ERR_RET(f_max, ret); + + s_min = fgets(buf_min, sizeof(buf_min), f_min); + FOPS_OR_NULL_GOTO(s_min, out); + + s_max = fgets(buf_max, sizeof(buf_max), f_max); + FOPS_OR_NULL_GOTO(s_max, out); + + + /* Strip the line break if there is */ + p_min = strchr(buf_min, '\n'); + if (p_min != NULL) + *p_min = 0; + + p_max = strchr(buf_max, '\n'); + if (p_max != NULL) + *p_max = 0; + + sys_min_freq = strtoul(buf_min, &p_min, POWER_CONVERT_TO_DECIMAL); + sys_max_freq = strtoul(buf_max, &p_max, POWER_CONVERT_TO_DECIMAL); + + if (sys_max_freq < sys_min_freq) + goto out; + + pi->sys_max_freq = sys_max_freq; + + if (pi->priority_core == 1) + base_max_freq = pi->core_base_freq; + else + base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ; + + POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n", + sys_min_freq, + sys_max_freq, + base_max_freq); + + if (base_max_freq < sys_max_freq) + pi->turbo_available = 1; + else + pi->turbo_available = 0; + + /* If turbo is available then there is one extra freq bucket + * to store the sys max freq which value is base_max +1 + */ + num_freqs = (base_max_freq - sys_min_freq) / BUS_FREQ + 1 + + pi->turbo_available; + + /* Generate the freq bucket array. 
+ * If turbo is available the freq bucket[0] value is base_max +1 + * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ + * and so on. + * If turbo is not available bucket[0] is base_max and so on + */ + for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) { + if ((i == 0) && pi->turbo_available) + pi->freqs[pi->nb_freqs++] = base_max_freq + 1; + else + pi->freqs[pi->nb_freqs++] = + base_max_freq - (i - pi->turbo_available) * BUS_FREQ; + } + + ret = 0; + + POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n", + num_freqs, pi->lcore_id); + +out: + fclose(f_min); + fclose(f_max); + + return ret; +} + +int +power_pstate_cpufreq_init(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "in use\n", lcore_id); + return -1; + } + + pi->lcore_id = lcore_id; + /* Check and set the governor */ + if (power_set_governor_performance(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to " + "performance\n", lcore_id); + goto fail; + } + /* Init for setting lcore frequency */ + if (power_init_for_setting_freq(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot init for setting frequency for " + "lcore %u\n", lcore_id); + goto fail; + } + + /* Get the available frequencies */ + if (power_get_available_freqs(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot get available frequencies of " + "lcore %u\n", lcore_id); + goto fail; + } + + + /* Set freq to max by default */ + if (power_pstate_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u " + "to max\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u " + "power management\n", lcore_id); + 
rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + +int +power_pstate_cpufreq_exit(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n", + lcore_id, RTE_MAX_LCORE - 1U); + return -1; + } + pi = &lcore_power_info[lcore_id]; + + if (rte_atomic32_cmpset(&(pi->state), POWER_USED, POWER_ONGOING) + == 0) { + RTE_LOG(INFO, POWER, "Power management of lcore %u is " + "not used\n", lcore_id); + return -1; + } + + /* Close FD of setting freq */ + fclose(pi->f_cur_min); + fclose(pi->f_cur_max); + pi->f_cur_min = NULL; + pi->f_cur_max = NULL; + + /* Set the governor back to the original */ + if (power_set_governor_original(pi) < 0) { + RTE_LOG(ERR, POWER, "Cannot set the governor of %u back " + "to the original\n", lcore_id); + goto fail; + } + + RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from " + "'performance' mode and been set back to the " + "original\n", lcore_id); + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE); + + return 0; + +fail: + rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN); + + return -1; +} + + +uint32_t +power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return 0; + } + + if (freqs == NULL) { + RTE_LOG(ERR, POWER, "NULL buffer supplied\n"); + return 0; + } + + pi = &lcore_power_info[lcore_id]; + if (num < pi->nb_freqs) { + RTE_LOG(ERR, POWER, "Buffer size is not enough\n"); + return 0; + } + rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t)); + + return pi->nb_freqs; +} + +uint32_t +power_pstate_cpufreq_get_freq(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return 
RTE_POWER_INVALID_FREQ_INDEX; + } + + return lcore_power_info[lcore_id].curr_idx; +} + + +int +power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + return set_freq_internal(&(lcore_power_info[lcore_id]), index); +} + +int +power_pstate_cpufreq_freq_up(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx == 0 || + (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable)) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx - 1); +} + +int +power_pstate_cpufreq_freq_down(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + if (pi->curr_idx + 1 == pi->nb_freqs) + return 0; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->curr_idx + 1); +} + +int +power_pstate_cpufreq_freq_max(unsigned int lcore_id) +{ + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + /* Frequencies in the array are from high to low. 
*/ + if (lcore_power_info[lcore_id].turbo_available) { + if (lcore_power_info[lcore_id].turbo_enable) + /* Set to Turbo */ + return set_freq_internal( + &lcore_power_info[lcore_id], 0); + else + /* Set to max non-turbo */ + return set_freq_internal( + &lcore_power_info[lcore_id], 1); + } else + return set_freq_internal(&lcore_power_info[lcore_id], 0); +} + + +int +power_pstate_cpufreq_freq_min(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + /* Frequencies in the array are from high to low. */ + return set_freq_internal(pi, pi->nb_freqs - 1); +} + + +int +power_pstate_turbo_status(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + return pi->turbo_enable; +} + +int +power_pstate_enable_turbo(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + if (pi->turbo_available) + pi->turbo_enable = 1; + else { + pi->turbo_enable = 0; + RTE_LOG(ERR, POWER, + "Failed to enable turbo on lcore %u\n", + lcore_id); + return -1; + } + + return 0; +} + + +int +power_pstate_disable_turbo(unsigned int lcore_id) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + + pi->turbo_enable = 0; + + if (pi->turbo_available && pi->curr_idx <= 1) { + /* Try to set freq to max by default coming out of turbo */ + if (power_pstate_cpufreq_freq_max(lcore_id) < 0) { + RTE_LOG(ERR, POWER, + "Failed to set frequency of lcore %u to max\n", + lcore_id); + return -1; + } + } + + return 0; +} + + +int power_pstate_get_capabilities(unsigned int 
lcore_id, + struct rte_power_core_capabilities *caps) +{ + struct pstate_power_info *pi; + + if (lcore_id >= RTE_MAX_LCORE) { + RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); + return -1; + } + if (caps == NULL) { + RTE_LOG(ERR, POWER, "Invalid argument\n"); + return -1; + } + + pi = &lcore_power_info[lcore_id]; + caps->capabilities = 0; + caps->turbo = !!(pi->turbo_available); + caps->priority = pi->priority_core; + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_power/power_pstate_cpufreq.h b/src/spdk/dpdk/lib/librte_power/power_pstate_cpufreq.h new file mode 100644 index 000000000..6fd801881 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/power_pstate_cpufreq.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _POWER_PSTATE_CPUFREQ_H +#define _POWER_PSTATE_CPUFREQ_H + +/** + * @file + * RTE Power Management via Intel Pstate driver + */ + +#include <rte_common.h> +#include <rte_byteorder.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include "rte_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize power management for a specific lcore. It will check and set the + * governor to performance for the lcore, get the available frequencies, and + * prepare to set new lcore frequency. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_pstate_cpufreq_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. It will set the governor to which + * is before initialized. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_pstate_cpufreq_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. The return value will be + * the minimal one of the total number of available frequencies and the number + * of buffer. 
The index of available frequencies used in other interfaces + * should be in the range of 0 to this return value. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * The number of available frequencies. + */ +uint32_t power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +/** + * Return the current index of available frequencies of a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * The current index of available frequencies. + * If error, it will return 'RTE_POWER_INVALID_FREQ_INDEX = (~0)'. + */ +uint32_t power_pstate_cpufreq_get_freq(unsigned int lcore_id); + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index); + +/** + * Scale up the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_pstate_cpufreq_freq_up(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. 
+ * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_pstate_cpufreq_freq_down(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_pstate_cpufreq_freq_max(unsigned int lcore_id); + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +int power_pstate_cpufreq_freq_min(unsigned int lcore_id); + +/** + * Get the turbo status of a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 Turbo Boost is enabled on this lcore. + * - 0 Turbo Boost is disabled on this lcore. + * - Negative on error. + */ +int power_pstate_turbo_status(unsigned int lcore_id); + +/** + * Enable Turbo Boost on a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 Turbo Boost is enabled successfully on this lcore. + * - Negative on error. + */ +int power_pstate_enable_turbo(unsigned int lcore_id); + +/** + * Disable Turbo Boost on a specific lcore. + * It should be protected outside of this function for threadsafe. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 Turbo Boost disabled successfully on this lcore. + * - Negative on error. 
+ */ +int power_pstate_disable_turbo(unsigned int lcore_id); + +/** + * Returns power capabilities for a specific lcore. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int power_pstate_get_capabilities(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/rte_power.c b/src/spdk/dpdk/lib/librte_power/rte_power.c new file mode 100644 index 000000000..6b7722727 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power.c @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <rte_spinlock.h> + +#include "rte_power.h" +#include "power_acpi_cpufreq.h" +#include "power_kvm_vm.h" +#include "power_pstate_cpufreq.h" +#include "power_common.h" + +enum power_management_env global_default_env = PM_ENV_NOT_SET; + +static rte_spinlock_t global_env_cfg_lock = RTE_SPINLOCK_INITIALIZER; + +/* function pointers */ +rte_power_freqs_t rte_power_freqs = NULL; +rte_power_get_freq_t rte_power_get_freq = NULL; +rte_power_set_freq_t rte_power_set_freq = NULL; +rte_power_freq_change_t rte_power_freq_up = NULL; +rte_power_freq_change_t rte_power_freq_down = NULL; +rte_power_freq_change_t rte_power_freq_max = NULL; +rte_power_freq_change_t rte_power_freq_min = NULL; +rte_power_freq_change_t rte_power_turbo_status; +rte_power_freq_change_t rte_power_freq_enable_turbo; +rte_power_freq_change_t rte_power_freq_disable_turbo; +rte_power_get_capabilities_t rte_power_get_capabilities; + +static void +reset_power_function_ptrs(void) +{ + rte_power_freqs = NULL; + rte_power_get_freq = NULL; + rte_power_set_freq = NULL; + rte_power_freq_up = NULL; + rte_power_freq_down = NULL; + rte_power_freq_max = NULL; + rte_power_freq_min = NULL; + rte_power_turbo_status = NULL; + rte_power_freq_enable_turbo = NULL; + 
rte_power_freq_disable_turbo = NULL; + rte_power_get_capabilities = NULL; +} + +int +rte_power_set_env(enum power_management_env env) +{ + rte_spinlock_lock(&global_env_cfg_lock); + + if (global_default_env != PM_ENV_NOT_SET) { + RTE_LOG(ERR, POWER, "Power Management Environment already set.\n"); + rte_spinlock_unlock(&global_env_cfg_lock); + return -1; + } + + int ret = 0; + + if (env == PM_ENV_ACPI_CPUFREQ) { + rte_power_freqs = power_acpi_cpufreq_freqs; + rte_power_get_freq = power_acpi_cpufreq_get_freq; + rte_power_set_freq = power_acpi_cpufreq_set_freq; + rte_power_freq_up = power_acpi_cpufreq_freq_up; + rte_power_freq_down = power_acpi_cpufreq_freq_down; + rte_power_freq_min = power_acpi_cpufreq_freq_min; + rte_power_freq_max = power_acpi_cpufreq_freq_max; + rte_power_turbo_status = power_acpi_turbo_status; + rte_power_freq_enable_turbo = power_acpi_enable_turbo; + rte_power_freq_disable_turbo = power_acpi_disable_turbo; + rte_power_get_capabilities = power_acpi_get_capabilities; + } else if (env == PM_ENV_KVM_VM) { + rte_power_freqs = power_kvm_vm_freqs; + rte_power_get_freq = power_kvm_vm_get_freq; + rte_power_set_freq = power_kvm_vm_set_freq; + rte_power_freq_up = power_kvm_vm_freq_up; + rte_power_freq_down = power_kvm_vm_freq_down; + rte_power_freq_min = power_kvm_vm_freq_min; + rte_power_freq_max = power_kvm_vm_freq_max; + rte_power_turbo_status = power_kvm_vm_turbo_status; + rte_power_freq_enable_turbo = power_kvm_vm_enable_turbo; + rte_power_freq_disable_turbo = power_kvm_vm_disable_turbo; + rte_power_get_capabilities = power_kvm_vm_get_capabilities; + } else if (env == PM_ENV_PSTATE_CPUFREQ) { + rte_power_freqs = power_pstate_cpufreq_freqs; + rte_power_get_freq = power_pstate_cpufreq_get_freq; + rte_power_set_freq = power_pstate_cpufreq_set_freq; + rte_power_freq_up = power_pstate_cpufreq_freq_up; + rte_power_freq_down = power_pstate_cpufreq_freq_down; + rte_power_freq_min = power_pstate_cpufreq_freq_min; + rte_power_freq_max = 
power_pstate_cpufreq_freq_max; + rte_power_turbo_status = power_pstate_turbo_status; + rte_power_freq_enable_turbo = power_pstate_enable_turbo; + rte_power_freq_disable_turbo = power_pstate_disable_turbo; + rte_power_get_capabilities = power_pstate_get_capabilities; + + } else { + RTE_LOG(ERR, POWER, "Invalid Power Management Environment(%d) set\n", + env); + ret = -1; + } + + if (ret == 0) + global_default_env = env; + else { + global_default_env = PM_ENV_NOT_SET; + reset_power_function_ptrs(); + } + + rte_spinlock_unlock(&global_env_cfg_lock); + return ret; +} + +/* Mark the environment as not set and clear every function pointer, under the config lock. */ +void +rte_power_unset_env(void) +{ + rte_spinlock_lock(&global_env_cfg_lock); + global_default_env = PM_ENV_NOT_SET; + reset_power_function_ptrs(); + rte_spinlock_unlock(&global_env_cfg_lock); +} + +/* Return the currently configured environment; the value is read without taking the config lock. */ +enum power_management_env +rte_power_get_env(void) { + return global_default_env; +} + +/* + * Initialise power management for lcore_id. When an environment is already + * configured, its init routine is called directly. Otherwise ACPI cpufreq, + * pstate and KVM VM are tried in that order and the first one whose init + * returns 0 is recorded through rte_power_set_env(). + * Returns the result of the last init routine that was called. + */ +int +rte_power_init(unsigned int lcore_id) +{ + int ret = -1; + + switch (global_default_env) { + case PM_ENV_ACPI_CPUFREQ: + return power_acpi_cpufreq_init(lcore_id); + case PM_ENV_KVM_VM: + return power_kvm_vm_init(lcore_id); + case PM_ENV_PSTATE_CPUFREQ: + return power_pstate_cpufreq_init(lcore_id); + default: + RTE_LOG(INFO, POWER, "Env isn't set yet!\n"); + } + + /* Auto detect Environment */ + RTE_LOG(INFO, POWER, "Attempting to initialise ACPI cpufreq power management...\n"); + ret = power_acpi_cpufreq_init(lcore_id); + if (ret == 0) { + /* NOTE: the result of rte_power_set_env() is not checked here or below */ + rte_power_set_env(PM_ENV_ACPI_CPUFREQ); + goto out; + } + + RTE_LOG(INFO, POWER, "Attempting to initialise PSTAT power management...\n"); + ret = power_pstate_cpufreq_init(lcore_id); + if (ret == 0) { + rte_power_set_env(PM_ENV_PSTATE_CPUFREQ); + goto out; + } + + RTE_LOG(INFO, POWER, "Attempting to initialise VM power management...\n"); + ret = power_kvm_vm_init(lcore_id); + if (ret == 0) { + rte_power_set_env(PM_ENV_KVM_VM); + goto out; + } + RTE_LOG(ERR, POWER, "Unable to set Power Management Environment for lcore " + "%u\n", lcore_id); +out: + return ret; +} + +int 
+rte_power_exit(unsigned int lcore_id) +{ + switch (global_default_env) { + case PM_ENV_ACPI_CPUFREQ: + return power_acpi_cpufreq_exit(lcore_id); + case PM_ENV_KVM_VM: + return power_kvm_vm_exit(lcore_id); + case PM_ENV_PSTATE_CPUFREQ: + return power_pstate_cpufreq_exit(lcore_id); + default: + RTE_LOG(ERR, POWER, "Environment has not been set, unable to exit gracefully\n"); + + } + return -1; + +} diff --git a/src/spdk/dpdk/lib/librte_power/rte_power.h b/src/spdk/dpdk/lib/librte_power/rte_power.h new file mode 100644 index 000000000..427058b81 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power.h @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef _RTE_POWER_H +#define _RTE_POWER_H + +/** + * @file + * RTE Power Management + */ + +#include <rte_common.h> +#include <rte_byteorder.h> +#include <rte_log.h> +#include <rte_string_fns.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Power Management Environment State */ +enum power_management_env {PM_ENV_NOT_SET, PM_ENV_ACPI_CPUFREQ, PM_ENV_KVM_VM, + PM_ENV_PSTATE_CPUFREQ}; + +/** + * Set the default power management implementation. If this is not called prior + * to rte_power_init(), then auto-detect of the environment will take place. + * It is thread safe. New env can be set only in uninitialized state + * (thus rte_power_unset_env must be called if different env was already set). + * + * @param env + * env. The environment in which to initialise Power Management for. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_set_env(enum power_management_env env); + +/** + * Unset the global environment configuration. + * This can only be called after all threads have completed. + */ +void rte_power_unset_env(void); + +/** + * Get the default power management implementation. + * + * @return + * power_management_env The configured environment. 
+ */ +enum power_management_env rte_power_get_env(void); + +/** + * Initialize power management for a specific lcore. If rte_power_set_env() has + * not been called then an auto-detect of the environment will start and + * initialise the corresponding resources. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_init(unsigned int lcore_id); + +/** + * Exit power management on a specific lcore. This will call the environment + * dependent exit function. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +int rte_power_exit(unsigned int lcore_id); + +/** + * Get the available frequencies of a specific lcore. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param freqs + * The buffer array to save the frequencies. + * @param num + * The number of frequencies to get. + * + * @return + * The number of available frequencies. + */ +typedef uint32_t (*rte_power_freqs_t)(unsigned int lcore_id, uint32_t *freqs, + uint32_t num); + +extern rte_power_freqs_t rte_power_freqs; + +/** + * Return the current index of available frequencies of a specific lcore. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * The current index of available frequencies. + */ +typedef uint32_t (*rte_power_get_freq_t)(unsigned int lcore_id); + +extern rte_power_get_freq_t rte_power_get_freq; + +/** + * Set the new frequency for a specific lcore by indicating the index of + * available frequencies. + * Function pointer definition. Review each environments + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param index + * The index of available frequencies. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. 
+ * - Negative on error. + */ +typedef int (*rte_power_set_freq_t)(unsigned int lcore_id, uint32_t index); + +extern rte_power_set_freq_t rte_power_set_freq; + +/** + * Function pointer definition for generic frequency change functions. Review + * each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +typedef int (*rte_power_freq_change_t)(unsigned int lcore_id); + +/** + * Scale up the frequency of a specific lcore according to the available + * frequencies. + * Review each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_up; + +/** + * Scale down the frequency of a specific lcore according to the available + * frequencies. + * Review each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ + +extern rte_power_freq_change_t rte_power_freq_down; + +/** + * Scale up the frequency of a specific lcore to the highest according to the + * available frequencies. + * Review each environments specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. + * - 0 on success without frequency changed. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_max; + +/** + * Scale down the frequency of a specific lcore to the lowest according to the + * available frequencies. + * Review each environments specific documentation for usage.. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 on success with frequency changed. 
+ * - 0 on success without frequency changed. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_min; + +/** + * Query the Turbo Boost status of a specific lcore. + * Review each environment's specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 1 Turbo Boost is enabled for this lcore. + * - 0 Turbo Boost is disabled for this lcore. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_turbo_status; + +/** + * Enable Turbo Boost for this lcore. + * Review each environment's specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_enable_turbo; + +/** + * Disable Turbo Boost for this lcore. + * Review each environment's specific documentation for usage. + * + * @param lcore_id + * lcore id. + * + * @return + * - 0 on success. + * - Negative on error. + */ +extern rte_power_freq_change_t rte_power_freq_disable_turbo; + +/** + * Power capabilities summary. + */ +struct rte_power_core_capabilities { + RTE_STD_C11 + union { + uint64_t capabilities; /**< All capability flags as a single value. */ + RTE_STD_C11 + struct { + uint64_t turbo:1; /**< Turbo can be enabled. */ + uint64_t priority:1; /**< SST-BF high freq core */ + }; + }; +}; + +/** + * Returns power capabilities for a specific lcore. + * Function pointer definition. Review each environment's + * specific documentation for usage. + * + * @param lcore_id + * lcore id. + * @param caps + * pointer to rte_power_core_capabilities object. + * + * @return + * - 0 on success. + * - Negative on error. 
+ */ +typedef int (*rte_power_get_capabilities_t)(unsigned int lcore_id, + struct rte_power_core_capabilities *caps); + +extern rte_power_get_capabilities_t rte_power_get_capabilities; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/spdk/dpdk/lib/librte_power/rte_power_empty_poll.c b/src/spdk/dpdk/lib/librte_power/rte_power_empty_poll.c new file mode 100644 index 000000000..70c07b153 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power_empty_poll.c @@ -0,0 +1,542 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include <string.h> + +#include <rte_lcore.h> +#include <rte_cycles.h> +#include <rte_atomic.h> +#include <rte_malloc.h> +#include <inttypes.h> + +#include "rte_power.h" +#include "rte_power_empty_poll.h" + +#define INTERVALS_PER_SECOND 100 /* (10ms) */ +#define SECONDS_TO_TRAIN_FOR 2 +#define DEFAULT_MED_TO_HIGH_PERCENT_THRESHOLD 70 +#define DEFAULT_HIGH_TO_MED_PERCENT_THRESHOLD 30 +#define DEFAULT_CYCLES_PER_PACKET 800 + +static struct ep_params *ep_params; +static uint32_t med_to_high_threshold = DEFAULT_MED_TO_HIGH_PERCENT_THRESHOLD; +static uint32_t high_to_med_threshold = DEFAULT_HIGH_TO_MED_PERCENT_THRESHOLD; + +static uint32_t avail_freqs[RTE_MAX_LCORE][NUM_FREQS]; + +static uint32_t total_avail_freqs[RTE_MAX_LCORE]; + +static uint32_t freq_index[NUM_FREQ]; + +static uint32_t +get_freq_index(enum freq_val index) +{ + return freq_index[index]; +} + + +static int +set_power_freq(int lcore_id, enum freq_val freq, bool specific_freq) +{ + int err = 0; + uint32_t power_freq_index; + if (!specific_freq) + power_freq_index = get_freq_index(freq); + else + power_freq_index = freq; + + err = rte_power_set_freq(lcore_id, power_freq_index); + + return err; +} + + +static __rte_always_inline void +exit_training_state(struct priority_worker *poll_stats) +{ + RTE_SET_USED(poll_stats); +} + +static __rte_always_inline void +enter_training_state(struct priority_worker *poll_stats) +{ + 
poll_stats->iter_counter = 0; + poll_stats->cur_freq = LOW; + poll_stats->queue_state = TRAINING; +} + +static __rte_always_inline void +enter_normal_state(struct priority_worker *poll_stats) +{ + /* Clear the averages arrays and strs */ + memset(poll_stats->edpi_av, 0, sizeof(poll_stats->edpi_av)); + poll_stats->ec = 0; + memset(poll_stats->ppi_av, 0, sizeof(poll_stats->ppi_av)); + poll_stats->pc = 0; + + poll_stats->cur_freq = MED; + poll_stats->iter_counter = 0; + poll_stats->threshold_ctr = 0; + poll_stats->queue_state = MED_NORMAL; + RTE_LOG(INFO, POWER, "Set the power freq to MED\n"); + set_power_freq(poll_stats->lcore_id, MED, false); + + poll_stats->thresh[MED].threshold_percent = med_to_high_threshold; + poll_stats->thresh[HGH].threshold_percent = high_to_med_threshold; +} + +static __rte_always_inline void +enter_busy_state(struct priority_worker *poll_stats) +{ + memset(poll_stats->edpi_av, 0, sizeof(poll_stats->edpi_av)); + poll_stats->ec = 0; + memset(poll_stats->ppi_av, 0, sizeof(poll_stats->ppi_av)); + poll_stats->pc = 0; + + poll_stats->cur_freq = HGH; + poll_stats->iter_counter = 0; + poll_stats->threshold_ctr = 0; + poll_stats->queue_state = HGH_BUSY; + set_power_freq(poll_stats->lcore_id, HGH, false); +} + +static __rte_always_inline void +enter_purge_state(struct priority_worker *poll_stats) +{ + poll_stats->iter_counter = 0; + poll_stats->queue_state = LOW_PURGE; +} + +static __rte_always_inline void +set_state(struct priority_worker *poll_stats, + enum queue_state new_state) +{ + enum queue_state old_state = poll_stats->queue_state; + if (old_state != new_state) { + + /* Call any old state exit functions */ + if (old_state == TRAINING) + exit_training_state(poll_stats); + + /* Call any new state entry functions */ + if (new_state == TRAINING) + enter_training_state(poll_stats); + if (new_state == MED_NORMAL) + enter_normal_state(poll_stats); + if (new_state == HGH_BUSY) + enter_busy_state(poll_stats); + if (new_state == LOW_PURGE) + 
enter_purge_state(poll_stats); + } +} + +/* + * Apply the application supplied policy to one worker: switch it to the + * requested queue state and, unless training was requested, install the + * supplied base edpi values for the MED and HGH frequencies and mark both + * as trained. + */ +static __rte_always_inline void +set_policy(struct priority_worker *poll_stats, + struct ep_policy *policy) +{ + set_state(poll_stats, policy->state); + + if (policy->state == TRAINING) + return; + + /* thresh[] is indexed by enum freq_val, not by enum queue_state */ + poll_stats->thresh[MED].base_edpi = policy->med_base_edpi; + poll_stats->thresh[HGH].base_edpi = policy->hgh_base_edpi; + + poll_stats->thresh[MED].trained = true; + poll_stats->thresh[HGH].trained = true; + +} + +/* + * Run one training step for frequency 'freq' on a worker. Nothing happens + * unless the worker is currently training at 'freq' and that frequency is not + * trained yet. The first step applies the frequency and snapshots the empty + * dequeue counter; the following steps (while cur_train_iter <= max_train_iter) + * add the empty dequeue delta of the interval to base_edpi; the last step + * turns the sum into an average, adds a small margin, marks the frequency as + * trained and advances cur_freq to the next frequency. + */ +static void +update_training_stats(struct priority_worker *poll_stats, + uint32_t freq, + bool specific_freq, + uint32_t max_train_iter) +{ + uint64_t p0_empty_deq; + + if (poll_stats->cur_freq == freq && + poll_stats->thresh[freq].trained == false) { + if (poll_stats->thresh[freq].cur_train_iter == 0) { + + set_power_freq(poll_stats->lcore_id, + freq, specific_freq); + + poll_stats->empty_dequeues_prev = + poll_stats->empty_dequeues; + + poll_stats->thresh[freq].cur_train_iter++; + + return; + } else if (poll_stats->thresh[freq].cur_train_iter + <= max_train_iter) { + + p0_empty_deq = poll_stats->empty_dequeues - + poll_stats->empty_dequeues_prev; + + poll_stats->empty_dequeues_prev = + poll_stats->empty_dequeues; + + poll_stats->thresh[freq].base_edpi += p0_empty_deq; + poll_stats->thresh[freq].cur_train_iter++; + + } else { + if (poll_stats->thresh[freq].trained == false) { + poll_stats->thresh[freq].base_edpi = + poll_stats->thresh[freq].base_edpi / + max_train_iter; + + /* Add on a factor of 0.05% + * this should remove any + * false negatives when the system is 0% busy + */ + poll_stats->thresh[freq].base_edpi += + poll_stats->thresh[freq].base_edpi / 2000; + + poll_stats->thresh[freq].trained = true; + poll_stats->cur_freq++; + + } + } + } +} + +static __rte_always_inline uint32_t +update_stats(struct priority_worker *poll_stats) +{ + uint64_t tot_edpi = 0, tot_ppi = 0; + uint32_t j, percent; + + struct priority_worker *s = poll_stats; + + uint64_t 
cur_edpi = s->empty_dequeues - s->empty_dequeues_prev; + + s->empty_dequeues_prev = s->empty_dequeues; + + uint64_t ppi = s->num_dequeue_pkts - s->num_dequeue_pkts_prev; + + s->num_dequeue_pkts_prev = s->num_dequeue_pkts; + + if (s->thresh[s->cur_freq].base_edpi < cur_edpi) { + + /* edpi: empty poll counter difference per interval */ + /* both values are uint64_t, so print them as unsigned */ + RTE_LOG(DEBUG, POWER, "cur_edpi is too large " + "cur edpi %"PRIu64" " + "base edpi %"PRIu64"\n", + cur_edpi, + s->thresh[s->cur_freq].base_edpi); + /* Out-of-range value (> 100) so that the caller skips this interval */ + return 1000UL; + } + + /* Store this interval in the ring of the last BINS_AV samples */ + s->edpi_av[s->ec++ % BINS_AV] = cur_edpi; + s->ppi_av[s->pc++ % BINS_AV] = ppi; + + for (j = 0; j < BINS_AV; j++) { + tot_edpi += s->edpi_av[j]; + tot_ppi += s->ppi_av[j]; + } + + tot_edpi = tot_edpi / BINS_AV; + + /* 100 minus the averaged edpi expressed as a percentage of the base edpi */ + percent = 100 - (uint32_t)(((float)tot_edpi / + (float)s->thresh[s->cur_freq].base_edpi) * 100); + + return (uint32_t)percent; +} + + +/* + * Periodic update for a worker outside of training: compute the percentage + * with update_stats() and count the intervals in which the threshold of the + * current frequency is crossed. The counter is reset whenever the threshold + * is not crossed; once it has reached INTERVALS_PER_SECOND the queue state + * is switched. + */ +static __rte_always_inline void +update_stats_normal(struct priority_worker *poll_stats) +{ + uint32_t percent; + + /* A zero base edpi would be used as a divisor in update_stats() */ + if (poll_stats->thresh[poll_stats->cur_freq].base_edpi == 0) { + + enum freq_val cur_freq = poll_stats->cur_freq; + + /* edpi: empty poll counter difference per interval */ + RTE_LOG(DEBUG, POWER, "cure freq is %d, edpi is %"PRIu64"\n", + cur_freq, + poll_stats->thresh[cur_freq].base_edpi); + return; + } + + percent = update_stats(poll_stats); + + if (percent > 100) { + /* edpi: empty poll counter difference per interval */ + RTE_LOG(DEBUG, POWER, "Edpi is bigger than threshold\n"); + return; + } + + if (poll_stats->cur_freq == LOW) + RTE_LOG(INFO, POWER, "Purge Mode is not currently supported\n"); + else if (poll_stats->cur_freq == MED) { + + if (percent > + poll_stats->thresh[MED].threshold_percent) { + + if (poll_stats->threshold_ctr < INTERVALS_PER_SECOND) + poll_stats->threshold_ctr++; + else { + set_state(poll_stats, HGH_BUSY); + RTE_LOG(INFO, POWER, "MOVE to HGH\n"); + } + + } else { + /* reset */ + poll_stats->threshold_ctr = 0; + } + + } else 
if (poll_stats->cur_freq == HGH) { + + if (percent < + poll_stats->thresh[HGH].threshold_percent) { + + if (poll_stats->threshold_ctr < INTERVALS_PER_SECOND) + poll_stats->threshold_ctr++; + else { + set_state(poll_stats, MED_NORMAL); + RTE_LOG(INFO, POWER, "MOVE to MED\n"); + } + } else { + /* reset */ + poll_stats->threshold_ctr = 0; + } + + } +} + +static int +empty_poll_training(struct priority_worker *poll_stats, + uint32_t max_train_iter) +{ + + if (poll_stats->iter_counter < INTERVALS_PER_SECOND) { + poll_stats->iter_counter++; + return 0; + } + + + update_training_stats(poll_stats, + LOW, + false, + max_train_iter); + + update_training_stats(poll_stats, + MED, + false, + max_train_iter); + + update_training_stats(poll_stats, + HGH, + false, + max_train_iter); + + + if (poll_stats->thresh[LOW].trained == true + && poll_stats->thresh[MED].trained == true + && poll_stats->thresh[HGH].trained == true) { + + set_state(poll_stats, MED_NORMAL); + + RTE_LOG(INFO, POWER, "LOW threshold is %"PRIu64"\n", + poll_stats->thresh[LOW].base_edpi); + + RTE_LOG(INFO, POWER, "MED threshold is %"PRIu64"\n", + poll_stats->thresh[MED].base_edpi); + + + RTE_LOG(INFO, POWER, "HIGH threshold is %"PRIu64"\n", + poll_stats->thresh[HGH].base_edpi); + + RTE_LOG(INFO, POWER, "Training is Complete for %d\n", + poll_stats->lcore_id); + } + + return 0; +} + +void +rte_empty_poll_detection(struct rte_timer *tim, void *arg) +{ + + uint32_t i; + + struct priority_worker *poll_stats; + + RTE_SET_USED(tim); + + RTE_SET_USED(arg); + + for (i = 0; i < NUM_NODES; i++) { + + poll_stats = &(ep_params->wrk_data.wrk_stats[i]); + + if (rte_lcore_is_enabled(poll_stats->lcore_id) == 0) + continue; + + switch (poll_stats->queue_state) { + case(TRAINING): + empty_poll_training(poll_stats, + ep_params->max_train_iter); + break; + + case(HGH_BUSY): + case(MED_NORMAL): + update_stats_normal(poll_stats); + break; + + case(LOW_PURGE): + break; + default: + break; + + } + + } + +} + +int 
+rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb, + struct ep_policy *policy) +{ + uint32_t i; + /* Allocate the ep_params structure */ + ep_params = rte_zmalloc_socket(NULL, + sizeof(struct ep_params), + 0, + rte_socket_id()); + + if (!ep_params) + return -1; + + /* Use the built-in frequency indexes when no mapping table is given */ + if (freq_tlb == NULL) { + freq_index[LOW] = 14; + freq_index[MED] = 9; + freq_index[HGH] = 1; + } else { + freq_index[LOW] = freq_tlb[LOW]; + freq_index[MED] = freq_tlb[MED]; + freq_index[HGH] = freq_tlb[HGH]; + } + + RTE_LOG(INFO, POWER, "Initialize the Empty Poll\n"); + + /* Train for pre-defined period */ + ep_params->max_train_iter = INTERVALS_PER_SECOND * SECONDS_TO_TRAIN_FOR; + + struct stats_data *w = &ep_params->wrk_data; + + *eptr = ep_params; + + /* initialize all wrk_stats state */ + for (i = 0; i < NUM_NODES; i++) { + + if (rte_lcore_is_enabled(i) == 0) + continue; + /*init the freqs table */ + total_avail_freqs[i] = rte_power_freqs(i, + avail_freqs[i], + NUM_FREQS); + + RTE_LOG(INFO, POWER, "total avail freq is %d , lcoreid %d\n", + total_avail_freqs[i], + i); + + if (get_freq_index(LOW) > total_avail_freqs[i]) { + /* + * Release the allocation on failure and do not hand a + * pointer to freed memory back to the caller. + */ + rte_free(ep_params); + ep_params = NULL; + *eptr = NULL; + return -1; + } + + /* The master lcore gets no worker statistics */ + if (rte_get_master_lcore() != i) { + w->wrk_stats[i].lcore_id = i; + set_policy(&w->wrk_stats[i], policy); + } + } + + return 0; +} + +/* Release the empty poll state allocated by rte_power_empty_poll_stat_init(). */ +void +rte_power_empty_poll_stat_free(void) +{ + + RTE_LOG(INFO, POWER, "Close the Empty Poll\n"); + + /* rte_free() accepts NULL; clear the global so no stale pointer remains */ + rte_free(ep_params); + ep_params = NULL; +} + +/* Count one empty poll for lcore_id; returns -1 when lcore_id >= NUM_NODES. */ +int +rte_power_empty_poll_stat_update(unsigned int lcore_id) +{ + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + poll_stats->empty_dequeues++; + + return 0; +} + +int +rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt) +{ + + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if 
(poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + poll_stats->num_dequeue_pkts += nb_pkt; + + return 0; +} + + +/* + * Return the empty poll counter of lcore_id. For lcore_id >= NUM_NODES the + * -1 is converted to the uint64_t return type, i.e. UINT64_MAX is returned. + */ +uint64_t +rte_power_empty_poll_stat_fetch(unsigned int lcore_id) +{ + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + return poll_stats->empty_dequeues; +} + +/* + * Return the dequeued packet counter of lcore_id. For lcore_id >= NUM_NODES + * the -1 is converted to the uint64_t return type, i.e. UINT64_MAX is returned. + */ +uint64_t +rte_power_poll_stat_fetch(unsigned int lcore_id) +{ + struct priority_worker *poll_stats; + + if (lcore_id >= NUM_NODES) + return -1; + + poll_stats = &(ep_params->wrk_data.wrk_stats[lcore_id]); + + if (poll_stats->lcore_id == 0) + poll_stats->lcore_id = lcore_id; + + return poll_stats->num_dequeue_pkts; +} diff --git a/src/spdk/dpdk/lib/librte_power/rte_power_empty_poll.h b/src/spdk/dpdk/lib/librte_power/rte_power_empty_poll.h new file mode 100644 index 000000000..6ba0a3707 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power_empty_poll.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _RTE_EMPTY_POLL_H +#define _RTE_EMPTY_POLL_H + +/** + * @file + * RTE Power Management + */ +#include <stdint.h> +#include <stdbool.h> + +#include <rte_common.h> +#include <rte_byteorder.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_power.h> +#include <rte_timer.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_FREQS RTE_MAX_LCORE_FREQS + +#define BINS_AV 4 /* Has to be a power of 2 */ + +#define DROP (NUM_DIRECTIONS * NUM_DEVICES) + +#define NUM_PRIORITIES 2 + +#define NUM_NODES 256 /* Max core number */ + +/* Processor Power State */ +enum freq_val { + LOW, + MED, + HGH, + NUM_FREQ = NUM_FREQS +}; + + +/* Queue Polling State */ +enum queue_state { + TRAINING, /* NO TRAFFIC */ + MED_NORMAL, /* MED */ + HGH_BUSY, /* HIGH */ + LOW_PURGE, /* LOW */ +}; + +/* Queue Stats */ +struct freq_threshold { + + uint64_t base_edpi; + 
bool trained; + uint32_t threshold_percent; + uint32_t cur_train_iter; +}; + +/* Each Worker Thread Empty Poll Stats */ +struct priority_worker { + + /* Current dequeue and throughput counts */ + /* These 2 are written to by the worker threads */ + /* So keep them on their own cache line */ + uint64_t empty_dequeues; + uint64_t num_dequeue_pkts; + + enum queue_state queue_state; + + uint64_t empty_dequeues_prev; + uint64_t num_dequeue_pkts_prev; + + /* Used for training only */ + struct freq_threshold thresh[NUM_FREQ]; + enum freq_val cur_freq; + + /* bucket arrays to calculate the averages */ + /* edpi: mean empty poll counter difference per interval */ + uint64_t edpi_av[BINS_AV]; + /* empty poll counter */ + uint32_t ec; + /* ppi: mean valid poll counter per interval */ + uint64_t ppi_av[BINS_AV]; + /* valid poll counter */ + uint32_t pc; + + uint32_t lcore_id; /* 0 is treated as unset: the stat update/fetch calls then store the caller's lcore id */ + uint32_t iter_counter; + uint32_t threshold_ctr; + uint32_t display_ctr; + uint8_t dev_id; + +} __rte_cache_aligned; + + +struct stats_data { + + struct priority_worker wrk_stats[NUM_NODES]; /* indexed by lcore id */ + + /* flag to stop rx threads processing packets until training is over */ + bool start_rx; + +}; + +/* Empty Poll Parameters */ +struct ep_params { + + /* Timer related stuff */ + uint64_t interval_ticks; + uint32_t max_train_iter; /* training length in intervals, set by rte_power_empty_poll_stat_init() */ + + struct rte_timer timer0; + struct stats_data wrk_data; +}; + + +/* Sample App Init information */ +struct ep_policy { + + uint64_t med_base_edpi; + uint64_t hgh_base_edpi; + + enum queue_state state; +}; + + + +/** + * Initialize the power management system. + * + * @param eptr + * output: receives a pointer to the newly allocated empty poll + * configuration structure + * @param freq_tlb + * the power state/frequency mapping table, indexed by LOW, MED and HGH; + * when NULL, built-in default indexes are used + * @param policy + * the initialization policy from sample app + * + * @return + * - 0 on success. + * - Negative on error. 
+ */ +__rte_experimental +int +rte_power_empty_poll_stat_init(struct ep_params **eptr, uint8_t *freq_tlb, + struct ep_policy *policy); + +/** + * Free the resources held by the power management system. + */ +__rte_experimental +void +rte_power_empty_poll_stat_free(void); + +/** + * Increment the empty poll counter of a specific core. + * It's not thread safe. + * + * @param lcore_id + * lcore id, must be lower than NUM_NODES + * + * @return + * - 0 on success. + * - Negative on error. + */ +__rte_experimental +int +rte_power_empty_poll_stat_update(unsigned int lcore_id); + +/** + * Add to the valid poll counter of a specific core, not thread safe. + * + * @param lcore_id + * lcore id, must be lower than NUM_NODES + * @param nb_pkt + * The number of packets of one valid poll. + * + * @return + * - 0 on success. + * - Negative on error. + */ +__rte_experimental +int +rte_power_poll_stat_update(unsigned int lcore_id, uint8_t nb_pkt); + +/** + * Fetch specific core empty poll counter. + * + * @param lcore_id + * lcore id, must be lower than NUM_NODES + * + * @return + * Current lcore empty poll counter value. + * An lcore_id that is not lower than NUM_NODES yields (uint64_t)-1. 
+ */ +__rte_experimental +uint64_t +rte_power_empty_poll_stat_fetch(unsigned int lcore_id); + +/** + * Fetch specific core valid poll counter. + * + * @param lcore_id + * lcore id, must be lower than NUM_NODES + * + * @return + * Current lcore valid poll counter value. + * An lcore_id that is not lower than NUM_NODES yields (uint64_t)-1. 
+ */ +__rte_experimental +uint64_t +rte_power_poll_stat_fetch(unsigned int lcore_id); + +/** + * Empty poll state change detection function + * + * @param tim + * The timer structure + * @param arg + * The customized parameter + */ +__rte_experimental +void +rte_empty_poll_detection(struct rte_timer *tim, void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _RTE_EMPTY_POLL_H */ diff --git a/src/spdk/dpdk/lib/librte_power/rte_power_version.map b/src/spdk/dpdk/lib/librte_power/rte_power_version.map new file mode 100644 index 000000000..55a168f56 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_power/rte_power_version.map @@ -0,0 +1,36 @@ +DPDK_20.0 { + global: + + rte_power_exit; + rte_power_freq_disable_turbo; + rte_power_freq_down; + rte_power_freq_enable_turbo; + rte_power_freq_max; + rte_power_freq_min; + rte_power_freq_up; + rte_power_freqs; + rte_power_get_capabilities; + rte_power_get_env; + rte_power_get_freq; + rte_power_guest_channel_send_msg; + rte_power_init; + rte_power_set_env; + rte_power_set_freq; + rte_power_turbo_status; + rte_power_unset_env; + + local: *; +}; + +EXPERIMENTAL { + global: + + rte_empty_poll_detection; + rte_power_empty_poll_stat_fetch; + rte_power_empty_poll_stat_free; + rte_power_empty_poll_stat_init; + rte_power_empty_poll_stat_update; + rte_power_guest_channel_receive_msg; + rte_power_poll_stat_fetch; + rte_power_poll_stat_update; +}; |