diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/seastar/dpdk/lib/librte_eal/linuxapp/eal | |
parent | Initial commit. (diff) | |
download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip |
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/seastar/dpdk/lib/librte_eal/linuxapp/eal')
23 files changed, 9991 insertions, 0 deletions
diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/Makefile b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/Makefile new file mode 100644 index 00000000..640afd08 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/Makefile @@ -0,0 +1,134 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2016 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) +EXPORT_MAP := rte_eal_version.map +VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) + +LIBABIVER := 4 + +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -ldl +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrt + +# specific to linuxapp exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c +ifeq ($(CONFIG_RTE_LIBRTE_XEN_DOM0),y) +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_xen_memory.c +endif +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_uio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_pci_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_vdev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_pci.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_pci_uio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +CFLAGS_eal.o := -D_GNU_SOURCE +CFLAGS_eal_interrupts.o := -D_GNU_SOURCE +CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE +CFLAGS_eal_timer.o := -D_GNU_SOURCE +CFLAGS_eal_lcore.o := -D_GNU_SOURCE +CFLAGS_eal_thread.o := -D_GNU_SOURCE +CFLAGS_eal_log.o := -D_GNU_SOURCE +CFLAGS_eal_common_log.o := -D_GNU_SOURCE +CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE +CFLAGS_eal_pci.o := -D_GNU_SOURCE +CFLAGS_eal_pci_uio.o := -D_GNU_SOURCE +CFLAGS_eal_pci_vfio.o := -D_GNU_SOURCE +CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE +CFLAGS_eal_common_options.o := -D_GNU_SOURCE +CFLAGS_eal_common_thread.o := -D_GNU_SOURCE +CFLAGS_eal_common_lcore.o := -D_GNU_SOURCE + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +endif + +INC := rte_interrupts.h rte_kni_common.h rte_dom0_common.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \ + $(addprefix include/exec-env/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal.c new file mode 100644 index 00000000..7c78f2dc --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal.c @@ -0,0 +1,997 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * Copyright(c) 2012-2014 6WIND S.A. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdarg.h> +#include <unistd.h> +#include <pthread.h> +#include <syslog.h> +#include <getopt.h> +#include <sys/file.h> +#include <fcntl.h> +#include <stddef.h> +#include <errno.h> +#include <limits.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/stat.h> +#if defined(RTE_ARCH_X86) +#include <sys/io.h> +#endif + +#include <rte_common.h> +#include <rte_debug.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_errno.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_random.h> +#include <rte_cycles.h> +#include <rte_string_fns.h> +#include <rte_cpuflags.h> +#include <rte_interrupts.h> +#include <rte_bus.h> +#include <rte_pci.h> +#include <rte_dev.h> +#include <rte_devargs.h> +#include <rte_common.h> +#include <rte_version.h> +#include <rte_atomic.h> +#include <malloc_heap.h> + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" +#include "eal_vfio.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memseg), + .l_len = sizeof(early_mem_config.memseg), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static void +rte_eal_config_create(void) +{ + void *rte_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + /* map the config before hugepage address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE)); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)); + if (retval < 0){ + close(mem_cfg_fd); + rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname); + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary " + "process running?\n", pathname); + } + + rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config), + PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); + + if (rte_mem_cfg_addr == MAP_FAILED){ + rte_panic("Cannot mmap memory for rte_config\n"); + } + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + +} + +/* attach to an existing shared memory config */ +static void +rte_eal_config_attach(void) +{ + struct rte_mem_config *mem_config; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + /* map it as read-only first */ + mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + if (mem_config == MAP_FAILED) + rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + + rte_config.mem_config = mem_config; +} + +/* reattach the shared config at exact memory location primary process has it */ +static void +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) + /* errno is stale, don't use */ + rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--base-virtaddr' option\n", + rte_mem_cfg_addr, mem_config); + else + rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + } + close(mem_cfg_fd); + + rte_config.mem_config = mem_config; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if we can open the file but not get a write-lock we are a secondary + * process. NOTE: if we get a file handle back, we keep that open + * and don't close it to prevent a race condition between multiple opens */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static void +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + rte_eal_config_create(); + break; + case RTE_PROC_SECONDARY: + rte_eal_config_attach(); + rte_eal_mcfg_wait_complete(rte_config.mem_config); + rte_eal_config_reattach(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + rte_panic("Invalid process type\n"); + } +} + +/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ +static void +eal_hugedirs_unlock(void) +{ + int i; + + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + { + /* skip uninitialized */ + if (internal_config.hugepage_info[i].lock_descriptor < 0) + continue; + /* unlock hugepage file */ + flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); + close(internal_config.hugepage_info[i].lock_descriptor); + /* reset the field */ + internal_config.hugepage_info[i].lock_descriptor = -1; + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + printf("EAL Linux options:\n" + " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" + " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" + " --"OPT_BASE_VIRTADDR" Base virtual address\n" + " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" + " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_XEN_DOM0" Support running on Xen dom0 without hugetlbfs\n" + "\n"); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. */ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static int +eal_parse_socket_mem(char *socket_mem) +{ + char * arg[RTE_MAX_NUMA_NODES]; + char *end; + int arg_num, i, len; + uint64_t total_mem = 0; + + len = strnlen(socket_mem, SOCKET_MEM_STRLEN); + if (len == SOCKET_MEM_STRLEN) { + RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); + return -1; + } + + /* all other error cases will be caught later */ + if (!isdigit(socket_mem[len-1])) + return -1; + + /* split the optarg into separate socket values */ + arg_num = rte_strsplit(socket_mem, len, + arg, RTE_MAX_NUMA_NODES, ','); + + /* if split failed, or 0 arguments */ + if (arg_num <= 0) + return -1; + + internal_config.force_sockets = 1; + + /* parse each defined socket option */ + errno = 0; + for (i = 0; i < arg_num; i++) { + end = NULL; + internal_config.socket_mem[i] = strtoull(arg[i], &end, 10); + + /* check for invalid input */ + if ((errno != 0) || + (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + internal_config.socket_mem[i] *= 1024ULL; + internal_config.socket_mem[i] *= 1024ULL; + total_mem += internal_config.socket_mem[i]; + } + + /* check if we have a positive amount of total memory */ + if (total_mem == 0) + return -1; + + return 0; +} + +static int +eal_parse_base_virtaddr(const char *arg) +{ + char *end; + uint64_t addr; + + errno = 0; + addr = strtoull(arg, &end, 16); + + /* check for errors */ + if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) + return -1; + + /* make sure we don't exceed 32-bit boundary on 32-bit target */ +#ifndef RTE_ARCH_64 + if (addr >= UINTPTR_MAX) + return -1; +#endif + + /* align the addr on 16M boundary, 16MB is the minimum huge page + * size on IBM Power architecture. If the addr is aligned to 16MB, + * it can align to 2MB for x86. So this alignment can also be used + * on x86 */ + internal_config.base_virtaddr = + RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); + + return 0; +} + +static int +eal_parse_vfio_intr(const char *mode) +{ + unsigned i; + static struct { + const char *name; + enum rte_intr_mode value; + } map[] = { + { "legacy", RTE_INTR_MODE_LEGACY }, + { "msi", RTE_INTR_MODE_MSI }, + { "msix", RTE_INTR_MODE_MSIX }, + }; + + for (i = 0; i < RTE_DIM(map); i++) { + if (!strcmp(mode, map[i].name)) { + internal_config.vfio_intr_mode = map[i].value; + return 0; + } + } + return -1; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* getopt is not happy, stop right now */ + if (opt == '?') { + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + + /* long options */ + case OPT_XEN_DOM0_NUM: +#ifdef RTE_LIBRTE_XEN_DOM0 + internal_config.xen_dom0_support = 1; +#else + RTE_LOG(ERR, EAL, "Can't support DPDK app " + "running on Dom0, please configure" + " RTE_LIBRTE_XEN_DOM0=y\n"); + ret = -1; + goto out; +#endif + break; + + case OPT_HUGE_DIR_NUM: + internal_config.hugepage_dir = optarg; + break; + + case OPT_FILE_PREFIX_NUM: + internal_config.hugefile_prefix = optarg; + break; + + case OPT_SOCKET_MEM_NUM: + if (eal_parse_socket_mem(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_MEM "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_BASE_VIRTADDR_NUM: + if (eal_parse_base_virtaddr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_BASE_VIRTADDR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_VFIO_INTR_NUM: + if (eal_parse_vfio_intr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_VFIO_INTR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_CREATE_UIO_DEV_NUM: + internal_config.create_uio_dev = 1; + break; + + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Linux\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Linux\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Linux\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + /* --xen-dom0 doesn't make sense with --socket-mem */ + if (internal_config.xen_dom0_support && internal_config.force_sockets == 1) { + RTE_LOG(ERR, EAL, "Options --"OPT_SOCKET_MEM" cannot be specified " + "together with --"OPT_XEN_DOM0"\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; + + return ret; +} + +static void +eal_check_mem_on_local_socket(void) +{ + const struct rte_memseg *ms; + int i, socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + ms = rte_eal_get_physmem_layout(); + + for (i = 0; i < RTE_MAX_MEMSEG; i++) + if (ms[i].socket_id == socket_id && + ms[i].len > 0) + return; + + RTE_LOG(WARNING, EAL, "WARNING: Master core has no " + "memory on local socket!\n"); +} + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + +inline static void +rte_eal_mcfg_complete(void) +{ + /* ALL shared mem_config related INIT DONE */ + if (rte_config.process_type == RTE_PROC_PRIMARY) + rte_config.mem_config->magic = RTE_MAGIC; +} + +/* + * Request iopl privilege for all RPL, returns 0 on success + * iopl() call is mostly for the i386 architecture. For other architectures, + * return -1 to indicate IO privilege can't be changed in this way. + */ +int +rte_eal_iopl_init(void) +{ +#if defined(RTE_ARCH_X86) + if (iopl(3) != 0) + return -1; +#endif + return 0; +} + +#ifdef VFIO_PRESENT +static int rte_eal_vfio_setup(void) +{ + int vfio_enabled = 0; + + if (!internal_config.no_pci) { + pci_vfio_enable(); + vfio_enabled |= pci_vfio_is_enabled(); + } + + if (vfio_enabled) { + + /* if we are primary process, create a thread to communicate with + * secondary processes. the thread will use a socket to wait for + * requests from secondary process to send open file descriptors, + * because VFIO does not allow multiple open descriptors on a group or + * VFIO container. + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_mp_sync_setup() < 0) + return -1; + } + + return 0; +} +#endif + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + const char *logid; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + logid = strrchr(argv[0], '/'); + logid = strdup(logid ? logid + 1: argv[0]); + + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (internal_config.no_hugetlbfs == 0 && + internal_config.process_type != RTE_PROC_SECONDARY && + internal_config.xen_dom0_support == 0 && + eal_hugepage_info_init() < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + rte_srand(rte_rdtsc()); + + rte_config_init(); + + if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { + rte_eal_init_alert("Cannot init logging."); + rte_errno = ENOMEM; + rte_atomic32_clear(&run_once); + return -1; + } + +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) { + rte_eal_init_alert("Cannot init VFIO\n"); + rte_errno = EAGAIN; + rte_atomic32_clear(&run_once); + return -1; + } +#endif + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory\n"); + rte_errno = ENOMEM; + return -1; + } + + /* the directories are locked during eal_hugepage_info_init */ + eal_hugedirs_unlock(); + + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone\n"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects\n"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers\n"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + if (eal_plugins_init() < 0) + rte_eal_init_alert("Cannot init plugins\n"); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n", + rte_config.master_lcore, (int)thread_id, cpuset, + ret == 0 ? "" : "..."); + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread\n"); + return -1; + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices\n"); + rte_errno = ENODEV; + return -1; + } + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + "lcore-slave-%d", i); + ret = rte_thread_setname(lcore_config[i].thread_id, + thread_name); + if (ret != 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for lcore thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices\n"); + rte_errno = ENOTSUP; + return -1; + } + + rte_eal_mcfg_complete(); + + return fctret; +} + +/* get core role */ +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned lcore_id) +{ + return rte_config.lcore_role[lcore_id]; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_hugepages(void) +{ + return ! internal_config.no_hugetlbfs; +} + +int +rte_eal_check_module(const char *module_name) +{ + char sysfs_mod_name[PATH_MAX]; + struct stat st; + int n; + + if (NULL == module_name) + return -1; + + /* Check if there is sysfs mounted */ + if (stat("/sys/module", &st) != 0) { + RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + /* A module might be built-in, therefore try sysfs */ + n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); + if (n < 0 || n > PATH_MAX) { + RTE_LOG(DEBUG, EAL, "Could not format module path\n"); + return -1; + } + + if (stat(sysfs_mod_name, &st) != 0) { + RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", + sysfs_mod_name, errno, strerror(errno)); + return 0; + } + + /* Module has been found */ + return 1; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c new file mode 100644 index 00000000..fbae4613 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_alarm.c @@ -0,0 +1,272 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> +#include <stdint.h> +#include <signal.h> +#include <errno.h> +#include <string.h> +#include <sys/queue.h> +#include <sys/time.h> +#include <sys/timerfd.h> + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_interrupts.h> +#include <rte_alarm.h> +#include <rte_common.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#include <rte_errno.h> +#include <rte_malloc.h> +#include <rte_spinlock.h> +#include <eal_private.h> + +#ifndef TFD_NONBLOCK +#include <fcntl.h> +#define TFD_NONBLOCK O_NONBLOCK +#endif + +#define NS_PER_US 1000 +#define US_PER_MS 1000 +#define MS_PER_S 1000 +#define US_PER_S (US_PER_MS * MS_PER_S) + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct timeval time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static int handler_registered = 0; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + /* create a timerfd file descriptor */ + intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (intr_handle.fd == -1) + goto error; + + return 0; + +error: + rte_errno = errno; + return -1; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + while ((ap = LIST_FIRST(&alarm_list)) !=NULL && + clock_gettime(CLOCK_TYPE_ID, &now) == 0 && + (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && + (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + rte_free(ap); + } + + if (!LIST_EMPTY(&alarm_list)) { + struct itimerspec atime = { .it_interval = { 0, 0 } }; + + ap = LIST_FIRST(&alarm_list); + atime.it_value.tv_sec = ap->time.tv_sec; + atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; + /* perform borrow for subtraction if necessary */ + if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) + atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; + + atime.it_value.tv_sec -= now.tv_sec; + atime.it_value.tv_nsec -= now.tv_nsec; + timerfd_settime(intr_handle.fd, 0, &atime, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); +} + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct timespec now; + int ret = 0; + struct alarm_entry *ap, *new_alarm; + + /* Check parameters, including that us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = rte_zmalloc(NULL, sizeof(*new_alarm), 0); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; + new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + if (!handler_registered) { + ret |= rte_intr_callback_register(&intr_handle, + eal_alarm_callback, NULL); + handler_registered = (ret == 0) ? 1 : 0; + } + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (ap->time.tv_sec > new_alarm->time.tv_sec || + (ap->time.tv_sec == new_alarm->time.tv_sec && + ap->time.tv_usec > new_alarm->time.tv_usec)){ + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + if (LIST_FIRST(&alarm_list) == new_alarm) { + struct itimerspec alarm_time = { + .it_interval = {0, 0}, + .it_value = { + .tv_sec = us / US_PER_S, + .tv_nsec = (us % US_PER_S) * NS_PER_US, + }, + }; + ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while ((ap = LIST_FIRST(&alarm_list)) != NULL && + cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + rte_free(ap); + count++; + } else { + /* If calling from other context, mark that alarm is executing + * so loop can spin till it finish. Otherwise we are trying to + * cancel our self - mark it by EINPROGRESS */ + if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + rte_free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + return count; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c new file mode 100644 index 00000000..e1c75548 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_debug.c @@ -0,0 +1,117 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef RTE_BACKTRACE +#include <execinfo.h> +#endif +#include <stdarg.h> +#include <signal.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> + +#include <rte_log.h> +#include <rte_debug.h> +#include <rte_common.h> + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) +{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c new file mode 100644 index 00000000..7a21e8f6 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c @@ -0,0 +1,368 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <sys/types.h> +#include <sys/file.h> +#include <dirent.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <fnmatch.h> +#include <inttypes.h> +#include <stdarg.h> +#include <unistd.h> +#include <errno.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_debug.h> +#include <rte_log.h> +#include <rte_common.h> +#include "rte_string_fns.h" +#include "eal_internal_cfg.h" +#include "eal_hugepages.h" +#include "eal_filesystem.h" + +static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; + +/* this function is only called from eal_hugepage_info_init which itself + * is only called from a primary process */ +static uint32_t +get_num_hugepages(const char *subdir) +{ + char path[PATH_MAX]; + long unsigned resv_pages, num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + const char *nr_rsvd_file = "resv_hugepages"; + + /* first, check how many reserved pages kernel reports */ + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_rsvd_file); + if (eal_parse_sysfs_value(path, &resv_pages) < 0) + return 0; + + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* adjust num_pages */ + if (num_pages >= resv_pages) + num_pages -= resv_pages; + else if (resv_pages) + num_pages = 0; + + /* we want to return a uint32_t and more than this looks suspicious + * anyway ... */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint64_t +get_default_hp_size(void) +{ + const char proc_meminfo[] = "/proc/meminfo"; + const char str_hugepagesz[] = "Hugepagesize:"; + unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; + char buffer[256]; + unsigned long long size = 0; + + FILE *fd = fopen(proc_meminfo, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_meminfo); + while(fgets(buffer, sizeof(buffer), fd)){ + if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ + size = rte_str_to_size(&buffer[hugepagesz_len]); + break; + } + } + fclose(fd); + if (size == 0) + rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); + return size; +} + +static const char * +get_hugepage_dir(uint64_t hugepage_sz) +{ + enum proc_mount_fieldnames { + DEVICE = 0, + MOUNTPT, + FSTYPE, + OPTIONS, + _FIELDNAME_MAX + }; + static uint64_t default_size = 0; + const char proc_mounts[] = "/proc/mounts"; + const char hugetlbfs_str[] = "hugetlbfs"; + const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; + const char pagesize_opt[] = "pagesize="; + const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; + const char split_tok = ' '; + char *splitstr[_FIELDNAME_MAX]; + char buf[BUFSIZ]; + char *retval = NULL; + + FILE *fd = fopen(proc_mounts, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_mounts); + + if (default_size == 0) + default_size = get_default_hp_size(); + + while (fgets(buf, sizeof(buf), fd)){ + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, + split_tok) != _FIELDNAME_MAX) { + RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); + break; /* return NULL */ + } + + /* we have a specified --huge-dir option, only examine that dir */ + if (internal_config.hugepage_dir != NULL && + strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) + continue; + + if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ + const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); + + /* if no explicit page size, the default page size is compared */ + if (pagesz_str == NULL){ + if (hugepage_sz == default_size){ + retval = strdup(splitstr[MOUNTPT]); + break; + } + } + /* there is an explicit page size, so check it */ + else { + uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); + if (pagesz == hugepage_sz) { + retval = strdup(splitstr[MOUNTPT]); + break; + } + } + } /* end if strncmp hugetlbfs */ + } /* end while fgets */ + + fclose(fd); + return retval; +} + +/* + * Clear the hugepage directory of whatever hugepage files + * there are. Checks if the file is locked (i.e. + * if it's in use by another DPDK process). + */ +static int +clear_hugedir(const char * hugedir) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + const char filter[] = "*map_*"; /* matches hugepage files */ + + /* open directory */ + dir = opendir(hugedir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", + hugedir); + goto error; + } + dir_fd = dirfd(dir); + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", + hugedir); + goto error; + } + + while(dirent != NULL){ + /* skip files that don't match the hugepage pattern */ + if (fnmatch(filter, dirent->d_name, 0) > 0) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, unlock and remove the file */ + if (lck_result != -1) { + flock(fd, LOCK_UN); + unlinkat(dir_fd, dirent->d_name, 0); + } + close (fd); + dirent = readdir(dir); + } + + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", + strerror(errno)); + + return -1; +} + +static int +compare_hpi(const void *a, const void *b) +{ + const struct hugepage_info *hpi_a = a; + const struct hugepage_info *hpi_b = b; + + return hpi_b->hugepage_sz - hpi_a->hugepage_sz; +} + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. + */ +int +eal_hugepage_info_init(void) +{ + const char dirent_start_text[] = "hugepages-"; + const size_t dirent_start_len = sizeof(dirent_start_text) - 1; + unsigned i, num_sizes = 0; + DIR *dir; + struct dirent *dirent; + + dir = opendir(sys_dir_path); + if (dir == NULL) { + RTE_LOG(ERR, EAL, + "Cannot open directory %s to read system hugepage info\n", + sys_dir_path); + return -1; + } + + for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { + struct hugepage_info *hpi; + + if (strncmp(dirent->d_name, dirent_start_text, + dirent_start_len) != 0) + continue; + + if (num_sizes >= MAX_HUGEPAGE_SIZES) + break; + + hpi = &internal_config.hugepage_info[num_sizes]; + hpi->hugepage_sz = + rte_str_to_size(&dirent->d_name[dirent_start_len]); + hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz); + + /* first, check if we have a mountpoint */ + if (hpi->hugedir == NULL) { + uint32_t num_pages; + + num_pages = get_num_hugepages(dirent->d_name); + if (num_pages > 0) + RTE_LOG(NOTICE, EAL, + "%" PRIu32 " hugepages of size " + "%" PRIu64 " reserved, but no mounted " + "hugetlbfs found for that size\n", + num_pages, hpi->hugepage_sz); + continue; + } + + /* try to obtain a writelock */ + hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); + + /* if blocking lock failed */ + if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { + RTE_LOG(CRIT, EAL, + "Failed to lock hugepage directory!\n"); + break; + } + /* clear out the hugepages dir from unused pages */ + if (clear_hugedir(hpi->hugedir) == -1) + break; + + /* for now, put all pages into socket 0, + * later they will be sorted */ + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + + num_sizes++; + } + closedir(dir); + + /* something went wrong, and we broke from the for loop above */ + if (dirent != NULL) + return -1; + + internal_config.num_hugepage_sizes = num_sizes; + + /* sort the page directory entries by size, largest to smallest */ + qsort(&internal_config.hugepage_info[0], num_sizes, + sizeof(internal_config.hugepage_info[0]), compare_hpi); + + /* now we have all info, check we have at least one valid size */ + for (i = 0; i < num_sizes; i++) + if (internal_config.hugepage_info[i].hugedir != NULL && + internal_config.hugepage_info[i].num_pages[0] > 0) + return 0; + + /* no valid hugepage mounts available, return error */ + return -1; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c new file mode 100644 index 00000000..2e3bd12a --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_interrupts.c @@ -0,0 +1,1257 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <pthread.h> +#include <sys/queue.h> +#include <stdarg.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> +#include <sys/epoll.h> +#include <sys/signalfd.h> +#include <sys/ioctl.h> +#include <sys/eventfd.h> +#include <assert.h> +#include <stdbool.h> + +#include <rte_common.h> +#include <rte_interrupts.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_debug.h> +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_spinlock.h> + +#include "eal_private.h" +#include "eal_vfio.h" +#include "eal_thread.h" + +#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) +#define NB_OTHER_INTR 1 + +static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ + +/** + * union for pipe fds. + */ +union intr_pipefds{ + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; +}; + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + int uio_intr_count; /* for uio device */ +#ifdef VFIO_PRESENT + uint64_t vfio_intr_count; /* for vfio device */ +#endif + uint64_t timerfd_num; /* for timerfd */ + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* union buffer for pipe read/write */ +static union intr_pipefds intr_pipe; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +/* VFIO interrupts */ +#ifdef VFIO_PRESENT + +#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) +/* irq set buffer length for queue interrupts and LSC interrupt */ +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) + +/* enable legacy (INTx) interrupts */ +static int +vfio_enable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + /* enable INTx */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* unmask INTx after enabling */ + memset(irq_set, 0, len); + len = sizeof(struct vfio_irq_set); + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable legacy (INTx) interrupts */ +static int +vfio_disable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + /* mask interrupts before disabling */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* disable INTx*/ + memset(irq_set, 0, len); + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, + "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); + return -1; + } + return 0; +} + +/* enable MSI interrupts */ +static int +vfio_enable_msi(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable MSI interrupts */ +static int +vfio_disable_msi(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +/* enable MSI-X interrupts */ +static int +vfio_enable_msix(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ + irq_set->count = intr_handle->max_intr ? + (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? + RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + /* INTR vector offset 0 reserve for non-efds mapping */ + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; + memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, + sizeof(*intr_handle->efds) * intr_handle->nb_efd); + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable MSI-X interrupts */ +static int +vfio_disable_msix(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); + + return ret; +} +#endif + +static int +uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* disable interrupts */ + command_high |= 0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* enable interrupts */ + command_high &= ~0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intr_disable(const struct rte_intr_handle *intr_handle) +{ + const int value = 0; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +static int +uio_intr_enable(const struct rte_intr_handle *intr_handle) +{ + const int value = 1; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + int ret, wake_thread; + struct rte_intr_source *src; + struct rte_intr_callback *callback; + + wake_thread = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + + /* allocate a new interrupt callback entity */ + callback = rte_zmalloc("interrupt callback list", + sizeof(*callback), 0); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if TAILQ_EMPTY(&src->callbacks) + wake_thread = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + if ((src = rte_zmalloc("interrupt source list", + sizeof(*src), 0)) == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + rte_free(callback); + ret = -ENOMEM; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + wake_thread = 1; + ret = 0; + } + } + + rte_spinlock_unlock(&intr_lock); + + /** + * check if need to notify the pipe fd waited by epoll_wait to + * rebuild the wait list. + */ + if (wake_thread) + if (write(intr_pipe.writefd, "1", 1) < 0) + return -EPIPE; + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + ret = 0; + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + rte_free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + rte_free(src); + } + } + + rte_spinlock_unlock(&intr_lock); + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + ret = -EPIPE; + } + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to enable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_enable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_enable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_enable_intx(intr_handle)) + return -1; + break; +#endif + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to disable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_disable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_disable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_disable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_disable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_disable_intx(intr_handle)) + return -1; + break; +#endif + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static int +eal_intr_process_interrupts(struct epoll_event *events, int nfds) +{ + bool call = false; + int n, bytes_read; + struct rte_intr_source *src; + struct rte_intr_callback *cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback active_cb; + + for (n = 0; n < nfds; n++) { + + /** + * if the pipe fd is ready to read, return out to + * rebuild the wait list. + */ + if (events[n].data.fd == intr_pipe.readfd){ + int r = read(intr_pipe.readfd, buf.charbuf, + sizeof(buf.charbuf)); + RTE_SET_USED(r); + return -1; + } + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == + events[n].data.fd) + break; + if (src == NULL){ + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; + case RTE_INTR_HANDLE_ALARM: + bytes_read = sizeof(buf.timerfd_num); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(events[n].data.fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + events[n].data.fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", events[n].data.fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. */ + src->active = 0; + rte_spinlock_unlock(&intr_lock); + } + + return 0; +} + +/** + * It handles all the interrupts. + * + * @param pfd + * epoll file descriptor. + * @param totalfds + * The number of file descriptors added in epoll. + * + * @return + * void + */ +static void +eal_intr_handle_interrupts(int pfd, unsigned totalfds) +{ + struct epoll_event events[totalfds]; + int nfds = 0; + + for(;;) { + nfds = epoll_wait(pfd, events, totalfds, + EAL_INTR_EPOLL_WAIT_FOREVER); + /* epoll_wait fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "epoll_wait returns with fail\n"); + return; + } + /* epoll_wait timeout, will never happens here */ + else if (nfds == 0) + continue; + /* epoll_wait has at least one fd ready to read */ + if (eal_intr_process_interrupts(events, nfds) < 0) + return; + } +} + +/** + * It builds/rebuilds up the epoll file descriptor with all the + * file descriptors being waited on. Then handles the interrupts. + * + * @param arg + * pointer. (unused) + * + * @return + * never return; + */ +static __attribute__((noreturn)) void * +eal_intr_thread_main(__rte_unused void *arg) +{ + struct epoll_event ev; + + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to + * wait on then pass it to the handle_interrupts function + */ + static struct epoll_event pipe_event = { + .events = EPOLLIN | EPOLLPRI, + }; + struct rte_intr_source *src; + unsigned numfds = 0; + + /* create epoll fd */ + int pfd = epoll_create(1); + if (pfd < 0) + rte_panic("Cannot create epoll instance\n"); + + pipe_event.data.fd = intr_pipe.readfd; + /** + * add pipe fd into wait list, this pipe is used to + * rebuild the wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, + &pipe_event) < 0) { + rte_panic("Error adding fd to %d epoll_ctl, %s\n", + intr_pipe.readfd, strerror(errno)); + } + numfds++; + + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ + ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; + ev.data.fd = src->intr_handle.fd; + + /** + * add all the uio device file descriptor + * into wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, + src->intr_handle.fd, &ev) < 0){ + rte_panic("Error adding fd %d epoll_ctl, %s\n", + src->intr_handle.fd, strerror(errno)); + } + else + numfds++; + } + rte_spinlock_unlock(&intr_lock); + /* serve the interrupt */ + eal_intr_handle_interrupts(pfd, numfds); + + /** + * when we return, we need to rebuild the + * list of fds to monitor. + */ + close(pfd); + } +} + +int +rte_eal_intr_init(void) +{ + int ret = 0, ret_1 = 0; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + /** + * create a pipe which will be waited by epoll and notified to + * rebuild the wait list of epoll. + */ + if (pipe(intr_pipe.pipefd) < 0) { + rte_errno = errno; + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = pthread_create(&intr_thread, NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } else { + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, + "eal-intr-thread"); + ret_1 = rte_thread_setname(intr_thread, thread_name); + if (ret_1 != 0) + RTE_LOG(DEBUG, EAL, + "Failed to set thread name for interrupt handling\n"); + } + + return -ret; +} + +static void +eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) +{ + union rte_intr_read_buffer buf; + int bytes_read = 1; + int nbytes; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + /* for vdev, fd points to: + * a. eventfd which does not need to read out; + * b. datapath fd which needs PMD to read out. + */ + return; + case RTE_INTR_HANDLE_EXT: + return; + default: + bytes_read = 1; + RTE_LOG(INFO, EAL, "unexpected intr type\n"); + break; + } + + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + do { + nbytes = read(fd, &buf, bytes_read); + if (nbytes < 0) { + if (errno == EINTR || errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + RTE_LOG(ERR, EAL, + "Error reading from fd %d: %s\n", + fd, strerror(errno)); + } else if (nbytes == 0) + RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); + return; + } while (1); +} + +static int +eal_epoll_process_event(struct epoll_event *evs, unsigned int n, + struct rte_epoll_event *events) +{ + unsigned int i, count = 0; + struct rte_epoll_event *rev; + + for (i = 0; i < n; i++) { + rev = evs[i].data.ptr; + if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, + RTE_EPOLL_EXEC)) + continue; + + events[count].status = RTE_EPOLL_VALID; + events[count].fd = rev->fd; + events[count].epfd = rev->epfd; + events[count].epdata.event = rev->epdata.event; + events[count].epdata.data = rev->epdata.data; + if (rev->epdata.cb_fun) + rev->epdata.cb_fun(rev->fd, + rev->epdata.cb_arg); + + rte_compiler_barrier(); + rev->status = RTE_EPOLL_VALID; + count++; + } + return count; +} + +static inline int +eal_init_tls_epfd(void) +{ + int pfd = epoll_create(255); + + if (pfd < 0) { + RTE_LOG(ERR, EAL, + "Cannot create epoll instance\n"); + return -1; + } + return pfd; +} + +int +rte_intr_tls_epfd(void) +{ + if (RTE_PER_LCORE(_epfd) == -1) + RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); + + return RTE_PER_LCORE(_epfd); +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + struct epoll_event evs[maxevents]; + int rc; + + if (!events) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + while (1) { + rc = epoll_wait(epfd, evs, maxevents, timeout); + if (likely(rc > 0)) { + /* epoll_wait has at least one fd ready to read */ + rc = eal_epoll_process_event(evs, rc, events); + break; + } else if (rc < 0) { + if (errno == EINTR) + continue; + /* epoll_wait fail */ + RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", + strerror(errno)); + rc = -1; + break; + } else { + /* rc == 0, epoll_wait timed out */ + break; + } + } + + return rc; +} + +static inline void +eal_epoll_data_safe_free(struct rte_epoll_event *ev) +{ + while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, + RTE_EPOLL_INVALID)) + while (ev->status != RTE_EPOLL_VALID) + rte_pause(); + memset(&ev->epdata, 0, sizeof(ev->epdata)); + ev->fd = -1; + ev->epfd = -1; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event) +{ + struct epoll_event ev; + + if (!event) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + if (op == EPOLL_CTL_ADD) { + event->status = RTE_EPOLL_VALID; + event->fd = fd; /* ignore fd in event */ + event->epfd = epfd; + ev.data.ptr = (void *)event; + } + + ev.events = event->epdata.event; + if (epoll_ctl(epfd, op, fd, &ev) < 0) { + RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", + op, fd, strerror(errno)); + if (op == EPOLL_CTL_ADD) + /* rollback status when CTL_ADD fail */ + event->status = RTE_EPOLL_INVALID; + return -1; + } + + if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) + eal_epoll_data_safe_free(event); + + return 0; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, + int op, unsigned int vec, void *data) +{ + struct rte_epoll_event *rev; + struct rte_epoll_data *epdata; + int epfd_op; + unsigned int efd_idx; + int rc = 0; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + if (!intr_handle || intr_handle->nb_efd == 0 || + efd_idx >= intr_handle->nb_efd) { + RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); + return -EPERM; + } + + switch (op) { + case RTE_INTR_EVENT_ADD: + epfd_op = EPOLL_CTL_ADD; + rev = &intr_handle->elist[efd_idx]; + if (rev->status != RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event already been added.\n"); + return -EEXIST; + } + + /* attach to intr vector fd */ + epdata = &rev->epdata; + epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; + epdata->data = data; + epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; + epdata->cb_arg = (void *)intr_handle; + rc = rte_epoll_ctl(epfd, epfd_op, + intr_handle->efds[efd_idx], rev); + if (!rc) + RTE_LOG(DEBUG, EAL, + "efd %d associated with vec %d added on epfd %d" + "\n", rev->fd, vec, epfd); + else + rc = -EPERM; + break; + case RTE_INTR_EVENT_DEL: + epfd_op = EPOLL_CTL_DEL; + rev = &intr_handle->elist[efd_idx]; + if (rev->status == RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event does not exist.\n"); + return -EPERM; + } + + rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); + if (rc) + rc = -EPERM; + break; + default: + RTE_LOG(ERR, EAL, "event op type mismatch\n"); + rc = -EPERM; + } + + return rc; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + struct rte_epoll_event *rev; + + for (i = 0; i < intr_handle->nb_efd; i++) { + rev = &intr_handle->elist[i]; + if (rev->status == RTE_EPOLL_INVALID) + continue; + if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { + /* force free if the entry valid */ + eal_epoll_data_safe_free(rev); + rev->status = RTE_EPOLL_INVALID; + } + } +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + uint32_t i; + int fd; + uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + assert(nb_efd != 0); + + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { + for (i = 0; i < n; i++) { + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, + "can't setup eventfd, error %i (%s)\n", + errno, strerror(errno)); + return -errno; + } + intr_handle->efds[i] = fd; + } + intr_handle->nb_efd = n; + intr_handle->max_intr = NB_OTHER_INTR + n; + } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { + /* do nothing, and let vdev driver to initialize this struct */ + } else { + intr_handle->efds[0] = intr_handle->fd; + intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); + intr_handle->max_intr = NB_OTHER_INTR; + } + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->max_intr > intr_handle->nb_efd) { + for (i = 0; i < intr_handle->nb_efd; i++) + close(intr_handle->efds[i]); + } + intr_handle->nb_efd = 0; + intr_handle->max_intr = 0; +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + return !(!intr_handle->nb_efd); +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + if (!rte_intr_dp_is_en(intr_handle)) + return 1; + else + return !!(intr_handle->max_intr - intr_handle->nb_efd); +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) + return 1; + + if (intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 1; + + return 0; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c new file mode 100644 index 00000000..de5b4260 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_lcore.c @@ -0,0 +1,110 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <unistd.h> +#include <limits.h> +#include <string.h> +#include <dirent.h> + +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_thread.h" + +#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" +#define CORE_ID_FILE "topology/core_id" +#define NUMA_NODE_PATH "/sys/devices/system/node" + +/* Check if a cpu is present by the presence of the cpu information for it */ +int +eal_cpu_detected(unsigned lcore_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), SYS_CPU_DIR + "/"CORE_ID_FILE, lcore_id); + if (len <= 0 || (unsigned)len >= sizeof(path)) + return 0; + if (access(path, F_OK) != 0) + return 0; + + return 1; +} + +/* + * Get CPU socket id (NUMA node) for a logical core. + * + * This searches each nodeX directories in /sys for the symlink for the given + * lcore_id and returns the numa node where the lcore is found. If lcore is not + * found on any numa node, returns zero. + */ +unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +/* Get the cpu core id value from the /sys/.../cpuX core_id value */ +unsigned +eal_cpu_core_id(unsigned lcore_id) +{ + char path[PATH_MAX]; + unsigned long id; + + int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); + if (len <= 0 || (unsigned)len >= sizeof(path)) + goto err; + if (eal_parse_sysfs_value(path, &id) != 0) + goto err; + return (unsigned)id; + +err: + RTE_LOG(ERR, EAL, "Error reading core id value from %s " + "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); + return 0; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c new file mode 100644 index 00000000..e3a50aa3 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_log.c @@ -0,0 +1,103 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include <sys/types.h> +#include <syslog.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_spinlock.h> +#include <rte_log.h> + +#include "eal_private.h" + +/* + * default log function + */ +static ssize_t +console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) +{ + char copybuf[BUFSIZ + 1]; + ssize_t ret; + uint32_t loglevel; + + /* write on stdout */ + ret = fwrite(buf, 1, size, stdout); + fflush(stdout); + + /* truncate message if too big (should not happen) */ + if (size > BUFSIZ) + size = BUFSIZ; + + /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ + loglevel = rte_log_cur_msg_loglevel() - 1; + memcpy(copybuf, buf, size); + copybuf[size] = '\0'; + + /* write on syslog too */ + syslog(loglevel, "%s", copybuf); + + return ret; +} + +static cookie_io_functions_t console_log_func = { + .write = console_log_write, +}; + +/* + * set the log to default function, called during eal init process, + * once memzones are available. + */ +int +rte_eal_log_init(const char *id, int facility) +{ + FILE *log_stream; + + log_stream = fopencookie(NULL, "w+", console_log_func); + if (log_stream == NULL) + return -1; + + openlog(id, LOG_NDELAY | LOG_PID, facility); + + eal_log_set_default(log_stream); + + return 0; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c new file mode 100644 index 00000000..ebe06833 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -0,0 +1,1479 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/* BSD LICENSE + * + * Copyright(c) 2013 6WIND. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define _FILE_OFFSET_BITS 64 +#include <errno.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <stdarg.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <unistd.h> +#include <limits.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <signal.h> +#include <setjmp.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" + +#define PFN_MASK_SIZE 8 + +#ifdef RTE_LIBRTE_XEN_DOM0 +int rte_xen_dom0_supported(void) +{ + return internal_config.xen_dom0_support; +} +#endif + +/** + * @file + * Huge page mapping under linux + * + * To reserve a big contiguous amount of memory, we use the hugepage + * feature of linux. For that, we need to have hugetlbfs mounted. This + * code will create many files in this directory (one per page) and + * map them in virtual memory. For each page, we will retrieve its + * physical address and remap it in order to have a virtual contiguous + * zone as well as a physical contiguous zone. + */ + +static uint64_t baseaddr_offset; + +static bool phys_addrs_available = true; + +#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" + +static void +test_phys_addrs_available(void) +{ + uint64_t tmp; + phys_addr_t physaddr; + + /* For dom0, phys addresses can always be available */ + if (rte_xen_dom0_supported()) + return; + + physaddr = rte_mem_virt2phy(&tmp); + if (physaddr == RTE_BAD_PHYS_ADDR) { + RTE_LOG(ERR, EAL, + "Cannot obtain physical addresses: %s. " + "Only vfio will function.\n", + strerror(errno)); + phys_addrs_available = false; + } +} + +/* Lock page in physical memory and prevent from swapping. */ +int +rte_mem_lock_page(const void *virt) +{ + unsigned long virtual = (unsigned long)virt; + int page_size = getpagesize(); + unsigned long aligned = (virtual & ~ (page_size - 1)); + return mlock((void*)aligned, page_size); +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + int fd, retval; + uint64_t page, physaddr; + unsigned long virt_pfn; + int page_size; + off_t offset; + + /* when using dom0, /proc/self/pagemap always returns 0, check in + * dpdk memory by browsing the memsegs */ + if (rte_xen_dom0_supported()) { + struct rte_mem_config *mcfg; + struct rte_memseg *memseg; + unsigned i; + + mcfg = rte_eal_get_configuration()->mem_config; + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + memseg = &mcfg->memseg[i]; + if (memseg->addr == NULL) + break; + if (virtaddr > memseg->addr && + virtaddr < RTE_PTR_ADD(memseg->addr, + memseg->len)) { + return memseg->phys_addr + + RTE_PTR_DIFF(virtaddr, memseg->addr); + } + } + + return RTE_BAD_PHYS_ADDR; + } + + /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ + if (!phys_addrs_available) + return RTE_BAD_PHYS_ADDR; + + /* standard page size */ + page_size = getpagesize(); + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_PHYS_ADDR; + } + + virt_pfn = (unsigned long)virtaddr / page_size; + offset = sizeof(uint64_t) * virt_pfn; + if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { + RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_PHYS_ADDR; + } + + retval = read(fd, &page, PFN_MASK_SIZE); + close(fd); + if (retval < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_PHYS_ADDR; + } else if (retval != PFN_MASK_SIZE) { + RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap " + "but expected %d:\n", + __func__, retval, PFN_MASK_SIZE); + return RTE_BAD_PHYS_ADDR; + } + + /* + * the pfn (page frame number) are bits 0-54 (see + * pagemap.txt in linux Documentation) + */ + if ((page & 0x7fffffffffffffULL) == 0) + return RTE_BAD_PHYS_ADDR; + + physaddr = ((page & 0x7fffffffffffffULL) * page_size) + + ((unsigned long)virtaddr % page_size); + + return physaddr; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value. We find + * it by browsing the /proc/self/pagemap special file. + */ +static int +find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); + if (addr == RTE_BAD_PHYS_ADDR) + return -1; + hugepg_tbl[i].physaddr = addr; + } + return 0; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. + */ +static int +set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + static phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + hugepg_tbl[i].physaddr = addr; + addr += hugepg_tbl[i].size; + } + return 0; +} + +/* + * Check whether address-space layout randomization is enabled in + * the kernel. This is important for multi-process as it can prevent + * two processes mapping data to the same virtual address + * Returns: + * 0 - address space randomization disabled + * 1/2 - address space randomization enabled + * negative error code on error + */ +static int +aslr_enabled(void) +{ + char c; + int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); + if (fd < 0) + return -errno; + retval = read(fd, &c, 1); + close(fd); + if (retval < 0) + return -errno; + if (retval == 0) + return -EIO; + switch (c) { + case '0' : return 0; + case '1' : return 1; + case '2' : return 2; + default: return -EINVAL; + } +} + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. + */ +static void * +get_virtual_area(size_t *size, size_t hugepage_sz) +{ + void *addr; + int fd; + long aligned_addr; + + if (internal_config.base_virtaddr != 0) { + addr = (void*) (uintptr_t) (internal_config.base_virtaddr + + baseaddr_offset); + } + else addr = NULL; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + fd = open("/dev/zero", O_RDONLY); + if (fd < 0){ + RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n"); + return NULL; + } + do { + addr = mmap(addr, + (*size) + hugepage_sz, PROT_READ, +#ifdef RTE_ARCH_PPC_64 + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, +#else + MAP_PRIVATE, +#endif + fd, 0); + if (addr == MAP_FAILED) + *size -= hugepage_sz; + } while (addr == MAP_FAILED && *size > 0); + + if (addr == MAP_FAILED) { + close(fd); + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + return NULL; + } + + munmap(addr, (*size) + hugepage_sz); + close(fd); + + /* align addr to a huge page size boundary */ + aligned_addr = (long)addr; + aligned_addr += (hugepage_sz - 1); + aligned_addr &= (~(hugepage_sz - 1)); + addr = (void *)(aligned_addr); + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + addr, *size); + + /* increment offset */ + baseaddr_offset += *size; + + return addr; +} + +static sigjmp_buf huge_jmpenv; + +static void huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +/* + * Mmap all hugepages of hugepage table: it first open a file in + * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the + * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored + * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to + * map continguous physical blocks in contiguous virtual blocks. + */ +static unsigned +map_all_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, int orig) +{ + int fd; + unsigned i; + void *virtaddr; + void *vma_addr = NULL; + size_t vma_len = 0; + + for (i = 0; i < hpi->num_pages[0]; i++) { + uint64_t hugepage_sz = hpi->hugepage_sz; + + if (orig) { + hugepg_tbl[i].file_id = i; + hugepg_tbl[i].size = hugepage_sz; + eal_get_hugefile_path(hugepg_tbl[i].filepath, + sizeof(hugepg_tbl[i].filepath), hpi->hugedir, + hugepg_tbl[i].file_id); + hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0'; + } +#ifndef RTE_ARCH_64 + /* for 32-bit systems, don't remap 1G and 16G pages, just reuse + * original map address as final map address. + */ + else if ((hugepage_sz == RTE_PGSIZE_1G) + || (hugepage_sz == RTE_PGSIZE_16G)) { + hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va; + hugepg_tbl[i].orig_va = NULL; + continue; + } +#endif + else if (vma_len == 0) { + unsigned j, num_pages; + + /* reserve a virtual area for next contiguous + * physical block: count the number of + * contiguous physical pages. */ + for (j = i+1; j < hpi->num_pages[0] ; j++) { +#ifdef RTE_ARCH_PPC_64 + /* The physical addresses are sorted in + * descending order on PPC64 */ + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr - hugepage_sz) + break; +#else + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr + hugepage_sz) + break; +#endif + } + num_pages = j - i; + vma_len = num_pages * hugepage_sz; + + /* get the biggest virtual memory area up to + * vma_len. If it fails, vma_addr is NULL, so + * let the kernel provide the address. */ + vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz); + if (vma_addr == NULL) + vma_len = hugepage_sz; + } + + /* try to create hugepage file */ + fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + return i; + } + + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros */ + virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (virtaddr == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, + strerror(errno)); + close(fd); + return i; + } + + if (orig) { + hugepg_tbl[i].orig_va = virtaddr; + } + else { + hugepg_tbl[i].final_va = virtaddr; + } + + if (orig) { + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); + return i; + } + *(int *)virtaddr = 0; + } + + + /* set shared flock on the file. */ + if (flock(fd, LOCK_SH | LOCK_NB) == -1) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + return i; + } + + close(fd); + + vma_addr = (char *)vma_addr + hugepage_sz; + vma_len -= hugepage_sz; + } + + return i; +} + +/* Unmap all hugepages from original mapping */ +static int +unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned i; + for (i = 0; i < hpi->num_pages[0]; i++) { + if (hugepg_tbl[i].orig_va) { + munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz); + hugepg_tbl[i].orig_va = NULL; + } + } + return 0; +} + +/* + * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge + * page. + */ +static int +find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int socket_id; + char *end, *nodestr; + unsigned i, hp_count = 0; + uint64_t virt_addr; + char buf[BUFSIZ]; + char hugedir_str[PATH_MAX]; + FILE *f; + + f = fopen("/proc/self/numa_maps", "r"); + if (f == NULL) { + RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps," + " consider that all memory is in socket_id 0\n"); + return 0; + } + + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, internal_config.hugefile_prefix); + + /* parse numa map */ + while (fgets(buf, sizeof(buf), f) != NULL) { + + /* ignore non huge page */ + if (strstr(buf, " huge ") == NULL && + strstr(buf, hugedir_str) == NULL) + continue; + + /* get zone addr */ + virt_addr = strtoull(buf, &end, 16); + if (virt_addr == 0 || end == buf) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* get node id (socket id) */ + nodestr = strstr(buf, " N"); + if (nodestr == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + nodestr += 2; + end = strstr(nodestr, "="); + if (end == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + end[0] = '\0'; + end = NULL; + + socket_id = strtoul(nodestr, &end, 0); + if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* if we find this page in our mappings, set socket_id */ + for (i = 0; i < hpi->num_pages[0]; i++) { + void *va = (void *)(unsigned long)virt_addr; + if (hugepg_tbl[i].orig_va == va) { + hugepg_tbl[i].socket_id = socket_id; + hp_count++; + } + } + } + + if (hp_count < hpi->num_pages[0]) + goto error; + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int +cmp_physaddr(const void *a, const void *b) +{ +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = a; + const struct hugepage_file *p2 = b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = b; + const struct hugepage_file *p2 = a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; +} + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + void *retval; + int fd = open(filename, O_CREAT | O_RDWR, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +/* + * this copies *active* hugepages from one hugepage table to another. + * destination is typically the shared memory. + */ +static int +copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, + const struct hugepage_file * src, int src_size) +{ + int src_pos, dst_pos = 0; + + for (src_pos = 0; src_pos < src_size; src_pos++) { + if (src[src_pos].final_va != NULL) { + /* error on overflow attempt */ + if (dst_pos == dest_size) + return -1; + memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); + dst_pos++; + } + } + return 0; +} + +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->final_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + +/* + * unmaps hugepages that are not going to be used. since we originally allocate + * ALL hugepages (not just those we need), additional unmapping needs to be done. + */ +static int +unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += internal_config.hugepage_info[size].num_pages[socket]; + + for (size = 0; size < num_hp_info; size++) { + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + unsigned pages_found = 0; + + /* traverse until we have unmapped all the unused pages */ + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + /* find a page that matches the criteria */ + if ((hp->size == hpi[size].hugepage_sz) && + (hp->socket_id == (int) socket)) { + + /* if we skipped enough pages, unmap the rest */ + if (pages_found == hpi[size].num_pages[socket]) { + uint64_t unmap_len; + + unmap_len = hp->size; + + /* get start addr and len of the remaining segment */ + munmap(hp->final_va, (size_t) unmap_len); + + hp->final_va = NULL; + if (unlink(hp->filepath) == -1) { + RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + return -1; + } + } else { + /* lock the page and skip */ + pages_found++; + } + + } /* match page */ + } /* foreach page */ + } /* foreach socket */ + } /* foreach pagesize */ + + return 0; +} + +static inline uint64_t +get_socket_mem_size(int socket) +{ + uint64_t size = 0; + unsigned i; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++){ + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (hpi->hugedir != NULL) + size += hpi->hugepage_sz * hpi->num_pages[socket]; + } + + return size; +} + +/* + * This function is a NUMA-aware equivalent of calc_num_pages. + * It takes in the list of hugepage sizes and the + * number of pages thereof, and calculates the best number of + * pages of each size to fulfill the request for <memory> ram + */ +static int +calc_num_pages_per_socket(uint64_t * memory, + struct hugepage_info *hp_info, + struct hugepage_info *hp_used, + unsigned num_hp_info) +{ + unsigned socket, j, i = 0; + unsigned requested, available; + int total_num_pages = 0; + uint64_t remaining_mem, cur_mem; + uint64_t total_mem = internal_config.memory; + + if (num_hp_info == 0) + return -1; + + /* if specific memory amounts per socket weren't requested */ + if (internal_config.force_sockets == 0) { + int cpu_per_socket[RTE_MAX_NUMA_NODES]; + size_t default_size, total_size; + unsigned lcore_id; + + /* Compute number of cores per socket */ + memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); + RTE_LCORE_FOREACH(lcore_id) { + cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; + } + + /* + * Automatically spread requested memory amongst detected sockets according + * to number of cores from cpu mask present on each socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + + /* Set memory amount per socket */ + default_size = (internal_config.memory * cpu_per_socket[socket]) + / rte_lcore_count(); + + /* Limit to maximum available memory on socket */ + default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); + + /* Update sizes */ + memory[socket] = default_size; + total_size -= default_size; + } + + /* + * If some memory is remaining, try to allocate it by getting all + * available memory from sockets, one after the other + */ + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + /* take whatever is available */ + default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], + total_size); + + /* Update sizes */ + memory[socket] += default_size; + total_size -= default_size; + } + } + + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { + /* skips if the memory on specific socket wasn't requested */ + for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ + hp_used[i].hugedir = hp_info[i].hugedir; + hp_used[i].num_pages[socket] = RTE_MIN( + memory[socket] / hp_info[i].hugepage_sz, + hp_info[i].num_pages[socket]); + + cur_mem = hp_used[i].num_pages[socket] * + hp_used[i].hugepage_sz; + + memory[socket] -= cur_mem; + total_mem -= cur_mem; + + total_num_pages += hp_used[i].num_pages[socket]; + + /* check if we have met all memory requests */ + if (memory[socket] == 0) + break; + + /* check if we have any more pages left at this size, if so + * move on to next size */ + if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) + continue; + /* At this point we know that there are more pages available that are + * bigger than the memory we want, so lets see if we can get enough + * from other page sizes. + */ + remaining_mem = 0; + for (j = i+1; j < num_hp_info; j++) + remaining_mem += hp_info[j].hugepage_sz * + hp_info[j].num_pages[socket]; + + /* is there enough other memory, if not allocate another page and quit */ + if (remaining_mem < memory[socket]){ + cur_mem = RTE_MIN(memory[socket], + hp_info[i].hugepage_sz); + memory[socket] -= cur_mem; + total_mem -= cur_mem; + hp_used[i].num_pages[socket]++; + total_num_pages++; + break; /* we are done with this socket*/ + } + } + /* if we didn't satisfy all memory requirements per socket */ + if (memory[socket] > 0) { + /* to prevent icc errors */ + requested = (unsigned) (internal_config.socket_mem[socket] / + 0x100000); + available = requested - + ((unsigned) (memory[socket] / 0x100000)); + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " + "Requested: %uMB, available: %uMB\n", socket, + requested, available); + return -1; + } + } + + /* if we didn't satisfy total memory requirements */ + if (total_mem > 0) { + requested = (unsigned) (internal_config.memory / 0x100000); + available = requested - (unsigned) (total_mem / 0x100000); + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } + return total_num_pages; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (hpi->hugedir != NULL) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +/* + * Prepare physical memory mapping: fill configuration structure with + * these infos, return 0 on success. + * 1. map N huge pages in separate files in hugetlbfs + * 2. find associated physical addr + * 3. find associated NUMA socket ID + * 4. sort all huge pages by physical address + * 5. remap these N huge pages in the correct order + * 6. unmap the first mapping + * 7. fill memsegs in configuration with contiguous zones + */ +int +rte_eal_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; + int i, j, new_memseg; + int nr_hugefiles, nr_hugepages = 0; + void *addr; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr; + mcfg->memseg[0].addr = addr; + mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K; + mcfg->memseg[0].len = internal_config.memory; + mcfg->memseg[0].socket_id = 0; + return 0; + } + +/* check if app runs on Xen Dom0 */ + if (internal_config.xen_dom0_support) { +#ifdef RTE_LIBRTE_XEN_DOM0 + /* use dom0_mm kernel driver to init memory */ + if (rte_xen_dom0_memory_init() < 0) + return -1; + else + return 0; +#endif + } + + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. + */ + tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); + if (tmp_hp == NULL) + goto fail; + + memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); + + hp_offset = 0; /* where we start the current page size entries */ + + huge_register_sigbus(); + + /* map all hugepages and sort them */ + for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + unsigned pages_old, pages_new; + struct hugepage_info *hpi; + + /* + * we don't yet mark hugepages as used at this stage, so + * we just map all hugepages available to the system + * all hugepages are still located on socket 0 + */ + hpi = &internal_config.hugepage_info[i]; + + if (hpi->num_pages[0] == 0) + continue; + + /* map all hugepages available */ + pages_old = hpi->num_pages[0]; + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1); + if (pages_new < pages_old) { + RTE_LOG(DEBUG, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); + + int pages = pages_old - pages_new; + + nr_hugepages -= pages; + hpi->num_pages[0] = pages_new; + if (pages_new == 0) + continue; + } + + if (phys_addrs_available) { + /* find physical addresses for each hugepage */ + if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to find phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } else { + /* set physical addresses for each hugepage */ + if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to set phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } + + if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); + + /* remap all hugepages */ + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) != + hpi->num_pages[0]) { + RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + /* unmap original mappings */ + if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0) + goto fail; + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += hpi->num_pages[0]; + } + + huge_recover_sigbus(); + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + + nr_hugefiles = nr_hugepages; + + + /* clean out the numbers of pages */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + internal_config.hugepage_info[i].num_pages[j] = 0; + + /* get hugepages for each socket */ + for (i = 0; i < nr_hugefiles; i++) { + int socket = tmp_hp[i].socket_id; + + /* find a hugepage info with right size and increment num_pages */ + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { + if (tmp_hp[i].size == + internal_config.hugepage_info[j].hugepage_sz) { + internal_config.hugepage_info[j].num_pages[socket]++; + } + } + } + + /* make a copy of socket_mem, needed for number of pages calculation */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* calculate final number of pages */ + nr_hugepages = calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes); + + /* error if not enough memory available */ + if (nr_hugepages < 0) + goto fail; + + /* reporting in! */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + if (used_hp[i].num_pages[j] > 0) { + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); + } + } + } + + /* create shared memory */ + hugepage = create_shared_memory(eal_hugepage_info_path(), + nr_hugefiles * sizeof(struct hugepage_file)); + + if (hugepage == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + goto fail; + } + memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); + + /* + * unmap pages that we won't need (looks at used_hp). + * also, sets final_va to NULL on pages that were unmapped. + */ + if (unmap_unneeded_hugepages(tmp_hp, used_hp, + internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); + goto fail; + } + + /* + * copy stuff from malloc'd hugepage* to the actual shared memory. + * this procedure only copies those hugepages that have final_va + * not NULL. has overflow protection. + */ + if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, + tmp_hp, nr_hugefiles) < 0) { + RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); + goto fail; + } + + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + + /* free the temporary hugepage table */ + free(tmp_hp); + tmp_hp = NULL; + + /* first memseg index shall be 0 after incrementing it below */ + j = -1; + for (i = 0; i < nr_hugefiles; i++) { + new_memseg = 0; + + /* if this is a new section, create a new memseg */ + if (i == 0) + new_memseg = 1; + else if (hugepage[i].socket_id != hugepage[i-1].socket_id) + new_memseg = 1; + else if (hugepage[i].size != hugepage[i-1].size) + new_memseg = 1; + +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * virtual address to lower address. Here, both the physical + * address and virtual address are in descending order */ + else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) != + hugepage[i].size) + new_memseg = 1; + else if (((unsigned long)hugepage[i-1].final_va - + (unsigned long)hugepage[i].final_va) != hugepage[i].size) + new_memseg = 1; +#else + else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) != + hugepage[i].size) + new_memseg = 1; + else if (((unsigned long)hugepage[i].final_va - + (unsigned long)hugepage[i-1].final_va) != hugepage[i].size) + new_memseg = 1; +#endif + + if (new_memseg) { + j += 1; + if (j == RTE_MAX_MEMSEG) + break; + + mcfg->memseg[j].phys_addr = hugepage[i].physaddr; + mcfg->memseg[j].addr = hugepage[i].final_va; + mcfg->memseg[j].len = hugepage[i].size; + mcfg->memseg[j].socket_id = hugepage[i].socket_id; + mcfg->memseg[j].hugepage_sz = hugepage[i].size; + } + /* continuation of previous memseg */ + else { +#ifdef RTE_ARCH_PPC_64 + /* Use the phy and virt address of the last page as segment + * address for IBM Power architecture */ + mcfg->memseg[j].phys_addr = hugepage[i].physaddr; + mcfg->memseg[j].addr = hugepage[i].final_va; +#endif + mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz; + } + hugepage[i].memseg_id = j; + } + + if (i < nr_hugefiles) { + RTE_LOG(ERR, EAL, "Can only reserve %d pages " + "from %d requested\n" + "Current %s=%d is not enough\n" + "Please either increase it or request less amount " + "of memory.\n", + i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG), + RTE_MAX_MEMSEG); + goto fail; + } + + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + return 0; + +fail: + huge_recover_sigbus(); + free(tmp_hp); + if (hugepage != NULL) + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + return -1; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration and finds the hugepages which form that segment, mapping them + * in order to form a contiguous block in the virtual memory space + */ +int +rte_eal_hugepage_attach(void) +{ + const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct hugepage_file *hp = NULL; + unsigned num_hp = 0; + unsigned i, s = 0; /* s used to track the segment number */ + unsigned max_seg = RTE_MAX_MEMSEG; + off_t size = 0; + int fd, fd_zero = -1, fd_hugepage = -1; + + if (aslr_enabled() > 0) { + RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " + "(ASLR) is enabled in the kernel.\n"); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " + "into secondary processes\n"); + } + + test_phys_addrs_available(); + + if (internal_config.xen_dom0_support) { +#ifdef RTE_LIBRTE_XEN_DOM0 + if (rte_xen_dom0_memory_attach() < 0) { + RTE_LOG(ERR, EAL, "Failed to attach memory segments of primary " + "process\n"); + return -1; + } + return 0; +#endif + } + + fd_zero = open("/dev/zero", O_RDONLY); + if (fd_zero < 0) { + RTE_LOG(ERR, EAL, "Could not open /dev/zero\n"); + goto error; + } + fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path()); + goto error; + } + + /* map all segments into memory to make sure we get the addrs */ + for (s = 0; s < RTE_MAX_MEMSEG; ++s) { + void *base_addr; + + /* + * the first memory segment with len==0 is the one that + * follows the last valid segment. + */ + if (mcfg->memseg[s].len == 0) + break; + + /* + * fdzero is mmapped to get a contiguous block of virtual + * addresses of the appropriate memseg size. + * use mmap to get identical addresses as the primary process. + */ + base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len, + PROT_READ, +#ifdef RTE_ARCH_PPC_64 + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, +#else + MAP_PRIVATE, +#endif + fd_zero, 0); + if (base_addr == MAP_FAILED || + base_addr != mcfg->memseg[s].addr) { + max_seg = s; + if (base_addr != MAP_FAILED) { + /* errno is stale, don't use */ + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " + "in /dev/zero at [%p], got [%p] - " + "please use '--base-virtaddr' option\n", + (unsigned long long)mcfg->memseg[s].len, + mcfg->memseg[s].addr, base_addr); + munmap(base_addr, mcfg->memseg[s].len); + } else { + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " + "in /dev/zero at [%p]: '%s'\n", + (unsigned long long)mcfg->memseg[s].len, + mcfg->memseg[s].addr, strerror(errno)); + } + if (aslr_enabled() > 0) { + RTE_LOG(ERR, EAL, "It is recommended to " + "disable ASLR in the kernel " + "and retry running both primary " + "and secondary processes\n"); + } + goto error; + } + } + + size = getFileSize(fd_hugepage); + hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); + if (hp == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path()); + goto error; + } + + num_hp = size / sizeof(struct hugepage_file); + RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); + + s = 0; + while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){ + void *addr, *base_addr; + uintptr_t offset = 0; + size_t mapping_size; + /* + * free previously mapped memory so we can map the + * hugepages into the space + */ + base_addr = mcfg->memseg[s].addr; + munmap(base_addr, mcfg->memseg[s].len); + + /* find the hugepages for this segment and map them + * we don't need to worry about order, as the server sorted the + * entries before it did the second mmap of them */ + for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){ + if (hp[i].memseg_id == (int)s){ + fd = open(hp[i].filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + hp[i].filepath); + goto error; + } + mapping_size = hp[i].size; + addr = mmap(RTE_PTR_ADD(base_addr, offset), + mapping_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); /* close file both on success and on failure */ + if (addr == MAP_FAILED || + addr != RTE_PTR_ADD(base_addr, offset)) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + hp[i].filepath); + goto error; + } + offset+=mapping_size; + } + } + RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s, + (unsigned long long)mcfg->memseg[s].len); + s++; + } + /* unmap the hugepage config file, since we are done using it */ + munmap(hp, size); + close(fd_zero); + close(fd_hugepage); + return 0; + +error: + for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++) + munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len); + if (hp != NULL && hp != MAP_FAILED) + munmap(hp, size); + if (fd_zero >= 0) + close(fd_zero); + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +bool +rte_eal_using_phys_addrs(void) +{ + return phys_addrs_available; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci.c new file mode 100644 index 00000000..595622b2 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -0,0 +1,723 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <dirent.h> + +#include <rte_log.h> +#include <rte_bus.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_malloc.h> +#include <rte_devargs.h> +#include <rte_memcpy.h> + +#include "eal_filesystem.h" +#include "eal_private.h" +#include "eal_pci_init.h" + +/** + * @file + * PCI probing under linux + * + * This code is used to simulate a PCI probe by parsing information in sysfs. + * When a registered device matches a driver, it is then initialized with + * IGB_UIO driver (or doesn't initialize, if the device wasn't bound to it). + */ + +extern struct rte_pci_bus rte_pci_bus; + +static int +pci_get_kernel_driver_by_path(const char *filename, char *dri_name) +{ + int count; + char path[PATH_MAX]; + char *name; + + if (!filename || !dri_name) + return -1; + + count = readlink(filename, path, PATH_MAX); + if (count >= PATH_MAX) + return -1; + + /* For device does not have a driver */ + if (count < 0) + return 1; + + path[count] = '\0'; + + name = strrchr(path, '/'); + if (name) { + strncpy(dri_name, name + 1, strlen(name + 1) + 1); + return 0; + } + + return -1; +} + +/* Map pci device */ +int +rte_pci_map_device(struct rte_pci_device *dev) +{ + int ret = -1; + + /* try mapping the NIC resources using VFIO if it exists */ + switch (dev->kdrv) { + case RTE_KDRV_VFIO: +#ifdef VFIO_PRESENT + if (pci_vfio_is_enabled()) + ret = pci_vfio_map_resource(dev); +#endif + break; + case RTE_KDRV_IGB_UIO: + case RTE_KDRV_UIO_GENERIC: + if (rte_eal_using_phys_addrs()) { + /* map resources for devices that use uio */ + ret = pci_uio_map_resource(dev); + } + break; + default: + RTE_LOG(DEBUG, EAL, + " Not managed by a supported kernel driver, skipped\n"); + ret = 1; + break; + } + + return ret; +} + +/* Unmap pci device */ +void +rte_pci_unmap_device(struct rte_pci_device *dev) +{ + /* try unmapping the NIC resources using VFIO if it exists */ + switch (dev->kdrv) { + case RTE_KDRV_VFIO: +#ifdef VFIO_PRESENT + if (pci_vfio_is_enabled()) + pci_vfio_unmap_resource(dev); +#endif + break; + case RTE_KDRV_IGB_UIO: + case RTE_KDRV_UIO_GENERIC: + /* unmap resources for devices that use uio */ + pci_uio_unmap_resource(dev); + break; + default: + RTE_LOG(DEBUG, EAL, + " Not managed by a supported kernel driver, skipped\n"); + break; + } +} + +void * +pci_find_max_end_va(void) +{ + const struct rte_memseg *seg = rte_eal_get_physmem_layout(); + const struct rte_memseg *last = seg; + unsigned i = 0; + + for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) { + if (seg->addr == NULL) + break; + + if (seg->addr > last->addr) + last = seg; + + } + return RTE_PTR_ADD(last->addr, last->len); +} + +/* parse one line of the "resource" sysfs file (note that the 'line' + * string is modified) + */ +int +pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr, + uint64_t *end_addr, uint64_t *flags) +{ + union pci_resource_info { + struct { + char *phys_addr; + char *end_addr; + char *flags; + }; + char *ptrs[PCI_RESOURCE_FMT_NVAL]; + } res_info; + + if (rte_strsplit(line, len, res_info.ptrs, 3, ' ') != 3) { + RTE_LOG(ERR, EAL, + "%s(): bad resource format\n", __func__); + return -1; + } + errno = 0; + *phys_addr = strtoull(res_info.phys_addr, NULL, 16); + *end_addr = strtoull(res_info.end_addr, NULL, 16); + *flags = strtoull(res_info.flags, NULL, 16); + if (errno != 0) { + RTE_LOG(ERR, EAL, + "%s(): bad resource format\n", __func__); + return -1; + } + + return 0; +} + +/* parse the "resource" sysfs file */ +static int +pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev) +{ + FILE *f; + char buf[BUFSIZ]; + int i; + uint64_t phys_addr, end_addr, flags; + + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "Cannot open sysfs resource\n"); + return -1; + } + + for (i = 0; i<PCI_MAX_RESOURCE; i++) { + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot read resource\n", __func__); + goto error; + } + if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr, + &end_addr, &flags) < 0) + goto error; + + if (flags & IORESOURCE_MEM) { + dev->mem_resource[i].phys_addr = phys_addr; + dev->mem_resource[i].len = end_addr - phys_addr + 1; + /* not mapped for now */ + dev->mem_resource[i].addr = NULL; + } + } + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +/* Scan one pci sysfs entry, and fill the devices list from it. */ +static int +pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) +{ + char filename[PATH_MAX]; + unsigned long tmp; + struct rte_pci_device *dev; + char driver[PATH_MAX]; + int ret; + + dev = malloc(sizeof(*dev)); + if (dev == NULL) + return -1; + + memset(dev, 0, sizeof(*dev)); + dev->addr = *addr; + + /* get vendor id */ + snprintf(filename, sizeof(filename), "%s/vendor", dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.vendor_id = (uint16_t)tmp; + + /* get device id */ + snprintf(filename, sizeof(filename), "%s/device", dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.device_id = (uint16_t)tmp; + + /* get subsystem_vendor id */ + snprintf(filename, sizeof(filename), "%s/subsystem_vendor", + dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.subsystem_vendor_id = (uint16_t)tmp; + + /* get subsystem_device id */ + snprintf(filename, sizeof(filename), "%s/subsystem_device", + dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->id.subsystem_device_id = (uint16_t)tmp; + + /* get class_id */ + snprintf(filename, sizeof(filename), "%s/class", + dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + /* the least 24 bits are valid: class, subclass, program interface */ + dev->id.class_id = (uint32_t)tmp & RTE_CLASS_ANY_ID; + + /* get max_vfs */ + dev->max_vfs = 0; + snprintf(filename, sizeof(filename), "%s/max_vfs", dirname); + if (!access(filename, F_OK) && + eal_parse_sysfs_value(filename, &tmp) == 0) + dev->max_vfs = (uint16_t)tmp; + else { + /* for non igb_uio driver, need kernel version >= 3.8 */ + snprintf(filename, sizeof(filename), + "%s/sriov_numvfs", dirname); + if (!access(filename, F_OK) && + eal_parse_sysfs_value(filename, &tmp) == 0) + dev->max_vfs = (uint16_t)tmp; + } + + /* get numa node */ + snprintf(filename, sizeof(filename), "%s/numa_node", + dirname); + if (access(filename, R_OK) != 0) { + /* if no NUMA support, set default to 0 */ + dev->device.numa_node = 0; + } else { + if (eal_parse_sysfs_value(filename, &tmp) < 0) { + free(dev); + return -1; + } + dev->device.numa_node = tmp; + } + + rte_pci_device_name(addr, dev->name, sizeof(dev->name)); + dev->device.name = dev->name; + + /* parse resources */ + snprintf(filename, sizeof(filename), "%s/resource", dirname); + if (pci_parse_sysfs_resource(filename, dev) < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__); + free(dev); + return -1; + } + + /* parse driver */ + snprintf(filename, sizeof(filename), "%s/driver", dirname); + ret = pci_get_kernel_driver_by_path(filename, driver); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to get kernel driver\n"); + free(dev); + return -1; + } + + if (!ret) { + if (!strcmp(driver, "vfio-pci")) + dev->kdrv = RTE_KDRV_VFIO; + else if (!strcmp(driver, "igb_uio")) + dev->kdrv = RTE_KDRV_IGB_UIO; + else if (!strcmp(driver, "uio_pci_generic")) + dev->kdrv = RTE_KDRV_UIO_GENERIC; + else + dev->kdrv = RTE_KDRV_UNKNOWN; + } else + dev->kdrv = RTE_KDRV_NONE; + + /* device is valid, add in list (sorted) */ + if (TAILQ_EMPTY(&rte_pci_bus.device_list)) { + rte_pci_add_device(dev); + } else { + struct rte_pci_device *dev2; + int ret; + + TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) { + ret = rte_eal_compare_pci_addr(&dev->addr, &dev2->addr); + if (ret > 0) + continue; + + if (ret < 0) { + rte_pci_insert_device(dev2, dev); + } else { /* already registered */ + dev2->kdrv = dev->kdrv; + dev2->max_vfs = dev->max_vfs; + memmove(dev2->mem_resource, dev->mem_resource, + sizeof(dev->mem_resource)); + free(dev); + } + return 0; + } + + rte_pci_add_device(dev); + } + + return 0; +} + +int +pci_update_device(const struct rte_pci_addr *addr) +{ + char filename[PATH_MAX]; + + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT, + pci_get_sysfs_path(), addr->domain, addr->bus, addr->devid, + addr->function); + + return pci_scan_one(filename, addr); +} + +/* + * split up a pci address into its constituent parts. + */ +static int +parse_pci_addr_format(const char *buf, int bufsize, struct rte_pci_addr *addr) +{ + /* first split on ':' */ + union splitaddr { + struct { + char *domain; + char *bus; + char *devid; + char *function; + }; + char *str[PCI_FMT_NVAL]; /* last element-separator is "." not ":" */ + } splitaddr; + + char *buf_copy = strndup(buf, bufsize); + if (buf_copy == NULL) + return -1; + + if (rte_strsplit(buf_copy, bufsize, splitaddr.str, PCI_FMT_NVAL, ':') + != PCI_FMT_NVAL - 1) + goto error; + /* final split is on '.' between devid and function */ + splitaddr.function = strchr(splitaddr.devid,'.'); + if (splitaddr.function == NULL) + goto error; + *splitaddr.function++ = '\0'; + + /* now convert to int values */ + errno = 0; + addr->domain = (uint16_t)strtoul(splitaddr.domain, NULL, 16); + addr->bus = (uint8_t)strtoul(splitaddr.bus, NULL, 16); + addr->devid = (uint8_t)strtoul(splitaddr.devid, NULL, 16); + addr->function = (uint8_t)strtoul(splitaddr.function, NULL, 10); + if (errno != 0) + goto error; + + free(buf_copy); /* free the copy made with strdup */ + return 0; +error: + free(buf_copy); + return -1; +} + +/* + * Scan the content of the PCI bus, and the devices in the devices + * list + */ +int +rte_pci_scan(void) +{ + struct dirent *e; + DIR *dir; + char dirname[PATH_MAX]; + struct rte_pci_addr addr; + + /* for debug purposes, PCI can be disabled */ + if (internal_config.no_pci) + return 0; + + dir = opendir(pci_get_sysfs_path()); + if (dir == NULL) { + RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n", + __func__, strerror(errno)); + return -1; + } + + while ((e = readdir(dir)) != NULL) { + if (e->d_name[0] == '.') + continue; + + if (parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr) != 0) + continue; + + snprintf(dirname, sizeof(dirname), "%s/%s", + pci_get_sysfs_path(), e->d_name); + + if (pci_scan_one(dirname, &addr) < 0) + goto error; + } + closedir(dir); + return 0; + +error: + closedir(dir); + return -1; +} + +/* Read PCI config space. */ +int rte_pci_read_config(const struct rte_pci_device *device, + void *buf, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &device->intr_handle; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + return pci_uio_read_config(intr_handle, buf, len, offset); + +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + return pci_vfio_read_config(intr_handle, buf, len, offset); +#endif + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } +} + +/* Write PCI config space. */ +int rte_pci_write_config(const struct rte_pci_device *device, + const void *buf, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &device->intr_handle; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + return pci_uio_write_config(intr_handle, buf, len, offset); + +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + return pci_vfio_write_config(intr_handle, buf, len, offset); +#endif + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } +} + +#if defined(RTE_ARCH_X86) +static int +pci_ioport_map(struct rte_pci_device *dev, int bar __rte_unused, + struct rte_pci_ioport *p) +{ + uint16_t start, end; + FILE *fp; + char *line = NULL; + char pci_id[16]; + int found = 0; + size_t linesz; + + snprintf(pci_id, sizeof(pci_id), PCI_PRI_FMT, + dev->addr.domain, dev->addr.bus, + dev->addr.devid, dev->addr.function); + + fp = fopen("/proc/ioports", "r"); + if (fp == NULL) { + RTE_LOG(ERR, EAL, "%s(): can't open ioports\n", __func__); + return -1; + } + + while (getdelim(&line, &linesz, '\n', fp) > 0) { + char *ptr = line; + char *left; + int n; + + n = strcspn(ptr, ":"); + ptr[n] = 0; + left = &ptr[n + 1]; + + while (*left && isspace(*left)) + left++; + + if (!strncmp(left, pci_id, strlen(pci_id))) { + found = 1; + + while (*ptr && isspace(*ptr)) + ptr++; + + sscanf(ptr, "%04hx-%04hx", &start, &end); + + break; + } + } + + free(line); + fclose(fp); + + if (!found) + return -1; + + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + p->base = start; + RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%x\n", start); + + return 0; +} +#endif + +int +rte_pci_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + int ret = -1; + + switch (dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + if (pci_vfio_is_enabled()) + ret = pci_vfio_ioport_map(dev, bar, p); + break; +#endif + case RTE_KDRV_IGB_UIO: + ret = pci_uio_ioport_map(dev, bar, p); + break; + case RTE_KDRV_UIO_GENERIC: +#if defined(RTE_ARCH_X86) + ret = pci_ioport_map(dev, bar, p); +#else + ret = pci_uio_ioport_map(dev, bar, p); +#endif + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + ret = pci_ioport_map(dev, bar, p); +#endif + break; + default: + break; + } + + if (!ret) + p->dev = dev; + + return ret; +} + +void +rte_pci_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ + switch (p->dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + pci_vfio_ioport_read(p, data, len, offset); + break; +#endif + case RTE_KDRV_IGB_UIO: + pci_uio_ioport_read(p, data, len, offset); + break; + case RTE_KDRV_UIO_GENERIC: + pci_uio_ioport_read(p, data, len, offset); + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + pci_uio_ioport_read(p, data, len, offset); +#endif + break; + default: + break; + } +} + +void +rte_pci_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ + switch (p->dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + pci_vfio_ioport_write(p, data, len, offset); + break; +#endif + case RTE_KDRV_IGB_UIO: + pci_uio_ioport_write(p, data, len, offset); + break; + case RTE_KDRV_UIO_GENERIC: + pci_uio_ioport_write(p, data, len, offset); + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + pci_uio_ioport_write(p, data, len, offset); +#endif + break; + default: + break; + } +} + +int +rte_pci_ioport_unmap(struct rte_pci_ioport *p) +{ + int ret = -1; + + switch (p->dev->kdrv) { +#ifdef VFIO_PRESENT + case RTE_KDRV_VFIO: + if (pci_vfio_is_enabled()) + ret = pci_vfio_ioport_unmap(p); + break; +#endif + case RTE_KDRV_IGB_UIO: + ret = pci_uio_ioport_unmap(p); + break; + case RTE_KDRV_UIO_GENERIC: +#if defined(RTE_ARCH_X86) + ret = 0; +#else + ret = pci_uio_ioport_unmap(p); +#endif + break; + case RTE_KDRV_NONE: +#if defined(RTE_ARCH_X86) + ret = 0; +#endif + break; + default: + break; + } + + return ret; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_init.h new file mode 100644 index 00000000..ae2980d6 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_init.h @@ -0,0 +1,97 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef EAL_PCI_INIT_H_ +#define EAL_PCI_INIT_H_ + +#include "eal_vfio.h" + +/** IO resource type: */ +#define IORESOURCE_IO 0x00000100 +#define IORESOURCE_MEM 0x00000200 + +/* + * Helper function to map PCI resources right after hugepages in virtual memory + */ +extern void *pci_map_addr; +void *pci_find_max_end_va(void); + +/* parse one line of the "resource" sysfs file (note that the 'line' + * string is modified) + */ +int pci_parse_one_sysfs_resource(char *line, size_t len, uint64_t *phys_addr, + uint64_t *end_addr, uint64_t *flags); + +int pci_uio_alloc_resource(struct rte_pci_device *dev, + struct mapped_pci_resource **uio_res); +void pci_uio_free_resource(struct rte_pci_device *dev, + struct mapped_pci_resource *uio_res); +int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx, + struct mapped_pci_resource *uio_res, int map_idx); + +int pci_uio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs); +int pci_uio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs); + +int pci_uio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p); +void pci_uio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset); +void pci_uio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset); +int pci_uio_ioport_unmap(struct rte_pci_ioport *p); + +#ifdef VFIO_PRESENT + +/* access config space */ +int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs); +int pci_vfio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs); + +int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p); +void pci_vfio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset); +void pci_vfio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset); +int pci_vfio_ioport_unmap(struct rte_pci_ioport *p); + +/* map/unmap VFIO resource prototype */ +int pci_vfio_map_resource(struct rte_pci_device *dev); +int pci_vfio_unmap_resource(struct rte_pci_device *dev); + +#endif + +#endif /* EAL_PCI_INIT_H_ */ diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_uio.c new file mode 100644 index 00000000..fa10329f --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_uio.c @@ -0,0 +1,567 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <dirent.h> +#include <inttypes.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/sysmacros.h> +#include <linux/pci_regs.h> + +#if defined(RTE_ARCH_X86) +#include <sys/io.h> +#endif + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_common.h> +#include <rte_malloc.h> + +#include "eal_filesystem.h" +#include "eal_pci_init.h" + +void *pci_map_addr = NULL; + +#define OFF_MAX ((uint64_t)(off_t)-1) + +int +pci_uio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offset) +{ + return pread(intr_handle->uio_cfg_fd, buf, len, offset); +} + +int +pci_uio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offset) +{ + return pwrite(intr_handle->uio_cfg_fd, buf, len, offset); +} + +static int +pci_uio_set_bus_master(int dev_fd) +{ + uint16_t reg; + int ret; + + ret = pread(dev_fd, ®, sizeof(reg), PCI_COMMAND); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, + "Cannot read command from PCI config space!\n"); + return -1; + } + + /* return if bus mastering is already on */ + if (reg & PCI_COMMAND_MASTER) + return 0; + + reg |= PCI_COMMAND_MASTER; + + ret = pwrite(dev_fd, ®, sizeof(reg), PCI_COMMAND); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, + "Cannot write command to PCI config space!\n"); + return -1; + } + + return 0; +} + +static int +pci_mknod_uio_dev(const char *sysfs_uio_path, unsigned uio_num) +{ + FILE *f; + char filename[PATH_MAX]; + int ret; + unsigned major, minor; + dev_t dev; + + /* get the name of the sysfs file that contains the major and minor + * of the uio device and read its content */ + snprintf(filename, sizeof(filename), "%s/dev", sysfs_uio_path); + + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n", + __func__); + return -1; + } + + ret = fscanf(f, "%u:%u", &major, &minor); + if (ret != 2) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n", + __func__); + fclose(f); + return -1; + } + fclose(f); + + /* create the char device "mknod /dev/uioX c major minor" */ + snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num); + dev = makedev(major, minor); + ret = mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "%s(): mknod() failed %s\n", + __func__, strerror(errno)); + return -1; + } + + return ret; +} + +/* + * Return the uioX char device used for a pci device. On success, return + * the UIO number and fill dstbuf string with the path of the device in + * sysfs. On error, return a negative value. In this case dstbuf is + * invalid. + */ +static int +pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf, + unsigned int buflen, int create) +{ + struct rte_pci_addr *loc = &dev->addr; + unsigned int uio_num; + struct dirent *e; + DIR *dir; + char dirname[PATH_MAX]; + + /* depending on kernel version, uio can be located in uio/uioX + * or uio:uioX */ + + snprintf(dirname, sizeof(dirname), + "%s/" PCI_PRI_FMT "/uio", pci_get_sysfs_path(), + loc->domain, loc->bus, loc->devid, loc->function); + + dir = opendir(dirname); + if (dir == NULL) { + /* retry with the parent directory */ + snprintf(dirname, sizeof(dirname), + "%s/" PCI_PRI_FMT, pci_get_sysfs_path(), + loc->domain, loc->bus, loc->devid, loc->function); + dir = opendir(dirname); + + if (dir == NULL) { + RTE_LOG(ERR, EAL, "Cannot opendir %s\n", dirname); + return -1; + } + } + + /* take the first file starting with "uio" */ + while ((e = readdir(dir)) != NULL) { + /* format could be uio%d ...*/ + int shortprefix_len = sizeof("uio") - 1; + /* ... or uio:uio%d */ + int longprefix_len = sizeof("uio:uio") - 1; + char *endptr; + + if (strncmp(e->d_name, "uio", 3) != 0) + continue; + + /* first try uio%d */ + errno = 0; + uio_num = strtoull(e->d_name + shortprefix_len, &endptr, 10); + if (errno == 0 && endptr != (e->d_name + shortprefix_len)) { + snprintf(dstbuf, buflen, "%s/uio%u", dirname, uio_num); + break; + } + + /* then try uio:uio%d */ + errno = 0; + uio_num = strtoull(e->d_name + longprefix_len, &endptr, 10); + if (errno == 0 && endptr != (e->d_name + longprefix_len)) { + snprintf(dstbuf, buflen, "%s/uio:uio%u", dirname, uio_num); + break; + } + } + closedir(dir); + + /* No uio resource found */ + if (e == NULL) + return -1; + + /* create uio device if we've been asked to */ + if (internal_config.create_uio_dev && create && + pci_mknod_uio_dev(dstbuf, uio_num) < 0) + RTE_LOG(WARNING, EAL, "Cannot create /dev/uio%u\n", uio_num); + + return uio_num; +} + +void +pci_uio_free_resource(struct rte_pci_device *dev, + struct mapped_pci_resource *uio_res) +{ + rte_free(uio_res); + + if (dev->intr_handle.uio_cfg_fd >= 0) { + close(dev->intr_handle.uio_cfg_fd); + dev->intr_handle.uio_cfg_fd = -1; + } + if (dev->intr_handle.fd >= 0) { + close(dev->intr_handle.fd); + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + } +} + +int +pci_uio_alloc_resource(struct rte_pci_device *dev, + struct mapped_pci_resource **uio_res) +{ + char dirname[PATH_MAX]; + char cfgname[PATH_MAX]; + char devname[PATH_MAX]; /* contains the /dev/uioX */ + int uio_num; + struct rte_pci_addr *loc; + + loc = &dev->addr; + + /* find uio resource */ + uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1); + if (uio_num < 0) { + RTE_LOG(WARNING, EAL, " "PCI_PRI_FMT" not managed by UIO driver, " + "skipping\n", loc->domain, loc->bus, loc->devid, loc->function); + return 1; + } + snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num); + + /* save fd if in primary process */ + dev->intr_handle.fd = open(devname, O_RDWR); + if (dev->intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + devname, strerror(errno)); + goto error; + } + + snprintf(cfgname, sizeof(cfgname), + "/sys/class/uio/uio%u/device/config", uio_num); + dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR); + if (dev->intr_handle.uio_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + cfgname, strerror(errno)); + goto error; + } + + if (dev->kdrv == RTE_KDRV_IGB_UIO) + dev->intr_handle.type = RTE_INTR_HANDLE_UIO; + else { + dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX; + + /* set bus master that is not done by uio_pci_generic */ + if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) { + RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); + goto error; + } + } + + /* allocate the mapping details for secondary processes*/ + *uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0); + if (*uio_res == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot store uio mmap details\n", __func__); + goto error; + } + + snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname); + memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr)); + + return 0; + +error: + pci_uio_free_resource(dev, *uio_res); + return -1; +} + +int +pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx, + struct mapped_pci_resource *uio_res, int map_idx) +{ + int fd; + char devname[PATH_MAX]; + void *mapaddr; + struct rte_pci_addr *loc; + struct pci_map *maps; + + loc = &dev->addr; + maps = uio_res->maps; + + /* update devname for mmap */ + snprintf(devname, sizeof(devname), + "%s/" PCI_PRI_FMT "/resource%d", + pci_get_sysfs_path(), + loc->domain, loc->bus, loc->devid, + loc->function, res_idx); + + /* allocate memory to keep path */ + maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0); + if (maps[map_idx].path == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n", + strerror(errno)); + return -1; + } + + /* + * open resource file, to mmap it + */ + fd = open(devname, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + devname, strerror(errno)); + goto error; + } + + /* try mapping somewhere close to the end of hugepages */ + if (pci_map_addr == NULL) + pci_map_addr = pci_find_max_end_va(); + + mapaddr = pci_map_resource(pci_map_addr, fd, 0, + (size_t)dev->mem_resource[res_idx].len, 0); + close(fd); + if (mapaddr == MAP_FAILED) + goto error; + + pci_map_addr = RTE_PTR_ADD(mapaddr, + (size_t)dev->mem_resource[res_idx].len); + + maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr; + maps[map_idx].size = dev->mem_resource[res_idx].len; + maps[map_idx].addr = mapaddr; + maps[map_idx].offset = 0; + strcpy(maps[map_idx].path, devname); + dev->mem_resource[res_idx].addr = mapaddr; + + return 0; + +error: + rte_free(maps[map_idx].path); + return -1; +} + +#if defined(RTE_ARCH_X86) +int +pci_uio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + char dirname[PATH_MAX]; + char filename[PATH_MAX]; + int uio_num; + unsigned long start; + + uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 0); + if (uio_num < 0) + return -1; + + /* get portio start */ + snprintf(filename, sizeof(filename), + "%s/portio/port%d/start", dirname, bar); + if (eal_parse_sysfs_value(filename, &start) < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n", + __func__); + return -1; + } + /* ensure we don't get anything funny here, read/write will cast to + * uin16_t */ + if (start > UINT16_MAX) + return -1; + + /* FIXME only for primary process ? */ + if (dev->intr_handle.type == RTE_INTR_HANDLE_UNKNOWN) { + + snprintf(filename, sizeof(filename), "/dev/uio%u", uio_num); + dev->intr_handle.fd = open(filename, O_RDWR); + if (dev->intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", + filename, strerror(errno)); + return -1; + } + dev->intr_handle.type = RTE_INTR_HANDLE_UIO; + } + + RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx\n", start); + + p->base = start; + p->len = 0; + return 0; +} +#else +int +pci_uio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + FILE *f; + char buf[BUFSIZ]; + char filename[PATH_MAX]; + uint64_t phys_addr, end_addr, flags; + int fd, i; + void *addr; + + /* open and read addresses of the corresponding resource in sysfs */ + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource", + pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus, + dev->addr.devid, dev->addr.function); + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "Cannot open sysfs resource: %s\n", + strerror(errno)); + return -1; + } + for (i = 0; i < bar + 1; i++) { + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "Cannot read sysfs resource\n"); + goto error; + } + } + if (pci_parse_one_sysfs_resource(buf, sizeof(buf), &phys_addr, + &end_addr, &flags) < 0) + goto error; + if ((flags & IORESOURCE_IO) == 0) { + RTE_LOG(ERR, EAL, "BAR %d is not an IO resource\n", bar); + goto error; + } + snprintf(filename, sizeof(filename), "%s/" PCI_PRI_FMT "/resource%d", + pci_get_sysfs_path(), dev->addr.domain, dev->addr.bus, + dev->addr.devid, dev->addr.function, bar); + + /* mmap the pci resource */ + fd = open(filename, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + goto error; + } + addr = mmap(NULL, end_addr + 1, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot mmap IO port resource: %s\n", + strerror(errno)); + goto error; + } + + /* strangely, the base address is mmap addr + phys_addr */ + p->base = (uintptr_t)addr + phys_addr; + p->len = end_addr + 1; + RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%"PRIx64"\n", p->base); + fclose(f); + + return 0; + +error: + fclose(f); + return -1; +} +#endif + +void +pci_uio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ + uint8_t *d; + int size; + uintptr_t reg = p->base + offset; + + for (d = data; len > 0; d += size, reg += size, len -= size) { + if (len >= 4) { + size = 4; +#if defined(RTE_ARCH_X86) + *(uint32_t *)d = inl(reg); +#else + *(uint32_t *)d = *(volatile uint32_t *)reg; +#endif + } else if (len >= 2) { + size = 2; +#if defined(RTE_ARCH_X86) + *(uint16_t *)d = inw(reg); +#else + *(uint16_t *)d = *(volatile uint16_t *)reg; +#endif + } else { + size = 1; +#if defined(RTE_ARCH_X86) + *d = inb(reg); +#else + *d = *(volatile uint8_t *)reg; +#endif + } + } +} + +void +pci_uio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ + const uint8_t *s; + int size; + uintptr_t reg = p->base + offset; + + for (s = data; len > 0; s += size, reg += size, len -= size) { + if (len >= 4) { + size = 4; +#if defined(RTE_ARCH_X86) + outl_p(*(const uint32_t *)s, reg); +#else + *(volatile uint32_t *)reg = *(const uint32_t *)s; +#endif + } else if (len >= 2) { + size = 2; +#if defined(RTE_ARCH_X86) + outw_p(*(const uint16_t *)s, reg); +#else + *(volatile uint16_t *)reg = *(const uint16_t *)s; +#endif + } else { + size = 1; +#if defined(RTE_ARCH_X86) + outb_p(*s, reg); +#else + *(volatile uint8_t *)reg = *s; +#endif + } + } +} + +int +pci_uio_ioport_unmap(struct rte_pci_ioport *p) +{ +#if defined(RTE_ARCH_X86) + RTE_SET_USED(p); + /* FIXME close intr fd ? */ + return 0; +#else + return munmap((void *)(uintptr_t)p->base, p->len); +#endif +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c new file mode 100644 index 00000000..2be13195 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -0,0 +1,674 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <fcntl.h> +#include <linux/pci_regs.h> +#include <sys/eventfd.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <stdbool.h> + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_malloc.h> + +#include "eal_filesystem.h" +#include "eal_pci_init.h" +#include "eal_vfio.h" +#include "eal_private.h" + +/** + * @file + * PCI probing under linux (VFIO version) + * + * This code tries to determine if the PCI device is bound to VFIO driver, + * and initialize it (map BARs, set up interrupts) if that's the case. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +#define PAGE_SIZE (sysconf(_SC_PAGESIZE)) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +static struct rte_tailq_elem rte_vfio_tailq = { + .name = "VFIO_RESOURCE_LIST", +}; +EAL_REGISTER_TAILQ(rte_vfio_tailq) + +int +pci_vfio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs) +{ + return pread64(intr_handle->vfio_dev_fd, buf, len, + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); +} + +int +pci_vfio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs) +{ + return pwrite64(intr_handle->vfio_dev_fd, buf, len, + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); +} + +/* get PCI BAR number where MSI-X interrupts are */ +static int +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, + uint32_t *msix_table_size) +{ + int ret; + uint32_t reg; + uint16_t flags; + uint8_t cap_id, cap_offset; + + /* read PCI capability pointer from config space */ + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_CAPABILITY_LIST); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " + "config space!\n"); + return -1; + } + + /* we need first byte */ + cap_offset = reg & 0xFF; + + while (cap_offset) { + + /* read PCI capability ID */ + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI " + "config space!\n"); + return -1; + } + + /* we need first byte */ + cap_id = reg & 0xFF; + + /* if we haven't reached MSI-X, check next capability */ + if (cap_id != PCI_CAP_ID_MSIX) { + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " + "config space!\n"); + return -1; + } + + /* we need second byte */ + cap_offset = (reg & 0xFF00) >> 8; + + continue; + } + /* else, read table offset */ + else { + /* table offset resides in the next 4 bytes */ + ret = pread64(fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 4); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config " + "space!\n"); + return -1; + } + + ret = pread64(fd, &flags, sizeof(flags), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 2); + if (ret != sizeof(flags)) { + RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " + "space!\n"); + return -1; + } + + *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; + *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; + *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); + + return 0; + } + } + return 0; +} + +/* set PCI bus mastering */ +static int +pci_vfio_set_bus_master(int dev_fd, bool op) +{ + uint16_t reg; + int ret; + + ret = pread64(dev_fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_COMMAND); + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); + return -1; + } + + if (op) + /* set the master bit */ + reg |= PCI_COMMAND_MASTER; + else + reg &= ~(PCI_COMMAND_MASTER); + + ret = pwrite64(dev_fd, ®, sizeof(reg), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_COMMAND); + + if (ret != sizeof(reg)) { + RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); + return -1; + } + + return 0; +} + +/* set up interrupt support (but not enable interrupts) */ +static int +pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) +{ + int i, ret, intr_idx; + + /* default to invalid index */ + intr_idx = VFIO_PCI_NUM_IRQS; + + /* get interrupt type from internal config (MSI-X by default, can be + * overriden from the command line + */ + switch (internal_config.vfio_intr_mode) { + case RTE_INTR_MODE_MSIX: + intr_idx = VFIO_PCI_MSIX_IRQ_INDEX; + break; + case RTE_INTR_MODE_MSI: + intr_idx = VFIO_PCI_MSI_IRQ_INDEX; + break; + case RTE_INTR_MODE_LEGACY: + intr_idx = VFIO_PCI_INTX_IRQ_INDEX; + break; + /* don't do anything if we want to automatically determine interrupt type */ + case RTE_INTR_MODE_NONE: + break; + default: + RTE_LOG(ERR, EAL, " unknown default interrupt type!\n"); + return -1; + } + + /* start from MSI-X interrupt type */ + for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) { + struct vfio_irq_info irq = { .argsz = sizeof(irq) }; + int fd = -1; + + /* skip interrupt modes we don't want */ + if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE && + i != intr_idx) + continue; + + irq.index = i; + + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); + if (ret < 0) { + RTE_LOG(ERR, EAL, " cannot get IRQ info, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* if this vector cannot be used with eventfd, fail if we explicitly + * specified interrupt type, otherwise continue */ + if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) { + if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) { + RTE_LOG(ERR, EAL, + " interrupt vector does not support eventfd!\n"); + return -1; + } else + continue; + } + + /* set up an eventfd for interrupts */ + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, " cannot set up eventfd, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + dev->intr_handle.fd = fd; + dev->intr_handle.vfio_dev_fd = vfio_dev_fd; + + switch (i) { + case VFIO_PCI_MSIX_IRQ_INDEX: + internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX; + dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX; + break; + case VFIO_PCI_MSI_IRQ_INDEX: + internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI; + dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI; + break; + case VFIO_PCI_INTX_IRQ_INDEX: + internal_config.vfio_intr_mode = RTE_INTR_MODE_LEGACY; + dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY; + break; + default: + RTE_LOG(ERR, EAL, " unknown interrupt type!\n"); + return -1; + } + + return 0; + } + + /* if we're here, we haven't found a suitable interrupt vector */ + return -1; +} + +/* + * map the PCI resources of a PCI device in virtual memory (VFIO version). + * primary and secondary processes follow almost exactly the same path + */ +int +pci_vfio_map_resource(struct rte_pci_device *dev) +{ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + char pci_addr[PATH_MAX] = {0}; + int vfio_dev_fd; + struct rte_pci_addr *loc = &dev->addr; + int i, ret, msix_bar; + struct mapped_pci_resource *vfio_res = NULL; + struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + + struct pci_map *maps; + uint32_t msix_table_offset = 0; + uint32_t msix_table_size = 0; + uint32_t ioport_bar; + + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + + /* store PCI address string */ + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + loc->domain, loc->bus, loc->devid, loc->function); + + if ((ret = vfio_setup_device(pci_get_sysfs_path(), pci_addr, + &vfio_dev_fd, &device_info))) + return ret; + + /* get MSI-X BAR, if any (we have to know where it is because we can't + * easily mmap it when using VFIO) */ + msix_bar = -1; + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar, + &msix_table_offset, &msix_table_size); + if (ret < 0) { + RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); + close(vfio_dev_fd); + return -1; + } + + /* if we're in a primary process, allocate vfio_res and get region info */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot store uio mmap details\n", __func__); + close(vfio_dev_fd); + return -1; + } + memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); + + /* get number of registers (up to BAR5) */ + vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, + VFIO_PCI_BAR5_REGION_INDEX + 1); + } else { + /* if we're in a secondary process, just find our tailq entry */ + TAILQ_FOREACH(vfio_res, vfio_res_list, next) { + if (rte_eal_compare_pci_addr(&vfio_res->pci_addr, + &dev->addr)) + continue; + break; + } + /* if we haven't found our tailq entry, something's wrong */ + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", + pci_addr); + close(vfio_dev_fd); + return -1; + } + } + + /* map BARs */ + maps = vfio_res->maps; + + for (i = 0; i < (int) vfio_res->nb_maps; i++) { + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + void *bar_addr; + struct memreg { + unsigned long offset, size; + } memreg[2] = {}; + + reg.index = i; + + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); + + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device region info " + "error %i (%s)\n", pci_addr, errno, strerror(errno)); + close(vfio_dev_fd); + if (internal_config.process_type == RTE_PROC_PRIMARY) + rte_free(vfio_res); + return -1; + } + + /* chk for io port region */ + ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_BASE_ADDRESS_0 + i*4); + + if (ret != sizeof(ioport_bar)) { + RTE_LOG(ERR, EAL, + "Cannot read command (%x) from config space!\n", + PCI_BASE_ADDRESS_0 + i*4); + return -1; + } + + if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) { + RTE_LOG(INFO, EAL, + "Ignore mapping IO port bar(%d) addr: %x\n", + i, ioport_bar); + continue; + } + + /* skip non-mmapable BARs */ + if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) + continue; + + if (i == msix_bar) { + /* + * VFIO will not let us map the MSI-X table, + * but we can map around it. + */ + uint32_t table_start = msix_table_offset; + uint32_t table_end = table_start + msix_table_size; + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; + table_start &= PAGE_MASK; + + if (table_start == 0 && table_end >= reg.size) { + /* Cannot map this BAR */ + RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i); + continue; + } else { + memreg[0].offset = reg.offset; + memreg[0].size = table_start; + memreg[1].offset = reg.offset + table_end; + memreg[1].size = reg.size - table_end; + + RTE_LOG(DEBUG, EAL, + "Trying to map BAR %d that contains the MSI-X " + "table. Trying offsets: " + "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i, + memreg[0].offset, memreg[0].size, + memreg[1].offset, memreg[1].size); + } + } else { + memreg[0].offset = reg.offset; + memreg[0].size = reg.size; + } + + /* try to figure out an address */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try mapping somewhere close to the end of hugepages */ + if (pci_map_addr == NULL) + pci_map_addr = pci_find_max_end_va(); + + bar_addr = pci_map_addr; + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); + } else { + bar_addr = maps[i].addr; + } + + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (bar_addr != MAP_FAILED) { + void *map_addr = NULL; + if (memreg[0].size) { + /* actual map of first part */ + map_addr = pci_map_resource(bar_addr, vfio_dev_fd, + memreg[0].offset, + memreg[0].size, + MAP_FIXED); + } + + /* if there's a second part, try to map it */ + if (map_addr != MAP_FAILED + && memreg[1].offset && memreg[1].size) { + void *second_addr = RTE_PTR_ADD(bar_addr, + memreg[1].offset - + (uintptr_t)reg.offset); + map_addr = pci_map_resource(second_addr, + vfio_dev_fd, memreg[1].offset, + memreg[1].size, + MAP_FIXED); + } + + if (map_addr == MAP_FAILED || !map_addr) { + munmap(bar_addr, reg.size); + bar_addr = MAP_FAILED; + } + } + + if (bar_addr == MAP_FAILED || + (internal_config.process_type == RTE_PROC_SECONDARY && + bar_addr != maps[i].addr)) { + RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", pci_addr, i, + strerror(errno)); + close(vfio_dev_fd); + if (internal_config.process_type == RTE_PROC_PRIMARY) + rte_free(vfio_res); + return -1; + } + + maps[i].addr = bar_addr; + maps[i].offset = reg.offset; + maps[i].size = reg.size; + maps[i].path = NULL; /* vfio doesn't have per-resource paths */ + dev->mem_resource[i].addr = bar_addr; + } + + /* if secondary process, do not set up interrupts */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { + RTE_LOG(ERR, EAL, " %s error setting up interrupts!\n", pci_addr); + close(vfio_dev_fd); + rte_free(vfio_res); + return -1; + } + + /* set bus mastering for the device */ + if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { + RTE_LOG(ERR, EAL, " %s cannot set up bus mastering!\n", pci_addr); + close(vfio_dev_fd); + rte_free(vfio_res); + return -1; + } + + /* Reset the device */ + ioctl(vfio_dev_fd, VFIO_DEVICE_RESET); + } + + if (internal_config.process_type == RTE_PROC_PRIMARY) + TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); + + return 0; +} + +int +pci_vfio_unmap_resource(struct rte_pci_device *dev) +{ + char pci_addr[PATH_MAX] = {0}; + struct rte_pci_addr *loc = &dev->addr; + int i, ret; + struct mapped_pci_resource *vfio_res = NULL; + struct mapped_pci_res_list *vfio_res_list; + + struct pci_map *maps; + + /* store PCI address string */ + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + loc->domain, loc->bus, loc->devid, loc->function); + + + if (close(dev->intr_handle.fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n", + pci_addr); + return -1; + } + + if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) { + RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n", + pci_addr); + return -1; + } + + ret = vfio_release_device(pci_get_sysfs_path(), pci_addr, + dev->intr_handle.vfio_dev_fd); + if (ret < 0) { + RTE_LOG(ERR, EAL, + "%s(): cannot release device\n", __func__); + return ret; + } + + vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + /* Get vfio_res */ + TAILQ_FOREACH(vfio_res, vfio_res_list, next) { + if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr))) + continue; + break; + } + /* if we haven't found our tailq entry, something's wrong */ + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", + pci_addr); + return -1; + } + + /* unmap BARs */ + maps = vfio_res->maps; + + RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n", + pci_addr); + for (i = 0; i < (int) vfio_res->nb_maps; i++) { + + /* + * We do not need to be aware of MSI-X table BAR mappings as + * when mapping. Just using current maps array is enough + */ + if (maps[i].addr) { + RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n", + pci_addr, maps[i].addr); + pci_unmap_resource(maps[i].addr, maps[i].size); + } + } + + TAILQ_REMOVE(vfio_res_list, vfio_res, next); + + return 0; +} + +int +pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + if (bar < VFIO_PCI_BAR0_REGION_INDEX || + bar > VFIO_PCI_BAR5_REGION_INDEX) { + RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar); + return -1; + } + + p->dev = dev; + p->base = VFIO_GET_REGION_ADDR(bar); + return 0; +} + +void +pci_vfio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; + + if (pread64(intr_handle->vfio_dev_fd, data, + len, p->base + offset) <= 0) + RTE_LOG(ERR, EAL, + "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n", + VFIO_GET_REGION_IDX(p->base), (int)offset); +} + +void +pci_vfio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ + const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; + + if (pwrite64(intr_handle->vfio_dev_fd, data, + len, p->base + offset) <= 0) + RTE_LOG(ERR, EAL, + "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n", + VFIO_GET_REGION_IDX(p->base), (int)offset); +} + +int +pci_vfio_ioport_unmap(struct rte_pci_ioport *p) +{ + RTE_SET_USED(p); + return -1; +} + +int +pci_vfio_enable(void) +{ + return vfio_enable("vfio_pci"); +} + +int +pci_vfio_is_enabled(void) +{ + return vfio_is_enabled("vfio_pci"); +} +#endif diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c new file mode 100644 index 00000000..9f88530e --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_thread.c @@ -0,0 +1,212 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <pthread.h> +#include <sched.h> +#include <sys/queue.h> +#include <sys/syscall.h> + +#include <rte_debug.h> +#include <rte_atomic.h> +#include <rte_launch.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_per_lcore.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current EAL thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%x;cpuset=[%s%s])\n", + lcore_id, (int)thread_id, cpuset, ret == 0 ? "" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + return (int)syscall(SYS_gettid); +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + int ret = -1; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_setname_np(id, name); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + return ret; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c new file mode 100644 index 00000000..afa32f5c --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_timer.c @@ -0,0 +1,305 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * Copyright(c) 2012-2013 6WIND S.A. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <fcntl.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <pthread.h> +#include <errno.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +enum timer_source eal_timer_source = EAL_TIMER_HPET; + +#ifdef RTE_LIBEAL_USE_HPET + +#define DEV_HPET "/dev/hpet" + +/* Maximum number of counters. */ +#define HPET_TIMER_NUM 3 + +/* General capabilities register */ +#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ +#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ + +/** + * HPET timer registers. From the Intel IA-PC HPET (High Precision Event + * Timers) Specification. + */ +struct eal_hpet_regs { + /* Memory-mapped, software visible registers */ + uint64_t capabilities; /**< RO General Capabilities Register. */ + uint64_t reserved0; /**< Reserved for future use. */ + uint64_t config; /**< RW General Configuration Register. */ + uint64_t reserved1; /**< Reserved for future use. */ + uint64_t isr; /**< RW Clear General Interrupt Status. */ + uint64_t reserved2[25]; /**< Reserved for future use. */ + union { + uint64_t counter; /**< RW Main Counter Value Register. */ + struct { + uint32_t counter_l; /**< RW Main Counter Low. */ + uint32_t counter_h; /**< RW Main Counter High. */ + }; + }; + uint64_t reserved3; /**< Reserved for future use. */ + struct { + uint64_t config; /**< RW Timer Config and Capability Reg. */ + uint64_t comp; /**< RW Timer Comparator Value Register. */ + uint64_t fsb; /**< RW FSB Interrupt Route Register. */ + uint64_t reserved4; /**< Reserved for future use. */ + } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */ +}; + +/* Mmap'd hpet registers */ +static volatile struct eal_hpet_regs *eal_hpet = NULL; + +/* Period at which the HPET counter increments in + * femtoseconds (10^-15 seconds). */ +static uint32_t eal_hpet_resolution_fs = 0; + +/* Frequency of the HPET counter in Hz */ +static uint64_t eal_hpet_resolution_hz = 0; + +/* Incremented 4 times during one 32bits hpet full count */ +static uint32_t eal_hpet_msb; + +static pthread_t msb_inc_thread_id; + +/* + * This function runs on a specific thread to update a global variable + * containing used to process MSB of the HPET (unfortunatelly, we need + * this because hpet is 32 bits by default under linux). + */ +static void +hpet_msb_inc(__attribute__((unused)) void *arg) +{ + uint32_t t; + + while (1) { + t = (eal_hpet->counter_l >> 30); + if (t != (eal_hpet_msb & 3)) + eal_hpet_msb ++; + sleep(10); + } +} + +uint64_t +rte_get_hpet_hz(void) +{ + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + return eal_hpet_resolution_hz; +} + +uint64_t +rte_get_hpet_cycles(void) +{ + uint32_t t, msb; + uint64_t ret; + + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + t = eal_hpet->counter_l; + msb = eal_hpet_msb; + ret = (msb + 2 - (t >> 30)) / 4; + ret <<= 32; + ret += t; + return ret; +} + +#endif + +#ifdef RTE_LIBEAL_USE_HPET +/* + * Open and mmap /dev/hpet (high precision event timer) that will + * provide our time reference. + */ +int +rte_eal_hpet_init(int make_default) +{ + int fd, ret; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (internal_config.no_hpet) { + RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); + return -1; + } + + fd = open(DEV_HPET, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", + strerror(errno)); + internal_config.no_hpet = 1; + return -1; + } + eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); + if (eal_hpet == MAP_FAILED) { + RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" + "Please enable CONFIG_HPET_MMAP in your kernel configuration " + "to allow HPET support.\n" + "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " + "in your build configuration or use '--no-hpet' EAL flag.\n"); + close(fd); + internal_config.no_hpet = 1; + return -1; + } + close(fd); + + eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & + CLK_PERIOD_MASK) >> + CLK_PERIOD_SHIFT); + + eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / + (uint64_t)eal_hpet_resolution_fs; + + RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", + eal_hpet_resolution_hz/1000); + + eal_hpet_msb = (eal_hpet->counter_l >> 30); + + /* create a thread that will increment a global variable for + * msb (hpet is 32 bits by default under linux) */ + ret = pthread_create(&msb_inc_thread_id, NULL, + (void *(*)(void *))hpet_msb_inc, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); + internal_config.no_hpet = 1; + return -1; + } + + /* + * Set thread_name for aid in debugging. + */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "hpet-msb-inc"); + ret = rte_thread_setname(msb_inc_thread_id, thread_name); + if (ret != 0) + RTE_LOG(DEBUG, EAL, + "Cannot set HPET timer thread name!\n"); + + if (make_default) + eal_timer_source = EAL_TIMER_HPET; + return 0; +} +#endif + +static void +check_tsc_flags(void) +{ + char line[512]; + FILE *stream; + + stream = fopen("/proc/cpuinfo", "r"); + if (!stream) { + RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n"); + return; + } + + while (fgets(line, sizeof line, stream)) { + char *constant_tsc; + char *nonstop_tsc; + + if (strncmp(line, "flags", 5) != 0) + continue; + + constant_tsc = strstr(line, "constant_tsc"); + nonstop_tsc = strstr(line, "nonstop_tsc"); + if (!constant_tsc || !nonstop_tsc) + RTE_LOG(WARNING, EAL, + "WARNING: cpu flags " + "constant_tsc=%s " + "nonstop_tsc=%s " + "-> using unreliable clock cycles !\n", + constant_tsc ? "yes":"no", + nonstop_tsc ? "yes":"no"); + break; + } + + fclose(stream); +} + +uint64_t +get_tsc_freq(void) +{ +#ifdef CLOCK_MONOTONIC_RAW +#define NS_PER_SEC 1E9 + + struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ + + struct timespec t_start, t_end; + uint64_t tsc_hz; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { + uint64_t ns, end, start = rte_rdtsc(); + nanosleep(&sleeptime,NULL); + clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); + end = rte_rdtsc(); + ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC); + ns += (t_end.tv_nsec - t_start.tv_nsec); + + double secs = (double)ns/NS_PER_SEC; + tsc_hz = (uint64_t)((end - start)/secs); + return tsc_hz; + } +#endif + return 0; +} + +int +rte_eal_timer_init(void) +{ + + eal_timer_source = EAL_TIMER_TSC; + + set_tsc_freq(); + check_tsc_flags(); + return 0; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c new file mode 100644 index 00000000..53ac725d --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.c @@ -0,0 +1,819 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/ioctl.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_eal_memconfig.h> + +#include "eal_filesystem.h" +#include "eal_vfio.h" +#include "eal_private.h" + +#ifdef VFIO_PRESENT + +/* per-process VFIO config */ +static struct vfio_config vfio_cfg; + +static int vfio_type1_dma_map(int); +static int vfio_spapr_dma_map(int); +static int vfio_noiommu_dma_map(int); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, + /* ppc64 IOMMU, otherwise known as spapr */ + { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map}, + /* IOMMU-less mode */ + { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, +}; + +int +vfio_get_group_fd(int iommu_group_no) +{ + int i; + int vfio_group_fd; + int group_idx = -1; + char filename[PATH_MAX]; + + /* check if we already have the group descriptor open */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no) + return vfio_cfg.vfio_groups[i].fd; + + /* Lets see first if there is room for a new group */ + if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Now lets get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg.vfio_groups[i].group_no == -1) { + group_idx = i; + break; + } + + /* This should not happen */ + if (group_idx == -1) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + vfio_cfg.vfio_groups[group_idx].group_no = iommu_group_no; + vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd; + vfio_cfg.vfio_active_groups++; + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via our socket + */ + else { + int socket_fd, ret; + + socket_fd = vfio_mp_sync_connect_to_primary(); + + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) { + RTE_LOG(ERR, EAL, " cannot send group number!\n"); + close(socket_fd); + return -1; + } + ret = vfio_mp_sync_receive_request(socket_fd); + switch (ret) { + case SOCKET_NO_FD: + close(socket_fd); + return 0; + case SOCKET_OK: + vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd); + /* if we got the fd, return it */ + if (vfio_group_fd > 0) { + close(socket_fd); + return vfio_group_fd; + } + /* fall-through on error */ + default: + RTE_LOG(ERR, EAL, " cannot get container fd!\n"); + close(socket_fd); + return -1; + } + } + return -1; +} + + +static int +get_vfio_group_idx(int vfio_group_fd) +{ + int i; + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) + return i; + return -1; +} + +static void +vfio_group_device_get(int vfio_group_fd) +{ + int i; + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > VFIO_MAX_GROUPS) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg.vfio_groups[i].devices++; +} + +static void +vfio_group_device_put(int vfio_group_fd) +{ + int i; + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > VFIO_MAX_GROUPS) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg.vfio_groups[i].devices--; +} + +static int +vfio_group_device_count(int vfio_group_fd) +{ + int i; + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + return -1; + } + + return vfio_cfg.vfio_groups[i].devices; +} + +int +clear_group(int vfio_group_fd) +{ + int i; + int socket_fd, ret; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0) + return -1; + vfio_cfg.vfio_groups[i].group_no = -1; + vfio_cfg.vfio_groups[i].fd = -1; + vfio_cfg.vfio_groups[i].devices = 0; + vfio_cfg.vfio_active_groups--; + return 0; + } + + /* This is just for SECONDARY processes */ + socket_fd = vfio_mp_sync_connect_to_primary(); + + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + + if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + + if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) { + RTE_LOG(ERR, EAL, " cannot send group fd!\n"); + close(socket_fd); + return -1; + } + + ret = vfio_mp_sync_receive_request(socket_fd); + switch (ret) { + case SOCKET_NO_FD: + RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n"); + close(socket_fd); + break; + case SOCKET_OK: + close(socket_fd); + return 0; + case SOCKET_ERR: + RTE_LOG(ERR, EAL, " Socket error\n"); + close(socket_fd); + break; + default: + RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret); + close(socket_fd); + } + return -1; +} + +int +vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + int vfio_group_fd; + int iommu_group_no; + int ret; + + /* get group number */ + ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = vfio_get_group_fd(iommu_group_no); + if (vfio_group_fd < 0) + return -1; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_cfg.vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when first group is + * assigned to a container and only in primary process. + * Note this can happen several times with the hotplug + * functionality. + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg.vfio_active_groups == 1) { + /* select an IOMMU type which we will be using */ + const struct vfio_iommu_type *t = + vfio_set_iommu_type(vfio_cfg.vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, + " %s failed to select IOMMU type\n", + dev_addr); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } + ret = t->dma_map_func(vfio_cfg.vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, + " %s DMA remapping failed, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } + } + } + + /* get a file descriptor for the device */ + *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (*vfio_dev_fd < 0) { + /* if we cannot get a device fd, this implies a problem with + * the VFIO group or the container not having IOMMU configured. + */ + + RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", + dev_addr); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } + + /* test and setup the device */ + ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", dev_addr, errno, + strerror(errno)); + close(*vfio_dev_fd); + close(vfio_group_fd); + clear_group(vfio_group_fd); + return -1; + } + vfio_group_device_get(vfio_group_fd); + + return 0; +} + +int +vfio_release_device(const char *sysfs_base, const char *dev_addr, + int vfio_dev_fd) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + int vfio_group_fd; + int iommu_group_no; + int ret; + + /* get group number */ + ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no); + if (ret <= 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", + dev_addr); + /* This is an error at this point. */ + return -1; + } + + /* get the actual group fd */ + vfio_group_fd = vfio_get_group_fd(iommu_group_no); + if (vfio_group_fd <= 0) { + RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n", + dev_addr); + return -1; + } + + /* At this point we got an active group. Closing it will make the + * container detachment. If this is the last active group, VFIO kernel + * code will unset the container and the IOMMU mappings. + */ + + /* Closing a device */ + if (close(vfio_dev_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", + dev_addr); + return -1; + } + + /* An VFIO group can have several devices attached. Just when there is + * no devices remaining should the group be closed. + */ + vfio_group_device_put(vfio_group_fd); + if (!vfio_group_device_count(vfio_group_fd)) { + + if (close(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", + dev_addr); + return -1; + } + + if (clear_group(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", + dev_addr); + return -1; + } + } + + return 0; +} + +int +vfio_enable(const char *modname) +{ + /* initialize group list */ + int i; + int vfio_available; + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + vfio_cfg.vfio_groups[i].fd = -1; + vfio_cfg.vfio_groups[i].group_no = -1; + vfio_cfg.vfio_groups[i].devices = 0; + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio-pci module is loaded */ + vfio_available = rte_eal_check_module(modname); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + vfio_cfg.vfio_container_fd = vfio_get_container_fd(); + + /* check if we have VFIO driver enabled */ + if (vfio_cfg.vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + vfio_cfg.vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +vfio_is_enabled(const char *modname) +{ + const int mod_available = rte_eal_check_module(modname); + return vfio_cfg.vfio_enabled && mod_available; +} + +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd) +{ + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} + +int +vfio_has_supported_extensions(int vfio_container_fd) +{ + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? "supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +int +vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } else { + /* + * if we're in a secondary process, request container fd from the + * primary process via our socket + */ + int socket_fd; + + socket_fd = vfio_mp_sync_connect_to_primary(); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); + return -1; + } + if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) { + RTE_LOG(ERR, EAL, " cannot request container fd!\n"); + close(socket_fd); + return -1; + } + vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot get container fd!\n"); + close(socket_fd); + return -1; + } + close(socket_fd); + return vfio_container_fd; + } + + return -1; +} + +int +vfio_get_group_no(const char *sysfs_base, + const char *dev_addr, int *iommu_group_no) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], *group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + "%s/%s/iommu_group", sysfs_base, dev_addr); + + ret = readlink(linkname, filename, sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_no = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); + return -1; + } + + return 1; +} + +static int +vfio_type1_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, + strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + struct vfio_iommu_spapr_tce_info info = { + .argsz = sizeof(info), + }; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + }; + + /* query spapr iommu info */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + RTE_LOG(ERR, EAL, " cannot get iommu info, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* remove default DMA of 32 bit window */ + remove.start_addr = info.dma32_window_start; + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + RTE_LOG(ERR, EAL, " cannot remove default DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* calculate window size based on number of hugepages configured */ + create.window_size = rte_eal_get_physmem_size(); + create.page_shift = __builtin_ctzll(ms->hugepage_sz); + create.levels = 2; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + if (ret) { + RTE_LOG(ERR, EAL, " cannot create new DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + reg.vaddr = (uintptr_t) ms[i].addr; + reg.size = ms[i].len; + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + } + + return 0; +} + +static int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +#endif diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h new file mode 100644 index 00000000..5ff63e5d --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -0,0 +1,224 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef EAL_VFIO_H_ +#define EAL_VFIO_H_ + +/* + * determine if VFIO is present on the system + */ +#ifdef RTE_EAL_VFIO +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#include <linux/vfio.h> + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) +#define RTE_PCI_MSIX_TABLE_BIR 0x7 +#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8 +#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff +#else +#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR +#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET +#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE +#endif + +#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU + +#ifndef VFIO_SPAPR_TCE_v2_IOMMU +#define RTE_VFIO_SPAPR 7 +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + +struct vfio_iommu_spapr_register_memory { + uint32_t argsz; + uint32_t flags; + uint64_t vaddr; + uint64_t size; +}; + +struct vfio_iommu_spapr_tce_create { + uint32_t argsz; + uint32_t flags; + /* in */ + uint32_t page_shift; + uint32_t __resv1; + uint64_t window_size; + uint32_t levels; + uint32_t __resv2; + /* out */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_remove { + uint32_t argsz; + uint32_t flags; + /* in */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_ddw_info { + uint64_t pgsizes; + uint32_t max_dynamic_windows_supported; + uint32_t levels; +}; + +/* SPAPR_v2 is not present, but SPAPR might be */ +#ifndef VFIO_SPAPR_TCE_IOMMU +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct vfio_iommu_spapr_tce_info { + uint32_t argsz; + uint32_t flags; + uint32_t dma32_window_start; + uint32_t dma32_window_size; + struct vfio_iommu_spapr_tce_ddw_info ddw; +}; +#endif /* VFIO_SPAPR_TCE_IOMMU */ + +#else /* VFIO_SPAPR_TCE_v2_IOMMU */ +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0) +#define RTE_VFIO_NOIOMMU 8 +#else +#define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU +#endif + +#define VFIO_MAX_GROUPS 64 + +/* + * Function prototypes for VFIO multiprocess sync functions + */ +int vfio_mp_sync_send_request(int socket, int req); +int vfio_mp_sync_receive_request(int socket); +int vfio_mp_sync_send_fd(int socket, int fd); +int vfio_mp_sync_receive_fd(int socket); +int vfio_mp_sync_connect_to_primary(void); + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. + */ +struct vfio_group { + int group_no; + int fd; + int devices; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_active_groups; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; +}; + +#define VFIO_DIR "/dev/vfio" +#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" +#define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" +#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) +#define VFIO_GET_REGION_IDX(x) (x >> 40) + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_func_t dma_map_func; +}; + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd); + +/* check if we have any supported extensions */ +int +vfio_has_supported_extensions(int vfio_container_fd); + +/* open container fd or get an existing one */ +int +vfio_get_container_fd(void); + +/* parse IOMMU group number for a device + * returns 1 on success, -1 for errors, 0 for non-existent group + */ +int +vfio_get_group_no(const char *sysfs_base, + const char *dev_addr, int *iommu_group_no); + +/* open group fd or get an existing one */ +int +vfio_get_group_fd(int iommu_group_no); + +/* remove group fd from internal VFIO group fd array */ +int +clear_group(int vfio_group_fd); + +/** + * Setup vfio_cfg for the device identified by its address. It discovers + * the configured I/O MMU groups or sets a new one for the device. If a new + * groups is assigned, the DMA mapping is performed. + * Returns 0 on success, a negative value on failure and a positive value in + * case the given device cannot be managed this way. + */ +int vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info); + +int vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd); + +int vfio_enable(const char *modname); +int vfio_is_enabled(const char *modname); + +int pci_vfio_enable(void); +int pci_vfio_is_enabled(void); + +int vfio_mp_sync_setup(void); + +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_CLR_GROUP 0x300 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + +#define VFIO_PRESENT +#endif /* kernel version */ +#endif /* RTE_EAL_VFIO */ + +#endif /* EAL_VFIO_H_ */ diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c new file mode 100644 index 00000000..7e8095cb --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c @@ -0,0 +1,424 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <pthread.h> + +/* sys/un.h with __USE_MISC uses strlen, which is unsafe */ +#ifdef __USE_MISC +#define REMOVED_USE_MISC +#undef __USE_MISC +#endif +#include <sys/un.h> +/* make sure we redefine __USE_MISC only if it was previously undefined */ +#ifdef REMOVED_USE_MISC +#define __USE_MISC +#undef REMOVED_USE_MISC +#endif + +#include <rte_log.h> +#include <rte_pci.h> +#include <rte_eal_memconfig.h> +#include <rte_malloc.h> + +#include "eal_filesystem.h" +#include "eal_pci_init.h" +#include "eal_thread.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +#define SOCKET_PATH_FMT "%s/.%s_mp_socket" +#define CMSGLEN (CMSG_LEN(sizeof(int))) +#define FD_TO_CMSGHDR(fd, chdr) \ + do {\ + (chdr).cmsg_len = CMSGLEN;\ + (chdr).cmsg_level = SOL_SOCKET;\ + (chdr).cmsg_type = SCM_RIGHTS;\ + memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\ + } while (0) +#define CMSGHDR_TO_FD(chdr, fd) \ + memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd)) + +static pthread_t socket_thread; +static int mp_socket_fd; + + +/* get socket path (/var/run if root, $HOME otherwise) */ +static void +get_socket_path(char *buffer, int bufsz) +{ + const char *dir = "/var/run"; + const char *home_dir = getenv("HOME"); + + if (getuid() != 0 && home_dir != NULL) + dir = home_dir; + + /* use current prefix as file path */ + snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir, + internal_config.hugefile_prefix); +} + + + +/* + * data flow for socket comm protocol: + * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP + * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number + * 2. server receives message + * 2a. in case of invalid group, SOCKET_ERR is sent back to client + * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client + * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd + * + * in case of any error, socket is closed. + */ + +/* send a request, return -1 on error */ +int +vfio_mp_sync_send_request(int socket, int req) +{ + struct msghdr hdr; + struct iovec iov; + int buf; + int ret; + + memset(&hdr, 0, sizeof(hdr)); + + buf = req; + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + + ret = sendmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + return 0; +} + +/* receive a request and return it */ +int +vfio_mp_sync_receive_request(int socket) +{ + int buf; + struct msghdr hdr; + struct iovec iov; + int ret, req; + + memset(&hdr, 0, sizeof(hdr)); + + buf = SOCKET_ERR; + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + + ret = recvmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + + req = buf; + + return req; +} + +/* send OK in message, fd in control message */ +int +vfio_mp_sync_send_fd(int socket, int fd) +{ + int buf; + struct msghdr hdr; + struct cmsghdr *chdr; + char chdr_buf[CMSGLEN]; + struct iovec iov; + int ret; + + chdr = (struct cmsghdr *) chdr_buf; + memset(chdr, 0, sizeof(chdr_buf)); + memset(&hdr, 0, sizeof(hdr)); + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + hdr.msg_control = chdr; + hdr.msg_controllen = CMSGLEN; + + buf = SOCKET_OK; + FD_TO_CMSGHDR(fd, *chdr); + + ret = sendmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + return 0; +} + +/* receive OK in message, fd in control message */ +int +vfio_mp_sync_receive_fd(int socket) +{ + int buf; + struct msghdr hdr; + struct cmsghdr *chdr; + char chdr_buf[CMSGLEN]; + struct iovec iov; + int ret, req, fd; + + buf = SOCKET_ERR; + + chdr = (struct cmsghdr *) chdr_buf; + memset(chdr, 0, sizeof(chdr_buf)); + memset(&hdr, 0, sizeof(hdr)); + + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + iov.iov_base = (char *) &buf; + iov.iov_len = sizeof(buf); + hdr.msg_control = chdr; + hdr.msg_controllen = CMSGLEN; + + ret = recvmsg(socket, &hdr, 0); + if (ret < 0) + return -1; + + req = buf; + + if (req != SOCKET_OK) + return -1; + + CMSGHDR_TO_FD(*chdr, fd); + + return fd; +} + +/* connect socket_fd in secondary process to the primary process's socket */ +int +vfio_mp_sync_connect_to_primary(void) +{ + struct sockaddr_un addr; + socklen_t sockaddr_len; + int socket_fd; + + /* set up a socket */ + socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + return -1; + } + + get_socket_path(addr.sun_path, sizeof(addr.sun_path)); + addr.sun_family = AF_UNIX; + + sockaddr_len = sizeof(struct sockaddr_un); + + if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0) + return socket_fd; + + /* if connect failed */ + close(socket_fd); + return -1; +} + + + +/* + * socket listening thread for primary process + */ +static __attribute__((noreturn)) void * +vfio_mp_sync_thread(void __rte_unused * arg) +{ + int ret, fd, vfio_data; + + /* wait for requests on the socket */ + for (;;) { + int conn_sock; + struct sockaddr_un addr; + socklen_t sockaddr_len = sizeof(addr); + + /* this is a blocking call */ + conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr, + &sockaddr_len); + + /* just restart on error */ + if (conn_sock == -1) + continue; + + /* set socket to linger after close */ + struct linger l; + l.l_onoff = 1; + l.l_linger = 60; + + if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0) + RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option " + "on listen socket (%s)\n", strerror(errno)); + + ret = vfio_mp_sync_receive_request(conn_sock); + + switch (ret) { + case SOCKET_REQ_CONTAINER: + fd = vfio_get_container_fd(); + if (fd < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + else + vfio_mp_sync_send_fd(conn_sock, fd); + close(fd); + break; + case SOCKET_REQ_GROUP: + /* wait for group number */ + vfio_data = vfio_mp_sync_receive_request(conn_sock); + if (vfio_data < 0) { + close(conn_sock); + continue; + } + + fd = vfio_get_group_fd(vfio_data); + + if (fd < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + /* if VFIO group exists but isn't bound to VFIO driver */ + else if (fd == 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); + /* if group exists and is bound to VFIO driver */ + else { + vfio_mp_sync_send_request(conn_sock, SOCKET_OK); + vfio_mp_sync_send_fd(conn_sock, fd); + } + break; + case SOCKET_CLR_GROUP: + /* wait for group fd */ + vfio_data = vfio_mp_sync_receive_request(conn_sock); + if (vfio_data < 0) { + close(conn_sock); + continue; + } + + ret = clear_group(vfio_data); + + if (ret < 0) + vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD); + else + vfio_mp_sync_send_request(conn_sock, SOCKET_OK); + break; + default: + vfio_mp_sync_send_request(conn_sock, SOCKET_ERR); + break; + } + close(conn_sock); + } +} + +static int +vfio_mp_sync_socket_setup(void) +{ + int ret, socket_fd; + struct sockaddr_un addr; + socklen_t sockaddr_len; + + /* set up a socket */ + socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); + if (socket_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket!\n"); + return -1; + } + + get_socket_path(addr.sun_path, sizeof(addr.sun_path)); + addr.sun_family = AF_UNIX; + + sockaddr_len = sizeof(struct sockaddr_un); + + unlink(addr.sun_path); + + ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len); + if (ret) { + RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno)); + close(socket_fd); + return -1; + } + + ret = listen(socket_fd, 50); + if (ret) { + RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno)); + close(socket_fd); + return -1; + } + + /* save the socket in local configuration */ + mp_socket_fd = socket_fd; + + return 0; +} + +/* + * set up a local socket and tell it to listen for incoming connections + */ +int +vfio_mp_sync_setup(void) +{ + int ret; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + if (vfio_mp_sync_socket_setup() < 0) { + RTE_LOG(ERR, EAL, "Failed to set up local socket!\n"); + return -1; + } + + ret = pthread_create(&socket_thread, NULL, + vfio_mp_sync_thread, NULL); + if (ret) { + RTE_LOG(ERR, EAL, + "Failed to create thread for communication with secondary processes!\n"); + close(mp_socket_fd); + return -1; + } + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync"); + ret = rte_thread_setname(socket_thread, thread_name); + if (ret) + RTE_LOG(DEBUG, EAL, + "Failed to set thread name for secondary processes!\n"); + + return 0; +} + +#endif diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_xen_memory.c b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_xen_memory.c new file mode 100644 index 00000000..bddbdb07 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/eal_xen_memory.c @@ -0,0 +1,383 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <errno.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <stdarg.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <sys/file.h> +#include <unistd.h> +#include <limits.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/time.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include <exec-env/rte_dom0_common.h> + +#define PAGE_SIZE RTE_PGSIZE_4K +#define DEFAUL_DOM0_NAME "dom0-mem" + +static int xen_fd = -1; +static const char sys_dir_path[] = "/sys/kernel/mm/dom0-mm/memsize-mB"; + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by mem_size until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of mem_size size. + */ +static void * +xen_get_virtual_area(size_t *size, size_t mem_size) +{ + void *addr; + int fd; + long aligned_addr; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zu bytes\n", *size); + + fd = open("/dev/zero", O_RDONLY); + if (fd < 0){ + RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n"); + return NULL; + } + do { + addr = mmap(NULL, (*size) + mem_size, PROT_READ, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + *size -= mem_size; + } while (addr == MAP_FAILED && *size > 0); + + if (addr == MAP_FAILED) { + close(fd); + RTE_LOG(ERR, EAL, "Cannot get a virtual area\n"); + return NULL; + } + + munmap(addr, (*size) + mem_size); + close(fd); + + /* align addr to a mem_size boundary */ + aligned_addr = (uintptr_t)addr; + aligned_addr = RTE_ALIGN_CEIL(aligned_addr, mem_size); + addr = (void *)(aligned_addr); + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + addr, *size); + + return addr; +} + +/** + * Get memory size configuration from /sys/devices/virtual/misc/dom0_mm + * /memsize-mB/memsize file, and the size unit is mB. + */ +static int +get_xen_memory_size(void) +{ + char path[PATH_MAX]; + unsigned long mem_size = 0; + static const char *file_name; + + file_name = "memsize"; + snprintf(path, sizeof(path), "%s/%s", + sys_dir_path, file_name); + + if (eal_parse_sysfs_value(path, &mem_size) < 0) + return -1; + + if (mem_size == 0) + rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s was not" + " configured.\n",sys_dir_path, file_name); + if (mem_size % 2) + rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s must be" + " even number.\n",sys_dir_path, file_name); + + if (mem_size > DOM0_CONFIG_MEMSIZE) + rte_exit(EXIT_FAILURE,"XEN-DOM0:the %s/%s should not be larger" + " than %d mB\n",sys_dir_path, file_name, DOM0_CONFIG_MEMSIZE); + + return mem_size; +} + +/** + * Based on physical address to caculate MFN in Xen Dom0. + */ +phys_addr_t +rte_xen_mem_phy2mch(int32_t memseg_id, const phys_addr_t phy_addr) +{ + int mfn_id, i; + uint64_t mfn, mfn_offset; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg *memseg = mcfg->memseg; + + /* find the memory segment owning the physical address */ + if (memseg_id == -1) { + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + if ((phy_addr >= memseg[i].phys_addr) && + (phy_addr < memseg[i].phys_addr + + memseg[i].len)) { + memseg_id = i; + break; + } + } + if (memseg_id == -1) + return RTE_BAD_PHYS_ADDR; + } + + mfn_id = (phy_addr - memseg[memseg_id].phys_addr) / RTE_PGSIZE_2M; + + /*the MFN is contiguous in 2M */ + mfn_offset = (phy_addr - memseg[memseg_id].phys_addr) % + RTE_PGSIZE_2M / PAGE_SIZE; + mfn = mfn_offset + memseg[memseg_id].mfn[mfn_id]; + + /** return mechine address */ + return mfn * PAGE_SIZE + phy_addr % PAGE_SIZE; +} + +int +rte_xen_dom0_memory_init(void) +{ + void *vir_addr, *vma_addr = NULL; + int err, ret = 0; + uint32_t i, requested, mem_size, memseg_idx, num_memseg = 0; + size_t vma_len = 0; + struct memory_info meminfo; + struct memseg_info seginfo[RTE_MAX_MEMSEG]; + int flags, page_size = getpagesize(); + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg *memseg = mcfg->memseg; + uint64_t total_mem = internal_config.memory; + + memset(seginfo, 0, sizeof(seginfo)); + memset(&meminfo, 0, sizeof(struct memory_info)); + + mem_size = get_xen_memory_size(); + requested = (unsigned) (total_mem / 0x100000); + if (requested > mem_size) + /* if we didn't satisfy total memory requirements */ + rte_exit(EXIT_FAILURE,"Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, mem_size); + else if (total_mem != 0) + mem_size = requested; + + /* Check FD and open once */ + if (xen_fd < 0) { + xen_fd = open(DOM0_MM_DEV, O_RDWR); + if (xen_fd < 0) { + RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV); + return -1; + } + } + + meminfo.size = mem_size; + + /* construct memory mangement name for Dom0 */ + snprintf(meminfo.name, DOM0_NAME_MAX, "%s-%s", + internal_config.hugefile_prefix, DEFAUL_DOM0_NAME); + + /* Notify kernel driver to allocate memory */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_PREPARE_MEMSEG, &meminfo); + if (ret < 0) { + RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memory\n"); + err = -EIO; + goto fail; + } + + /* Get number of memory segment from driver */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_NUM_MEMSEG, &num_memseg); + if (ret < 0) { + RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg count.\n"); + err = -EIO; + goto fail; + } + + if(num_memseg > RTE_MAX_MEMSEG){ + RTE_LOG(ERR, EAL, "XEN DOM0: the memseg count %d is greater" + " than max memseg %d.\n",num_memseg, RTE_MAX_MEMSEG); + err = -EIO; + goto fail; + } + + /* get all memory segements information */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_GET_MEMSEG_INFO, seginfo); + if (ret < 0) { + RTE_LOG(ERR, EAL, "XEN DOM0:failed to get memseg info.\n"); + err = -EIO; + goto fail; + } + + /* map all memory segments to contiguous user space */ + for (memseg_idx = 0; memseg_idx < num_memseg; memseg_idx++) + { + vma_len = seginfo[memseg_idx].size; + + /** + * get the biggest virtual memory area up to vma_len. If it fails, + * vma_addr is NULL, so let the kernel provide the address. + */ + vma_addr = xen_get_virtual_area(&vma_len, RTE_PGSIZE_2M); + if (vma_addr == NULL) { + flags = MAP_SHARED; + vma_len = RTE_PGSIZE_2M; + } else + flags = MAP_SHARED | MAP_FIXED; + + seginfo[memseg_idx].size = vma_len; + vir_addr = mmap(vma_addr, seginfo[memseg_idx].size, + PROT_READ|PROT_WRITE, flags, xen_fd, + memseg_idx * page_size); + if (vir_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "XEN DOM0:Could not mmap %s\n", + DOM0_MM_DEV); + err = -EIO; + goto fail; + } + + memseg[memseg_idx].addr = vir_addr; + memseg[memseg_idx].phys_addr = page_size * + seginfo[memseg_idx].pfn ; + memseg[memseg_idx].len = seginfo[memseg_idx].size; + for ( i = 0; i < seginfo[memseg_idx].size / RTE_PGSIZE_2M; i++) + memseg[memseg_idx].mfn[i] = seginfo[memseg_idx].mfn[i]; + + /* MFNs are continuous in 2M, so assume that page size is 2M */ + memseg[memseg_idx].hugepage_sz = RTE_PGSIZE_2M; + + memseg[memseg_idx].nchannel = mcfg->nchannel; + memseg[memseg_idx].nrank = mcfg->nrank; + + /* NUMA is not suppoted in Xen Dom0, so only set socket 0*/ + memseg[memseg_idx].socket_id = 0; + } + + return 0; +fail: + if (xen_fd > 0) { + close(xen_fd); + xen_fd = -1; + } + return err; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration, mapping them in order to form a contiguous block in the + * virtual memory space + */ +int +rte_xen_dom0_memory_attach(void) +{ + const struct rte_mem_config *mcfg; + unsigned s = 0; /* s used to track the segment number */ + int xen_fd = -1; + int ret = -1; + void *vir_addr; + char name[DOM0_NAME_MAX] = {0}; + int page_size = getpagesize(); + + mcfg = rte_eal_get_configuration()->mem_config; + + /* Check FD and open once */ + if (xen_fd < 0) { + xen_fd = open(DOM0_MM_DEV, O_RDWR); + if (xen_fd < 0) { + RTE_LOG(ERR, EAL, "Can not open %s\n",DOM0_MM_DEV); + goto error; + } + } + + /* construct memory mangement name for Dom0 */ + snprintf(name, DOM0_NAME_MAX, "%s-%s", + internal_config.hugefile_prefix, DEFAUL_DOM0_NAME); + /* attach to memory segments of primary process */ + ret = ioctl(xen_fd, RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG, name); + if (ret) { + RTE_LOG(ERR, EAL,"attach memory segments fail.\n"); + goto error; + } + + /* map all segments into memory to make sure we get the addrs */ + for (s = 0; s < RTE_MAX_MEMSEG; ++s) { + + /* + * the first memory segment with len==0 is the one that + * follows the last valid segment. + */ + if (mcfg->memseg[s].len == 0) + break; + + vir_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len, + PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, xen_fd, + s * page_size); + if (vir_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes " + "in %s to requested address [%p]\n", + (unsigned long long)mcfg->memseg[s].len, DOM0_MM_DEV, + mcfg->memseg[s].addr); + goto error; + } + } + return 0; + +error: + if (xen_fd >= 0) { + close(xen_fd); + xen_fd = -1; + } + return -1; +} diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h new file mode 100644 index 00000000..d9707780 --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dom0_common.h @@ -0,0 +1,108 @@ +/*- + * This file is provided under a dual BSD/LGPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GNU LESSER GENERAL PUBLIC LICENSE + * + * Copyright(c) 2007-2014 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Contact Information: + * Intel Corporation + * + * + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _RTE_DOM0_COMMON_H_ +#define _RTE_DOM0_COMMON_H_ + +#ifdef __KERNEL__ +#include <linux/if.h> +#endif + +#define DOM0_NAME_MAX 256 +#define DOM0_MM_DEV "/dev/dom0_mm" + +#define DOM0_CONTIG_NUM_ORDER 9 /**< order of 2M */ +#define DOM0_NUM_MEMSEG 512 /**< Maximum nb. of memory segment. */ +#define DOM0_MEMBLOCK_SIZE 0x200000 /**< size of memory block(2M). */ +#define DOM0_CONFIG_MEMSIZE 4096 /**< Maximum config memory size(4G). */ +#define DOM0_NUM_MEMBLOCK (DOM0_CONFIG_MEMSIZE / 2) /**< Maximum nb. of 2M memory block. */ + +#define RTE_DOM0_IOCTL_PREPARE_MEMSEG _IOWR(0, 1 , struct memory_info) +#define RTE_DOM0_IOCTL_ATTACH_TO_MEMSEG _IOWR(0, 2 , char *) +#define RTE_DOM0_IOCTL_GET_NUM_MEMSEG _IOWR(0, 3, int) +#define RTE_DOM0_IOCTL_GET_MEMSEG_INFO _IOWR(0, 4, void *) + +/** + * A structure used to store memory information. + */ +struct memory_info { + char name[DOM0_NAME_MAX]; + uint64_t size; +}; + +/** + * A structure used to store memory segment information. + */ +struct memseg_info { + uint32_t idx; + uint64_t pfn; + uint64_t size; + uint64_t mfn[DOM0_NUM_MEMBLOCK]; +}; + +/** + * A structure used to store memory block information. + */ +struct memblock_info { + uint8_t exchange_flag; + uint8_t used; + uint64_t vir_addr; + uint64_t pfn; + uint64_t mfn; +}; +#endif /* _RTE_DOM0_COMMON_H_ */ diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h new file mode 100644 index 00000000..6daffebf --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h @@ -0,0 +1,239 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_INTERRUPTS_H_ +#error "don't include this file directly, please include generic <rte_interrupts.h>" +#endif + +#ifndef _RTE_LINUXAPP_INTERRUPTS_H_ +#define _RTE_LINUXAPP_INTERRUPTS_H_ + +#define RTE_MAX_RXTX_INTR_VEC_ID 32 +#define RTE_INTR_VEC_ZERO_OFFSET 0 +#define RTE_INTR_VEC_RXTX_OFFSET 1 + +enum rte_intr_handle_type { + RTE_INTR_HANDLE_UNKNOWN = 0, + RTE_INTR_HANDLE_UIO, /**< uio device handle */ + RTE_INTR_HANDLE_UIO_INTX, /**< uio generic handle */ + RTE_INTR_HANDLE_VFIO_LEGACY, /**< vfio device handle (legacy) */ + RTE_INTR_HANDLE_VFIO_MSI, /**< vfio device handle (MSI) */ + RTE_INTR_HANDLE_VFIO_MSIX, /**< vfio device handle (MSIX) */ + RTE_INTR_HANDLE_ALARM, /**< alarm handle */ + RTE_INTR_HANDLE_EXT, /**< external handler */ + RTE_INTR_HANDLE_VDEV, /**< virtual device */ + RTE_INTR_HANDLE_MAX +}; + +#define RTE_INTR_EVENT_ADD 1UL +#define RTE_INTR_EVENT_DEL 2UL + +typedef void (*rte_intr_event_cb_t)(int fd, void *arg); + +struct rte_epoll_data { + uint32_t event; /**< event type */ + void *data; /**< User data */ + rte_intr_event_cb_t cb_fun; /**< IN: callback fun */ + void *cb_arg; /**< IN: callback arg */ +}; + +enum { + RTE_EPOLL_INVALID = 0, + RTE_EPOLL_VALID, + RTE_EPOLL_EXEC, +}; + +/** interrupt epoll event obj, taken by epoll_event.ptr */ +struct rte_epoll_event { + volatile uint32_t status; /**< OUT: event status */ + int fd; /**< OUT: event fd */ + int epfd; /**< OUT: epoll instance the ev associated with */ + struct rte_epoll_data epdata; +}; + +/** Handle for interrupts. */ +struct rte_intr_handle { + RTE_STD_C11 + union { + int vfio_dev_fd; /**< VFIO device file descriptor */ + int uio_cfg_fd; /**< UIO config file descriptor + for uio_pci_generic */ + }; + int fd; /**< interrupt event file descriptor */ + enum rte_intr_handle_type type; /**< handle type */ + uint32_t max_intr; /**< max interrupt requested */ + uint32_t nb_efd; /**< number of available efd(event fd) */ + int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /**< intr vectors/efds mapping */ + struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID]; + /**< intr vector epoll event */ + int *intr_vec; /**< intr vector number array */ +}; + +#define RTE_EPOLL_PER_THREAD -1 /**< to hint using per thread epfd */ + +/** + * It waits for events on the epoll instance. + * + * @param epfd + * Epoll instance fd on which the caller wait for events. + * @param events + * Memory area contains the events that will be available for the caller. + * @param maxevents + * Up to maxevents are returned, must greater than zero. + * @param timeout + * Specifying a timeout of -1 causes a block indefinitely. + * Specifying a timeout equal to zero cause to return immediately. + * @return + * - On success, returns the number of available event. + * - On failure, a negative value. + */ +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout); + +/** + * It performs control operations on epoll instance referred by the epfd. + * It requests that the operation op be performed for the target fd. + * + * @param epfd + * Epoll instance fd on which the caller perform control operations. + * @param op + * The operation be performed for the target fd. + * @param fd + * The target fd on which the control ops perform. + * @param event + * Describes the object linked to the fd. + * Note: The caller must take care the object deletion after CTL_DEL. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event); + +/** + * The function returns the per thread epoll instance. + * + * @return + * epfd the epoll instance referred to. + */ +int +rte_intr_tls_epfd(void); + +/** + * @param intr_handle + * Pointer to the interrupt handle. + * @param epfd + * Epoll instance fd which the intr vector associated to. + * @param op + * The operation be performed for the vector. + * Operation type of {ADD, DEL}. + * @param vec + * RX intr vector number added to the epoll instance wait list. + * @param data + * User raw data. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data); + +/** + * It deletes registered eventfds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle); + +/** + * It enables the packet I/O interrupt event if it's necessary. + * It creates event fd for each interrupt vector when MSIX is used, + * otherwise it multiplexes a single event fd. + * + * @param intr_handle + * Pointer to the interrupt handle. + * @param nb_efd + * Number of interrupt vector trying to enable. + * The value 0 is not allowed. + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd); + +/** + * It disables the packet I/O interrupt event. + * It deletes registered eventfds and closes the open fds. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle); + +/** + * The packet I/O interrupt on datapath is enabled or not. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle); + +/** + * The interrupt handle instance allows other causes or not. + * Other causes stand for any none packet I/O interrupts. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle); + +/** + * The multiple interrupt vector capability of interrupt handle instance. + * It returns zero if no multiple interrupt vector support. + * + * @param intr_handle + * Pointer to the interrupt handle. + */ +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle); + +#endif /* _RTE_LINUXAPP_INTERRUPTS_H_ */ diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h new file mode 100644 index 00000000..2ac879fd --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h @@ -0,0 +1,179 @@ +/*- + * This file is provided under a dual BSD/LGPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GNU LESSER GENERAL PUBLIC LICENSE + * + * Copyright(c) 2007-2014 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Contact Information: + * Intel Corporation + * + * + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _RTE_KNI_COMMON_H_ +#define _RTE_KNI_COMMON_H_ + +#ifdef __KERNEL__ +#include <linux/if.h> +#define RTE_STD_C11 +#else +#include <rte_common.h> +#endif + +/** + * KNI name is part of memzone name. + */ +#define RTE_KNI_NAMESIZE 32 + +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/* + * Request id. + */ +enum rte_kni_req_id { + RTE_KNI_REQ_UNKNOWN = 0, + RTE_KNI_REQ_CHANGE_MTU, + RTE_KNI_REQ_CFG_NETWORK_IF, + RTE_KNI_REQ_MAX, +}; + +/* + * Structure for KNI request. + */ +struct rte_kni_request { + uint32_t req_id; /**< Request id */ + RTE_STD_C11 + union { + uint32_t new_mtu; /**< New MTU */ + uint8_t if_up; /**< 1: interface up, 0: interface down */ + }; + int32_t result; /**< Result for processing request */ +} __attribute__((__packed__)); + +/* + * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO + * Write and read should wrap around. Fifo is empty when write == read + * Writing should never overwrite the read position + */ +struct rte_kni_fifo { + volatile unsigned write; /**< Next position to be written*/ + volatile unsigned read; /**< Next position to be read */ + unsigned len; /**< Circular buffer length */ + unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ + void *volatile buffer[]; /**< The buffer contains mbuf pointers */ +}; + +/* + * The kernel image of the rte_mbuf struct, with only the relevant fields. + * Padding is necessary to assure the offsets of these fields + */ +struct rte_kni_mbuf { + void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + uint64_t buf_physaddr; + uint16_t data_off; /**< Start address of data in segment buffer. */ + char pad1[2]; + uint16_t nb_segs; /**< Number of segments. */ + char pad4[2]; + uint64_t ol_flags; /**< Offload features. */ + char pad2[4]; + uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ + uint16_t data_len; /**< Amount of data in segment buffer. */ + + /* fields on second cache line */ + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); + void *pool; + void *next; +}; + +/* + * Struct used to create a KNI device. Passed to the kernel in IOCTL call + */ + +struct rte_kni_device_info { + char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ + + phys_addr_t tx_phys; + phys_addr_t rx_phys; + phys_addr_t alloc_phys; + phys_addr_t free_phys; + + /* Used by Ethtool */ + phys_addr_t req_phys; + phys_addr_t resp_phys; + phys_addr_t sync_phys; + void * sync_va; + + /* mbuf mempool */ + void * mbuf_va; + phys_addr_t mbuf_phys; + + /* PCI info */ + uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ + uint16_t device_id; /**< Device ID or PCI_ANY_ID. */ + uint8_t bus; /**< Device bus */ + uint8_t devid; /**< Device ID */ + uint8_t function; /**< Device function. */ + + uint16_t group_id; /**< Group ID */ + uint32_t core_id; /**< core ID to bind for kernel thread */ + + __extension__ + uint8_t force_bind : 1; /**< Flag for kernel thread binding */ + + /* mbuf size */ + unsigned mbuf_size; +}; + +#define KNI_DEVICE "kni" + +#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) +#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) +#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) + +#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/rte_eal_version.map new file mode 100644 index 00000000..670bab3a --- /dev/null +++ b/src/seastar/dpdk/lib/librte_eal/linuxapp/eal/rte_eal_version.map @@ -0,0 +1,200 @@ +DPDK_2.0 { + global: + + __rte_panic; + devargs_list; + eal_parse_sysfs_value; + eal_timer_source; + lcore_config; + per_lcore__lcore_id; + per_lcore__rte_errno; + rte_calloc; + rte_calloc_socket; + rte_cpu_check_supported; + rte_cpu_get_flag_enabled; + rte_cycles_vmware_tsc_map; + rte_delay_us; + rte_dump_physmem_layout; + rte_dump_registers; + rte_dump_stack; + rte_dump_tailq; + rte_eal_alarm_cancel; + rte_eal_alarm_set; + rte_eal_devargs_add; + rte_eal_devargs_dump; + rte_eal_devargs_type_count; + rte_eal_get_configuration; + rte_eal_get_lcore_state; + rte_eal_get_physmem_layout; + rte_eal_get_physmem_size; + rte_eal_has_hugepages; + rte_eal_hpet_init; + rte_eal_init; + rte_eal_iopl_init; + rte_eal_lcore_role; + rte_eal_mp_remote_launch; + rte_eal_mp_wait_lcore; + rte_eal_parse_devargs_str; + rte_eal_process_type; + rte_eal_remote_launch; + rte_eal_tailq_lookup; + rte_eal_tailq_register; + rte_eal_wait_lcore; + rte_exit; + rte_free; + rte_get_hpet_cycles; + rte_get_hpet_hz; + rte_get_log_level; + rte_get_log_type; + rte_get_tsc_hz; + rte_hexdump; + rte_intr_callback_register; + rte_intr_callback_unregister; + rte_intr_disable; + rte_intr_enable; + rte_log; + rte_log_cur_msg_loglevel; + rte_log_cur_msg_logtype; + rte_logs; + rte_malloc; + rte_malloc_dump_stats; + rte_malloc_get_socket_stats; + rte_malloc_set_limit; + rte_malloc_socket; + rte_malloc_validate; + rte_malloc_virt2phy; + rte_mem_lock_page; + rte_mem_phy2mch; + rte_mem_virt2phy; + rte_memdump; + rte_memory_get_nchannel; + rte_memory_get_nrank; + rte_memzone_dump; + rte_memzone_lookup; + rte_memzone_reserve; + rte_memzone_reserve_aligned; + rte_memzone_reserve_bounded; + rte_memzone_walk; + rte_openlog_stream; + rte_realloc; + rte_set_application_usage_hook; + rte_set_log_level; + rte_set_log_type; + rte_socket_id; + rte_strerror; + rte_strsplit; + rte_sys_gettid; + rte_thread_get_affinity; + rte_thread_set_affinity; + rte_vlog; + rte_xen_dom0_memory_attach; + rte_xen_dom0_memory_init; + rte_zmalloc; + rte_zmalloc_socket; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_epoll_ctl; + rte_epoll_wait; + rte_intr_allow_others; + rte_intr_dp_is_en; + rte_intr_efd_disable; + rte_intr_efd_enable; + rte_intr_rx_ctl; + rte_intr_tls_epfd; + rte_memzone_free; + +} DPDK_2.0; + +DPDK_2.2 { + global: + + rte_intr_cap_multiple; + rte_keepalive_create; + rte_keepalive_dispatch_pings; + rte_keepalive_mark_alive; + rte_keepalive_register_core; + rte_xen_dom0_supported; + rte_xen_mem_phy2mch; + +} DPDK_2.1; + +DPDK_16.04 { + global: + + rte_cpu_get_flag_name; + rte_eal_primary_proc_alive; + +} DPDK_2.2; + +DPDK_16.07 { + global: + + pci_get_sysfs_path; + rte_keepalive_mark_sleep; + rte_keepalive_register_relay_callback; + rte_rtm_supported; + rte_thread_setname; + +} DPDK_16.04; + +DPDK_16.11 { + global: + + rte_delay_us_block; + rte_delay_us_callback_register; + rte_eal_dev_attach; + rte_eal_dev_detach; + +} DPDK_16.07; + +DPDK_17.02 { + global: + + rte_bus_dump; + rte_bus_probe; + rte_bus_register; + rte_bus_scan; + rte_bus_unregister; + +} DPDK_16.11; + +DPDK_17.05 { + global: + + rte_cpu_is_supported; + rte_intr_free_epoll_fd; + rte_log_dump; + rte_log_get_global_level; + rte_log_register; + rte_log_set_global_level; + rte_log_set_level; + rte_log_set_level_regexp; + rte_pci_detach; + rte_pci_dump; + rte_pci_ioport_map; + rte_pci_ioport_read; + rte_pci_ioport_unmap; + rte_pci_ioport_write; + rte_pci_map_device; + rte_pci_probe; + rte_pci_probe_one; + rte_pci_read_config; + rte_pci_register; + rte_pci_scan; + rte_pci_unmap_device; + rte_pci_unregister; + rte_pci_write_config; + rte_vdev_init; + rte_vdev_register; + rte_vdev_uninit; + rte_vdev_unregister; + vfio_get_container_fd; + vfio_get_group_fd; + vfio_get_group_no; + +} DPDK_17.02; |