diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/lib/env_dpdk/init.c | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/env_dpdk/init.c')
-rw-r--r-- | src/spdk/lib/env_dpdk/init.c | 604 |
1 files changed, 604 insertions, 0 deletions
diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c new file mode 100644 index 000000000..0376dbe7b --- /dev/null +++ b/src/spdk/lib/env_dpdk/init.c @@ -0,0 +1,604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include "spdk/version.h" +#include "spdk/env_dpdk.h" + +#include <rte_config.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_vfio.h> + +#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" +#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 + +static char **g_eal_cmdline; +static int g_eal_cmdline_argcount; +static bool g_external_init = true; + +static char * +_sprintf_alloc(const char *format, ...) +{ + va_list args; + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + va_start(args, format); + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + va_end(args); + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + va_end(args); + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. + * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. + */ + free(buf); + bufsize *= 2; + } + + va_end(args); + return NULL; +} + +void +spdk_env_opts_init(struct spdk_env_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; + opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; + opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; + opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; + opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; +} + +static void +free_args(char **args, int argcount) +{ + int i; + + if (args == NULL) { + return; + } + + for (i = 0; i < argcount; i++) { + free(args[i]); + } + + if (argcount) { + free(args); + } +} + +static char ** +push_arg(char *args[], int *argcount, char *arg) +{ + char **tmp; + + if (arg == NULL) { + fprintf(stderr, "%s: NULL arg supplied\n", __func__); + free_args(args, *argcount); + return NULL; + } + + tmp = realloc(args, sizeof(char *) * (*argcount + 1)); + if (tmp == NULL) { + free(arg); + free_args(args, *argcount); + return NULL; + } + + tmp[*argcount] = arg; + (*argcount)++; + + return tmp; +} + +#if defined(__linux__) && defined(__x86_64__) + +/* TODO: Can likely get this value from rlimits in the future */ +#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 +#define VTD_CAP_MGAW_SHIFT 16 +#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) + +static int +get_iommu_width(void) +{ + DIR *dir; + FILE *file; + struct dirent *entry; + char mgaw_path[64]; + char buf[64]; + char *end; + long long int val; + int width, tmp; + + dir = opendir("/sys/devices/virtual/iommu/"); + if (dir == NULL) { + return -EINVAL; + } + + width = 0; + + while ((entry = readdir(dir)) != NULL) { + /* Find directories named "dmar0", "dmar1", etc */ + if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) { + continue; + } + + tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", + entry->d_name); + if ((unsigned)tmp >= sizeof(mgaw_path)) { + continue; + } + + file = fopen(mgaw_path, "r"); + if (file == NULL) { + continue; + } + + if (fgets(buf, sizeof(buf), file) == NULL) { + fclose(file); + continue; + } + + val = strtoll(buf, &end, 16); + if (val == LLONG_MIN || val == LLONG_MAX) { + fclose(file); + continue; + } + + tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; + if (width == 0 || tmp < width) { + width = tmp; + } + + fclose(file); + } + + closedir(dir); + + return width; +} + +#endif + +static int +build_eal_cmdline(const struct spdk_env_opts *opts) +{ + int argcount = 0; + char **args; + + args = NULL; + + /* set the program name */ + args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); + if (args == NULL) { + return -1; + } + + /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); + if (args == NULL) { + return -1; + } + } + + /* set the coremask */ + /* NOTE: If coremask starts with '[' and ends with ']' it is a core list + */ + if (opts->core_mask[0] == '[') { + char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); + + if (l_arg != NULL) { + int len = strlen(l_arg); + + if (l_arg[len - 1] == ']') { + l_arg[len - 1] = '\0'; + } + } + args = push_arg(args, &argcount, l_arg); + } else { + args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); + } + + if (args == NULL) { + return -1; + } + + /* set the memory channel number */ + if (opts->mem_channel > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); + if (args == NULL) { + return -1; + } + } + + /* set the memory size */ + if (opts->mem_size >= 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); + if (args == NULL) { + return -1; + } + } + + /* set the master core */ + if (opts->master_core > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", + opts->master_core)); + if (args == NULL) { + return -1; + } + } + + /* set no pci if enabled */ + if (opts->no_pci) { + args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); + if (args == NULL) { + return -1; + } + } + + /* create just one hugetlbfs file */ + if (opts->hugepage_single_segments) { + args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); + if (args == NULL) { + return -1; + } + } + + /* unlink hugepages after initialization */ + if (opts->unlink_hugepage) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); + if (args == NULL) { + return -1; + } + } + + /* use a specific hugetlbfs mount */ + if (opts->hugedir) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); + if (args == NULL) { + return -1; + } + } + + if (opts->num_pci_addr) { + size_t i; + char bdf[32]; + struct spdk_pci_addr *pci_addr = + opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; + + for (i = 0; i < opts->num_pci_addr; i++) { + spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); + args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", + (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), + bdf)); + if (args == NULL) { + return -1; + } + } + } + + /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. + * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); + if (args == NULL) { + return -1; + } + + /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. + * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); + if (args == NULL) { + return -1; + } + + /* `user1` log type is used by rte_vhost, which prints an INFO log for each received + * vhost user message. We don't want that. The same log type is also used by a couple + * of other DPDK libs, but none of which we make use right now. If necessary, this can + * be overridden via opts->env_context. + */ + args = push_arg(args, &argcount, strdup("--log-level=user1:6")); + if (args == NULL) { + return -1; + } + + if (opts->env_context) { + args = push_arg(args, &argcount, strdup(opts->env_context)); + if (args == NULL) { + return -1; + } + } + +#ifdef __linux__ + + if (opts->iova_mode) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); + if (args == NULL) { + return -1; + } + } else { + /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, + * but DPDK guesses it should be iova-mode=va. Add a check and force + * iova-mode=pa here. */ + if (rte_vfio_noiommu_is_enabled()) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } + +#if defined(__x86_64__) + /* DPDK by default guesses that it should be using iova-mode=va so that it can + * support running as an unprivileged user. However, some systems (especially + * virtual machines) don't have an IOMMU capable of handling the full virtual + * address space and DPDK doesn't currently catch that. Add a check in SPDK + * and force iova-mode=pa here. */ + if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } +#elif defined(__PPC64__) + /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly + * auto-detect at the moment, so we'll just force it here. */ + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } +#endif + } + + + /* Set the base virtual address - it must be an address that is not in the + * ASAN shadow region, otherwise ASAN-enabled builds will ignore the + * mmap hint. + * + * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm + */ + args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); + if (args == NULL) { + return -1; + } + + /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. + * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two + * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split + * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. + */ +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); + if (args == NULL) { + return -1; + } + } +#endif + + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", + getpid())); + if (args == NULL) { + return -1; + } + } else { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", + opts->shm_id)); + if (args == NULL) { + return -1; + } + + /* set the process type */ + args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); + if (args == NULL) { + return -1; + } + } +#endif + + g_eal_cmdline = args; + g_eal_cmdline_argcount = argcount; + return argcount; +} + +int +spdk_env_dpdk_post_init(bool legacy_mem) +{ + int rc; + + pci_env_init(); + + rc = mem_map_init(legacy_mem); + if (rc < 0) { + fprintf(stderr, "Failed to allocate mem_map\n"); + return rc; + } + + rc = vtophys_init(); + if (rc < 0) { + fprintf(stderr, "Failed to initialize vtophys\n"); + return rc; + } + + return 0; +} + +void +spdk_env_dpdk_post_fini(void) +{ + pci_env_fini(); + + free_args(g_eal_cmdline, g_eal_cmdline_argcount); + g_eal_cmdline = NULL; + g_eal_cmdline_argcount = 0; +} + +int +spdk_env_init(const struct spdk_env_opts *opts) +{ + char **dpdk_args = NULL; + int i, rc; + int orig_optind; + bool legacy_mem; + + /* If SPDK env has been initialized before, then only pci env requires + * reinitialization. + */ + if (g_external_init == false) { + if (opts != NULL) { + fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); + return -EINVAL; + } + + printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); + pci_env_reinit(); + + return 0; + } + + if (opts == NULL) { + fprintf(stderr, "NULL arguments to initialize DPDK\n"); + return -EINVAL; + } + + rc = build_eal_cmdline(opts); + if (rc < 0) { + fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + return -EINVAL; + } + + printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + printf("[ DPDK EAL parameters: "); + for (i = 0; i < g_eal_cmdline_argcount; i++) { + printf("%s ", g_eal_cmdline[i]); + } + printf("]\n"); + + /* DPDK rearranges the array we pass to it, so make a copy + * before passing so we can still free the individual strings + * correctly. + */ + dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); + if (dpdk_args == NULL) { + fprintf(stderr, "Failed to allocate dpdk_args\n"); + return -ENOMEM; + } + memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); + + fflush(stdout); + orig_optind = optind; + optind = 1; + rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); + optind = orig_optind; + + free(dpdk_args); + + if (rc < 0) { + if (rte_errno == EALREADY) { + fprintf(stderr, "DPDK already initialized\n"); + } else { + fprintf(stderr, "Failed to initialize DPDK\n"); + } + return -rte_errno; + } + + legacy_mem = false; + if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { + legacy_mem = true; + } + + rc = spdk_env_dpdk_post_init(legacy_mem); + if (rc == 0) { + g_external_init = false; + } + + return rc; +} + +void +spdk_env_fini(void) +{ + spdk_env_dpdk_post_fini(); +} + +bool +spdk_env_dpdk_external_init(void) +{ + return g_external_init; +} |