author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/dpdk/lib/librte_eal/common
parent     Initial commit. (diff)
Adding upstream version 16.2.11+ds. (tag: upstream/16.2.11+ds, branch: upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/dpdk/lib/librte_eal/common')
50 files changed, 18271 insertions, 0 deletions
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c
new file mode 100644
index 000000000..baa5b532a
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_bus.c
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2016 NXP
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/queue.h>
+
+#include <rte_bus.h>
+#include <rte_debug.h>
+#include <rte_string_fns.h>
+#include <rte_errno.h>
+
+#include "eal_private.h"
+
+static struct rte_bus_list rte_bus_list =
+	TAILQ_HEAD_INITIALIZER(rte_bus_list);
+
+void
+rte_bus_register(struct rte_bus *bus)
+{
+	RTE_VERIFY(bus);
+	RTE_VERIFY(bus->name && strlen(bus->name));
+	/* A bus should mandatorily have the scan implemented */
+	RTE_VERIFY(bus->scan);
+	RTE_VERIFY(bus->probe);
+	RTE_VERIFY(bus->find_device);
+	/* Buses supporting driver plug also require unplug. */
+	RTE_VERIFY(!bus->plug || bus->unplug);
+
+	TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
+	RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name);
+}
+
+void
+rte_bus_unregister(struct rte_bus *bus)
+{
+	TAILQ_REMOVE(&rte_bus_list, bus, next);
+	RTE_LOG(DEBUG, EAL, "Unregistered [%s] bus.\n", bus->name);
+}
+
+/* Scan all the buses for registered devices */
+int
+rte_bus_scan(void)
+{
+	int ret;
+	struct rte_bus *bus = NULL;
+
+	TAILQ_FOREACH(bus, &rte_bus_list, next) {
+		ret = bus->scan();
+		if (ret)
+			RTE_LOG(ERR, EAL, "Scan for (%s) bus failed.\n",
+				bus->name);
+	}
+
+	return 0;
+}
+
+/* Probe all devices of all buses */
+int
+rte_bus_probe(void)
+{
+	int ret;
+	struct rte_bus *bus, *vbus = NULL;
+
+	TAILQ_FOREACH(bus, &rte_bus_list, next) {
+		if (!strcmp(bus->name, "vdev")) {
+			vbus = bus;
+			continue;
+		}
+
+		ret = bus->probe();
+		if (ret)
+			RTE_LOG(ERR, EAL, "Bus (%s) probe failed.\n",
+				bus->name);
+	}
+
+	if (vbus) {
+		ret = vbus->probe();
+		if (ret)
+			RTE_LOG(ERR, EAL, "Bus (%s) probe failed.\n",
+				vbus->name);
+	}
+
+	return 0;
+}
+
+/* Dump information of a single bus */
+static int
+bus_dump_one(FILE *f, struct rte_bus *bus)
+{
+	int ret;
+
+	/* For now, dump only the bus name */
+	ret = fprintf(f, " %s\n", bus->name);
+
+	/* Error in case of inability in writing to stream */
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+void
+rte_bus_dump(FILE *f)
+{
+	int ret;
+	struct rte_bus *bus;
+
+	TAILQ_FOREACH(bus, &rte_bus_list, next) {
+		ret = bus_dump_one(f, bus);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "Unable to write to stream (%d)\n",
+				ret);
+			break;
+		}
+	}
+}
+
+struct rte_bus *
+rte_bus_find(const struct rte_bus *start, rte_bus_cmp_t cmp,
+	     const void *data)
+{
+	struct rte_bus *bus;
+
+	if (start != NULL)
+		bus = TAILQ_NEXT(start, next);
+	else
+		bus = TAILQ_FIRST(&rte_bus_list);
+	while (bus != NULL) {
+		if (cmp(bus, data) == 0)
+			break;
+		bus = TAILQ_NEXT(bus, next);
+	}
+	return bus;
+}
+
+static int
+cmp_rte_device(const struct rte_device *dev1, const void *_dev2)
+{
+	const struct rte_device *dev2 = _dev2;
+
+	return dev1 != dev2;
+}
+
+static int
+bus_find_device(const struct rte_bus *bus, const void *_dev)
+{
+	struct rte_device *dev;
+
+	dev = bus->find_device(NULL, cmp_rte_device, _dev);
+	return dev == NULL;
+}
+
+struct rte_bus *
+rte_bus_find_by_device(const struct rte_device *dev)
+{
+	return rte_bus_find(NULL, bus_find_device, (const void *)dev);
+}
+
+static int
+cmp_bus_name(const struct rte_bus *bus, const void *_name)
+{
+	const char *name = _name;
+
+	return strcmp(bus->name, name);
+}
+
+struct rte_bus *
+rte_bus_find_by_name(const char *busname)
+{
+	return rte_bus_find(NULL, cmp_bus_name, (const void *)busname);
+}
+
+static int
+bus_can_parse(const struct rte_bus *bus, const void *_name)
+{
+	const char *name = _name;
+
+	return !(bus->parse && bus->parse(name, NULL) == 0);
+}
+
+struct rte_bus *
+rte_bus_find_by_device_name(const char *str)
+{
+	char name[RTE_DEV_NAME_MAX_LEN];
+	char *c;
+
+	strlcpy(name, str, sizeof(name));
+	c = strchr(name, ',');
+	if (c != NULL)
+		c[0] = '\0';
+	return rte_bus_find(NULL, bus_can_parse, name);
+}
+
+
+/*
+ * Get iommu class of devices on the bus.
+ */
+enum rte_iova_mode
+rte_bus_get_iommu_class(void)
+{
+	enum rte_iova_mode mode = RTE_IOVA_DC;
+	bool buses_want_va = false;
+	bool buses_want_pa = false;
+	struct rte_bus *bus;
+
+	TAILQ_FOREACH(bus, &rte_bus_list, next) {
+		enum rte_iova_mode bus_iova_mode;
+
+		if (bus->get_iommu_class == NULL)
+			continue;
+
+		bus_iova_mode = bus->get_iommu_class();
+		RTE_LOG(DEBUG, EAL, "Bus %s wants IOVA as '%s'\n",
+			bus->name,
+			bus_iova_mode == RTE_IOVA_DC ? "DC" :
+			(bus_iova_mode == RTE_IOVA_PA ? "PA" : "VA"));
+		if (bus_iova_mode == RTE_IOVA_PA)
+			buses_want_pa = true;
+		else if (bus_iova_mode == RTE_IOVA_VA)
+			buses_want_va = true;
+	}
+	if (buses_want_va && !buses_want_pa) {
+		mode = RTE_IOVA_VA;
+	} else if (buses_want_pa && !buses_want_va) {
+		mode = RTE_IOVA_PA;
+	} else {
+		mode = RTE_IOVA_DC;
+		if (buses_want_va) {
+			RTE_LOG(WARNING, EAL, "Some buses want 'VA' but forcing 'DC' because other buses want 'PA'.\n");
+			RTE_LOG(WARNING, EAL, "Depending on the final decision by the EAL, not all buses may be able to initialize.\n");
+		}
+	}
+
+	return mode;
+}
+
+static int
+bus_handle_sigbus(const struct rte_bus *bus,
+		  const void *failure_addr)
+{
+	int ret;
+
+	if (!bus->sigbus_handler)
+		return -1;
+
+	ret = bus->sigbus_handler(failure_addr);
+
+	/* find bus but handle failed, keep the errno be set. */
+	if (ret < 0 && rte_errno == 0)
+		rte_errno = ENOTSUP;
+
+	return ret > 0;
+}
+
+int
+rte_bus_sigbus_handler(const void *failure_addr)
+{
+	struct rte_bus *bus;
+
+	int ret = 0;
+	int old_errno = rte_errno;
+
+	rte_errno = 0;
+
+	bus = rte_bus_find(NULL, bus_handle_sigbus, failure_addr);
+	/* can not find bus. */
+	if (!bus)
+		return 1;
+	/* find bus but handle failed, pass on the new errno. */
+	else if (rte_errno != 0)
+		return -1;
+
+	/* restore the old errno. */
+	rte_errno = old_errno;
+
+	return ret;
+}
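
Usage sketch (not part of this patch; the my_bus_* names are hypothetical): a bus implementation has to satisfy the invariants rte_bus_register() verifies above — scan, probe and find_device must be set, and plug implies unplug — and typically registers itself at constructor time through the RTE_REGISTER_BUS() helper from rte_bus.h:

#include <rte_bus.h>

static int my_bus_scan(void) { return 0; /* discover devices here */ }
static int my_bus_probe(void) { return 0; /* match devices to drivers */ }

static struct rte_device *
my_bus_find_device(const struct rte_device *start, rte_dev_cmp_t cmp,
		   const void *data)
{
	(void)start; (void)cmp; (void)data;
	return NULL; /* walk this bus's device list here */
}

static struct rte_bus my_bus = {
	.scan = my_bus_scan,
	.probe = my_bus_probe,
	.find_device = my_bus_find_device,
	/* setting .plug would require .unplug as well */
};
RTE_REGISTER_BUS(my_bus, my_bus);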
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c
new file mode 100644
index 000000000..0187076af
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_class.c
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Gaëtan Rivet
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/queue.h>
+
+#include <rte_class.h>
+#include <rte_debug.h>
+
+static struct rte_class_list rte_class_list =
+	TAILQ_HEAD_INITIALIZER(rte_class_list);
+
+void
+rte_class_register(struct rte_class *class)
+{
+	RTE_VERIFY(class);
+	RTE_VERIFY(class->name && strlen(class->name));
+
+	TAILQ_INSERT_TAIL(&rte_class_list, class, next);
+	RTE_LOG(DEBUG, EAL, "Registered [%s] device class.\n", class->name);
+}
+
+void
+rte_class_unregister(struct rte_class *class)
+{
+	TAILQ_REMOVE(&rte_class_list, class, next);
+	RTE_LOG(DEBUG, EAL, "Unregistered [%s] device class.\n", class->name);
+}
+
+struct rte_class *
+rte_class_find(const struct rte_class *start, rte_class_cmp_t cmp,
+	       const void *data)
+{
+	struct rte_class *cls;
+
+	if (start != NULL)
+		cls = TAILQ_NEXT(start, next);
+	else
+		cls = TAILQ_FIRST(&rte_class_list);
+	while (cls != NULL) {
+		if (cmp(cls, data) == 0)
+			break;
+		cls = TAILQ_NEXT(cls, next);
+	}
+	return cls;
+}
+
+static int
+cmp_class_name(const struct rte_class *class, const void *_name)
+{
+	const char *name = _name;
+
+	return strcmp(class->name, name);
+}
+
+struct rte_class *
+rte_class_find_by_name(const char *name)
+{
+	return rte_class_find(NULL, cmp_class_name, (const void *)name);
+}
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c
new file mode 100644
index 000000000..dc5f75d05
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_cpuflags.c
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_cpuflags.h>
+
+int
+rte_cpu_is_supported(void)
+{
+	/* This is generated at compile-time by the build system */
+	static const enum rte_cpu_flag_t compile_time_flags[] = {
+		RTE_COMPILE_TIME_CPUFLAGS
+	};
+	unsigned count = RTE_DIM(compile_time_flags), i;
+	int ret;
+
+	for (i = 0; i < count; i++) {
+		ret = rte_cpu_get_flag_enabled(compile_time_flags[i]);
+
+		if (ret < 0) {
+			fprintf(stderr,
+				"ERROR: CPU feature flag lookup failed with error %d\n",
+				ret);
+			return 0;
+		}
+		if (!ret) {
+			fprintf(stderr,
+				"ERROR: This system does not support \"%s\".\n"
+				"Please check that RTE_MACHINE is set correctly.\n",
+				rte_cpu_get_flag_name(compile_time_flags[i]));
+			return 0;
+		}
+	}
+
+	return 1;
+}
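
A minimal sketch of the intended call pattern (not part of this patch; rte_eal_init() performs this check itself, so an explicit call is only needed in code that runs before EAL initialization):

#include <stdio.h>
#include <rte_cpuflags.h>

int main(void)
{
	/* refuse to run if the CPU lacks a feature this binary was built for */
	if (!rte_cpu_is_supported()) {
		fprintf(stderr, "unsupported CPU; rebuild with a different RTE_MACHINE\n");
		return 1;
	}
	return 0;
}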
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c
new file mode 100644
index 000000000..9e4f09d83
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_dev.c
@@ -0,0 +1,793 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation.
+ * Copyright(c) 2014 6WIND S.A.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/queue.h>
+
+#include <rte_compat.h>
+#include <rte_bus.h>
+#include <rte_class.h>
+#include <rte_dev.h>
+#include <rte_devargs.h>
+#include <rte_debug.h>
+#include <rte_errno.h>
+#include <rte_kvargs.h>
+#include <rte_log.h>
+#include <rte_spinlock.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+
+#include "eal_private.h"
+#include "hotplug_mp.h"
+
+/**
+ * The device event callback description.
+ *
+ * It contains callback address to be registered by user application,
+ * the pointer to the parameters for callback, and the device name.
+ */
+struct dev_event_callback {
+	TAILQ_ENTRY(dev_event_callback) next; /**< Callbacks list */
+	rte_dev_event_cb_fn cb_fn;            /**< Callback address */
+	void *cb_arg;                         /**< Callback parameter */
+	char *dev_name;	 /**< Callback device name, NULL is for all device */
+	uint32_t active;                      /**< Callback is executing */
+};
+
+/** @internal Structure to keep track of registered callbacks */
+TAILQ_HEAD(dev_event_cb_list, dev_event_callback);
+
+/* The device event callback list for all registered callbacks. */
+static struct dev_event_cb_list dev_event_cbs;
+
+/* spinlock for device callbacks */
+static rte_spinlock_t dev_event_lock = RTE_SPINLOCK_INITIALIZER;
+
+struct dev_next_ctx {
+	struct rte_dev_iterator *it;
+	const char *bus_str;
+	const char *cls_str;
+};
+
+#define CTX(it, bus_str, cls_str) \
+	(&(const struct dev_next_ctx){ \
+		.it = it, \
+		.bus_str = bus_str, \
+		.cls_str = cls_str, \
+	})
+
+#define ITCTX(ptr) \
+	(((struct dev_next_ctx *)(intptr_t)ptr)->it)
+
+#define BUSCTX(ptr) \
+	(((struct dev_next_ctx *)(intptr_t)ptr)->bus_str)
+
+#define CLSCTX(ptr) \
+	(((struct dev_next_ctx *)(intptr_t)ptr)->cls_str)
+
+static int cmp_dev_name(const struct rte_device *dev, const void *_name)
+{
+	const char *name = _name;
+
+	return strcmp(dev->name, name);
+}
+
+int
+rte_dev_is_probed(const struct rte_device *dev)
+{
+	/* The field driver should be set only when the probe is successful. */
+	return dev->driver != NULL;
+}
+
+/* helper function to build devargs, caller should free the memory */
+static int
+build_devargs(const char *busname, const char *devname,
+	      const char *drvargs, char **devargs)
+{
+	int length;
+
+	length = snprintf(NULL, 0, "%s:%s,%s", busname, devname, drvargs);
+	if (length < 0)
+		return -EINVAL;
+
+	*devargs = malloc(length + 1);
+	if (*devargs == NULL)
+		return -ENOMEM;
+
+	length = snprintf(*devargs, length + 1, "%s:%s,%s",
+			busname, devname, drvargs);
+	if (length < 0) {
+		free(*devargs);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int
+rte_eal_hotplug_add(const char *busname, const char *devname,
+		    const char *drvargs)
+{
+
+	char *devargs;
+	int ret;
+
+	ret = build_devargs(busname, devname, drvargs, &devargs);
+	if (ret != 0)
+		return ret;
+
+	ret = rte_dev_probe(devargs);
+	free(devargs);
+
+	return ret;
+}
+
+/* probe device at local process. */
+int
+local_dev_probe(const char *devargs, struct rte_device **new_dev)
+{
+	struct rte_device *dev;
+	struct rte_devargs *da;
+	int ret;
+
+	*new_dev = NULL;
+	da = calloc(1, sizeof(*da));
+	if (da == NULL)
+		return -ENOMEM;
+
+	ret = rte_devargs_parse(da, devargs);
+	if (ret)
+		goto err_devarg;
+
+	if (da->bus->plug == NULL) {
+		RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
+			da->bus->name);
+		ret = -ENOTSUP;
+		goto err_devarg;
+	}
+
+	ret = rte_devargs_insert(&da);
+	if (ret)
+		goto err_devarg;
+
+	/* the rte_devargs will be referenced in the matching rte_device */
+	ret = da->bus->scan();
+	if (ret)
+		goto err_devarg;
+
+	dev = da->bus->find_device(NULL, cmp_dev_name, da->name);
+	if (dev == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot find device (%s)\n",
+			da->name);
+		ret = -ENODEV;
+		goto err_devarg;
+	}
+	/* Since there is a matching device, it is now its responsibility
+	 * to manage the devargs we've just inserted. From this point
+	 * those devargs shouldn't be removed manually anymore.
+	 */
+
+	ret = dev->bus->plug(dev);
+	if (ret > 0)
+		ret = -ENOTSUP;
+
+	if (ret && !rte_dev_is_probed(dev)) { /* if hasn't ever succeeded */
+		RTE_LOG(ERR, EAL, "Driver cannot attach the device (%s)\n",
+			dev->name);
+		return ret;
+	}
+
+	*new_dev = dev;
+	return ret;
+
+err_devarg:
+	if (rte_devargs_remove(da) != 0) {
+		free(da->args);
+		free(da);
+	}
+	return ret;
+}
+
+int
+rte_dev_probe(const char *devargs)
+{
+	struct eal_dev_mp_req req;
+	struct rte_device *dev;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+	req.t = EAL_DEV_REQ_TYPE_ATTACH;
+	strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN);
+
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		/**
+		 * If in secondary process, just send IPC request to
+		 * primary process.
+		 */
+		ret = eal_dev_hotplug_request_to_primary(&req);
+		if (ret != 0) {
+			RTE_LOG(ERR, EAL,
+				"Failed to send hotplug request to primary\n");
+			return -ENOMSG;
+		}
+		if (req.result != 0)
+			RTE_LOG(ERR, EAL,
+				"Failed to hotplug add device\n");
+		return req.result;
+	}
+
+	/* attach a shared device from primary start from here: */
+
+	/* primary attach the new device itself. */
+	ret = local_dev_probe(devargs, &dev);
+
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to attach device on primary process\n");
+
+		/**
+		 * it is possible that secondary process failed to attached a
+		 * device that primary process have during initialization,
+		 * so for -EEXIST case, we still need to sync with secondary
+		 * process.
+		 */
+		if (ret != -EEXIST)
+			return ret;
+	}
+
+	/* primary send attach sync request to secondary. */
+	ret = eal_dev_hotplug_request_to_secondary(&req);
+
+	/* if any communication error, we need to rollback. */
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to send hotplug add request to secondary\n");
+		ret = -ENOMSG;
+		goto rollback;
+	}
+
+	/**
+	 * if any secondary failed to attach, we need to consider if rollback
+	 * is necessary.
+	 */
+	if (req.result != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to attach device on secondary process\n");
+		ret = req.result;
+
+		/* for -EEXIST, we don't need to rollback. */
+		if (ret == -EEXIST)
+			return ret;
+		goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK;
+
+	/* primary send rollback request to secondary. */
+	if (eal_dev_hotplug_request_to_secondary(&req) != 0)
+		RTE_LOG(WARNING, EAL,
+			"Failed to rollback device attach on secondary."
+			"Devices in secondary may not sync with primary\n");
+
+	/* primary rollback itself. */
+	if (local_dev_remove(dev) != 0)
+		RTE_LOG(WARNING, EAL,
+			"Failed to rollback device attach on primary."
+			"Devices in secondary may not sync with primary\n");
+
+	return ret;
+}
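
Usage sketch (not part of this patch; "net_null0" is just an example vdev name): from the primary process, rte_eal_hotplug_add() builds the "bus:name,args" devargs string via build_devargs() above and drives the whole probe/IPC/rollback sequence:

#include <rte_dev.h>

static int
attach_and_detach_null_port(void)
{
	int ret = rte_eal_hotplug_add("vdev", "net_null0", "");
	if (ret < 0)
		return ret;
	/* ... use the device ... */
	return rte_eal_hotplug_remove("vdev", "net_null0");
}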
+
+int
+rte_eal_hotplug_remove(const char *busname, const char *devname)
+{
+	struct rte_device *dev;
+	struct rte_bus *bus;
+
+	bus = rte_bus_find_by_name(busname);
+	if (bus == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", busname);
+		return -ENOENT;
+	}
+
+	dev = bus->find_device(NULL, cmp_dev_name, devname);
+	if (dev == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", devname);
+		return -EINVAL;
+	}
+
+	return rte_dev_remove(dev);
+}
+
+/* remove device at local process. */
+int
+local_dev_remove(struct rte_device *dev)
+{
+	int ret;
+
+	if (dev->bus->unplug == NULL) {
+		RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
+			dev->bus->name);
+		return -ENOTSUP;
+	}
+
+	ret = dev->bus->unplug(dev);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "Driver cannot detach the device (%s)\n",
+			dev->name);
+		return (ret < 0) ? ret : -ENOENT;
+	}
+
+	return 0;
+}
+
+int
+rte_dev_remove(struct rte_device *dev)
+{
+	struct eal_dev_mp_req req;
+	char *devargs;
+	int ret;
+
+	if (!rte_dev_is_probed(dev)) {
+		RTE_LOG(ERR, EAL, "Device is not probed\n");
+		return -ENOENT;
+	}
+
+	ret = build_devargs(dev->bus->name, dev->name, "", &devargs);
+	if (ret != 0)
+		return ret;
+
+	memset(&req, 0, sizeof(req));
+	req.t = EAL_DEV_REQ_TYPE_DETACH;
+	strlcpy(req.devargs, devargs, EAL_DEV_MP_DEV_ARGS_MAX_LEN);
+	free(devargs);
+
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		/**
+		 * If in secondary process, just send IPC request to
+		 * primary process.
+		 */
+		ret = eal_dev_hotplug_request_to_primary(&req);
+		if (ret != 0) {
+			RTE_LOG(ERR, EAL,
+				"Failed to send hotplug request to primary\n");
+			return -ENOMSG;
+		}
+		if (req.result != 0)
+			RTE_LOG(ERR, EAL,
+				"Failed to hotplug remove device\n");
+		return req.result;
+	}
+
+	/* detach a device from primary start from here: */
+
+	/* primary send detach sync request to secondary */
+	ret = eal_dev_hotplug_request_to_secondary(&req);
+
+	/**
+	 * if communication error, we need to rollback, because it is possible
+	 * part of the secondary processes still detached it successfully.
+	 */
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to send device detach request to secondary\n");
+		ret = -ENOMSG;
+		goto rollback;
+	}
+
+	/**
+	 * if any secondary failed to detach, we need to consider if rollback
+	 * is necessary.
+	 */
+	if (req.result != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to detach device on secondary process\n");
+		ret = req.result;
+		/**
+		 * if -ENOENT, we don't need to rollback, since devices is
+		 * already detached on secondary process.
+		 */
+		if (ret != -ENOENT)
+			goto rollback;
+	}
+
+	/* primary detach the device itself. */
+	ret = local_dev_remove(dev);
+
+	/* if primary failed, still need to consider if rollback is necessary */
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL,
+			"Failed to detach device on primary process\n");
+		/* if -ENOENT, we don't need to rollback */
+		if (ret == -ENOENT)
+			return ret;
+		goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK;
+
+	/* primary send rollback request to secondary. */
+	if (eal_dev_hotplug_request_to_secondary(&req) != 0)
+		RTE_LOG(WARNING, EAL,
+			"Failed to rollback device detach on secondary."
+			"Devices in secondary may not sync with primary\n");
+
+	return ret;
+}
+
+int
+rte_dev_event_callback_register(const char *device_name,
+				rte_dev_event_cb_fn cb_fn,
+				void *cb_arg)
+{
+	struct dev_event_callback *event_cb;
+	int ret;
+
+	if (!cb_fn)
+		return -EINVAL;
+
+	rte_spinlock_lock(&dev_event_lock);
+
+	if (TAILQ_EMPTY(&dev_event_cbs))
+		TAILQ_INIT(&dev_event_cbs);
+
+	TAILQ_FOREACH(event_cb, &dev_event_cbs, next) {
+		if (event_cb->cb_fn == cb_fn && event_cb->cb_arg == cb_arg) {
+			if (device_name == NULL && event_cb->dev_name == NULL)
+				break;
+			if (device_name == NULL || event_cb->dev_name == NULL)
+				continue;
+			if (!strcmp(event_cb->dev_name, device_name))
+				break;
+		}
+	}
+
+	/* create a new callback. */
+	if (event_cb == NULL) {
+		event_cb = malloc(sizeof(struct dev_event_callback));
+		if (event_cb != NULL) {
+			event_cb->cb_fn = cb_fn;
+			event_cb->cb_arg = cb_arg;
+			event_cb->active = 0;
+			if (!device_name) {
+				event_cb->dev_name = NULL;
+			} else {
+				event_cb->dev_name = strdup(device_name);
+				if (event_cb->dev_name == NULL) {
+					ret = -ENOMEM;
+					goto error;
+				}
+			}
+			TAILQ_INSERT_TAIL(&dev_event_cbs, event_cb, next);
+		} else {
+			RTE_LOG(ERR, EAL,
+				"Failed to allocate memory for device "
+				"event callback.");
+			ret = -ENOMEM;
+			goto error;
+		}
+	} else {
+		RTE_LOG(ERR, EAL,
+			"The callback is already exist, no need "
+			"to register again.\n");
+		ret = -EEXIST;
+	}
+
+	rte_spinlock_unlock(&dev_event_lock);
+	return 0;
+error:
+	free(event_cb);
+	rte_spinlock_unlock(&dev_event_lock);
+	return ret;
+}
+
+int
+rte_dev_event_callback_unregister(const char *device_name,
+				  rte_dev_event_cb_fn cb_fn,
+				  void *cb_arg)
+{
+	int ret = 0;
+	struct dev_event_callback *event_cb, *next;
+
+	if (!cb_fn)
+		return -EINVAL;
+
+	rte_spinlock_lock(&dev_event_lock);
+	/*walk through the callbacks and remove all that match. */
+	for (event_cb = TAILQ_FIRST(&dev_event_cbs); event_cb != NULL;
+	     event_cb = next) {
+
+		next = TAILQ_NEXT(event_cb, next);
+
+		if (device_name != NULL && event_cb->dev_name != NULL) {
+			if (!strcmp(event_cb->dev_name, device_name)) {
+				if (event_cb->cb_fn != cb_fn ||
+				    (cb_arg != (void *)-1 &&
+				    event_cb->cb_arg != cb_arg))
+					continue;
+			}
+		} else if (device_name != NULL) {
+			continue;
+		}
+
+		/*
+		 * if this callback is not executing right now,
+		 * then remove it.
+		 */
+		if (event_cb->active == 0) {
+			TAILQ_REMOVE(&dev_event_cbs, event_cb, next);
+			free(event_cb);
+			ret++;
+		} else {
+			continue;
+		}
+	}
+	rte_spinlock_unlock(&dev_event_lock);
+	return ret;
+}
+
+void
+rte_dev_event_callback_process(const char *device_name,
+			       enum rte_dev_event_type event)
+{
+	struct dev_event_callback *cb_lst;
+
+	if (device_name == NULL)
+		return;
+
+	rte_spinlock_lock(&dev_event_lock);
+
+	TAILQ_FOREACH(cb_lst, &dev_event_cbs, next) {
+		if (cb_lst->dev_name) {
+			if (strcmp(cb_lst->dev_name, device_name))
+				continue;
+		}
+		cb_lst->active = 1;
+		rte_spinlock_unlock(&dev_event_lock);
+		cb_lst->cb_fn(device_name, event,
+				cb_lst->cb_arg);
+		rte_spinlock_lock(&dev_event_lock);
+		cb_lst->active = 0;
+	}
+	rte_spinlock_unlock(&dev_event_lock);
+}
+
+int
+rte_dev_iterator_init(struct rte_dev_iterator *it,
+		      const char *dev_str)
+{
+	struct rte_devargs devargs;
+	struct rte_class *cls = NULL;
+	struct rte_bus *bus = NULL;
+
+	/* Having both bus_str and cls_str NULL is illegal,
+	 * marking this iterator as invalid unless
+	 * everything goes well.
+	 */
+	it->bus_str = NULL;
+	it->cls_str = NULL;
+
+	devargs.data = dev_str;
+	if (rte_devargs_layers_parse(&devargs, dev_str))
+		goto get_out;
+
+	bus = devargs.bus;
+	cls = devargs.cls;
+	/* The string should have at least
+	 * one layer specified.
+	 */
+	if (bus == NULL && cls == NULL) {
+		RTE_LOG(ERR, EAL,
+			"Either bus or class must be specified.\n");
+		rte_errno = EINVAL;
+		goto get_out;
+	}
+	if (bus != NULL && bus->dev_iterate == NULL) {
+		RTE_LOG(ERR, EAL, "Bus %s not supported\n", bus->name);
+		rte_errno = ENOTSUP;
+		goto get_out;
+	}
+	if (cls != NULL && cls->dev_iterate == NULL) {
+		RTE_LOG(ERR, EAL, "Class %s not supported\n", cls->name);
+		rte_errno = ENOTSUP;
+		goto get_out;
+	}
+	it->bus_str = devargs.bus_str;
+	it->cls_str = devargs.cls_str;
+	it->dev_str = dev_str;
+	it->bus = bus;
+	it->cls = cls;
+	it->device = NULL;
+	it->class_device = NULL;
+get_out:
+	return -rte_errno;
+}
+
+static char *
+dev_str_sane_copy(const char *str)
+{
+	size_t end;
+	char *copy;
+
+	end = strcspn(str, ",/");
+	if (str[end] == ',') {
+		copy = strdup(&str[end + 1]);
+	} else {
+		/* '/' or '\0' */
+		copy = strdup("");
+	}
+	if (copy == NULL) {
+		rte_errno = ENOMEM;
+	} else {
+		char *slash;
+
+		slash = strchr(copy, '/');
+		if (slash != NULL)
+			slash[0] = '\0';
+	}
+	return copy;
+}
+
+static int
+class_next_dev_cmp(const struct rte_class *cls,
+		   const void *ctx)
+{
+	struct rte_dev_iterator *it;
+	const char *cls_str = NULL;
+	void *dev;
+
+	if (cls->dev_iterate == NULL)
+		return 1;
+	it = ITCTX(ctx);
+	cls_str = CLSCTX(ctx);
+	dev = it->class_device;
+	/* it->cls_str != NULL means a class
+	 * was specified in the devstr.
+	 */
+	if (it->cls_str != NULL && cls != it->cls)
+		return 1;
+	/* If an error occurred previously,
+	 * no need to test further.
+	 */
+	if (rte_errno != 0)
+		return -1;
+	dev = cls->dev_iterate(dev, cls_str, it);
+	it->class_device = dev;
+	return dev == NULL;
+}
+
+static int
+bus_next_dev_cmp(const struct rte_bus *bus,
+		 const void *ctx)
+{
+	struct rte_device *dev = NULL;
+	struct rte_class *cls = NULL;
+	struct rte_dev_iterator *it;
+	const char *bus_str = NULL;
+
+	if (bus->dev_iterate == NULL)
+		return 1;
+	it = ITCTX(ctx);
+	bus_str = BUSCTX(ctx);
+	dev = it->device;
+	/* it->bus_str != NULL means a bus
+	 * was specified in the devstr.
+	 */
+	if (it->bus_str != NULL && bus != it->bus)
+		return 1;
+	/* If an error occurred previously,
+	 * no need to test further.
+	 */
+	if (rte_errno != 0)
+		return -1;
+	if (it->cls_str == NULL) {
+		dev = bus->dev_iterate(dev, bus_str, it);
+		goto end;
+	}
+	/* cls_str != NULL */
+	if (dev == NULL) {
+next_dev_on_bus:
+		dev = bus->dev_iterate(dev, bus_str, it);
+		it->device = dev;
+	}
+	if (dev == NULL)
+		return 1;
+	if (it->cls != NULL)
+		cls = TAILQ_PREV(it->cls, rte_class_list, next);
+	cls = rte_class_find(cls, class_next_dev_cmp, ctx);
+	if (cls != NULL) {
+		it->cls = cls;
+		goto end;
+	}
+	goto next_dev_on_bus;
+end:
+	it->device = dev;
+	return dev == NULL;
+}
+struct rte_device *
+rte_dev_iterator_next(struct rte_dev_iterator *it)
+{
+	struct rte_bus *bus = NULL;
+	int old_errno = rte_errno;
+	char *bus_str = NULL;
+	char *cls_str = NULL;
+
+	rte_errno = 0;
+	if (it->bus_str == NULL && it->cls_str == NULL) {
+		/* Invalid iterator. */
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	if (it->bus != NULL)
+		bus = TAILQ_PREV(it->bus, rte_bus_list, next);
+	if (it->bus_str != NULL) {
+		bus_str = dev_str_sane_copy(it->bus_str);
+		if (bus_str == NULL)
+			goto out;
+	}
+	if (it->cls_str != NULL) {
+		cls_str = dev_str_sane_copy(it->cls_str);
+		if (cls_str == NULL)
+			goto out;
+	}
+	while ((bus = rte_bus_find(bus, bus_next_dev_cmp,
+				   CTX(it, bus_str, cls_str)))) {
+		if (it->device != NULL) {
+			it->bus = bus;
+			goto out;
+		}
+		if (it->bus_str != NULL ||
+		    rte_errno != 0)
+			break;
+	}
+	if (rte_errno == 0)
+		rte_errno = old_errno;
+out:
+	free(bus_str);
+	free(cls_str);
+	return it->device;
+}
+
+int
+rte_dev_dma_map(struct rte_device *dev, void *addr, uint64_t iova,
+		size_t len)
+{
+	if (dev->bus->dma_map == NULL || len == 0) {
+		rte_errno = ENOTSUP;
+		return -1;
+	}
+	/* Memory must be registered through rte_extmem_* APIs */
+	if (rte_mem_virt2memseg_list(addr) == NULL) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	return dev->bus->dma_map(dev, addr, iova, len);
+}
+
+int
+rte_dev_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova,
+		  size_t len)
+{
+	if (dev->bus->dma_unmap == NULL || len == 0) {
+		rte_errno = ENOTSUP;
+		return -1;
+	}
+	/* Memory must be registered through rte_extmem_* APIs */
+	if (rte_mem_virt2memseg_list(addr) == NULL) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	return dev->bus->dma_unmap(dev, addr, iova, len);
+}
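
Usage sketch (not part of this patch): the iterator above is normally driven through the RTE_DEV_FOREACH() macro from rte_dev.h, with a layered devstr such as "bus=pci" or "class=eth":

#include <stdio.h>
#include <rte_dev.h>

static void
list_pci_devices(void)
{
	struct rte_dev_iterator it;
	struct rte_device *dev;

	RTE_DEV_FOREACH(dev, "bus=pci", &it)
		printf("found %s\n", dev->name);
}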
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c
new file mode 100644
index 000000000..2123773ef
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_devargs.c
@@ -0,0 +1,403 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2014 6WIND S.A.
+ */
+
+/* This file manages the list of devices and their arguments, as given
+ * by the user at startup
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include <rte_bus.h>
+#include <rte_class.h>
+#include <rte_compat.h>
+#include <rte_dev.h>
+#include <rte_devargs.h>
+#include <rte_errno.h>
+#include <rte_kvargs.h>
+#include <rte_log.h>
+#include <rte_tailq.h>
+#include "eal_private.h"
+
+/** user device double-linked queue type definition */
+TAILQ_HEAD(rte_devargs_list, rte_devargs);
+
+/** Global list of user devices */
+static struct rte_devargs_list devargs_list =
+	TAILQ_HEAD_INITIALIZER(devargs_list);
+
+static size_t
+devargs_layer_count(const char *s)
+{
+	size_t i = s ? 1 : 0;
+
+	while (s != NULL && s[0] != '\0') {
+		i += s[0] == '/';
+		s++;
+	}
+	return i;
+}
+
+int
+rte_devargs_layers_parse(struct rte_devargs *devargs,
+			 const char *devstr)
+{
+	struct {
+		const char *key;
+		const char *str;
+		struct rte_kvargs *kvlist;
+	} layers[] = {
+		{ "bus=", NULL, NULL, },
+		{ "class=", NULL, NULL, },
+		{ "driver=", NULL, NULL, },
+	};
+	struct rte_kvargs_pair *kv = NULL;
+	struct rte_class *cls = NULL;
+	struct rte_bus *bus = NULL;
+	const char *s = devstr;
+	size_t nblayer;
+	size_t i = 0;
+	int ret = 0;
+
+	/* Split each sub-lists. */
+	nblayer = devargs_layer_count(devstr);
+	if (nblayer > RTE_DIM(layers)) {
+		RTE_LOG(ERR, EAL, "Invalid format: too many layers (%zu)\n",
+			nblayer);
+		ret = -E2BIG;
+		goto get_out;
+	}
+
+	/* If the devargs points the devstr
+	 * as source data, then it should not allocate
+	 * anything and keep referring only to it.
+	 */
+	if (devargs->data != devstr) {
+		devargs->data = strdup(devstr);
+		if (devargs->data == NULL) {
+			RTE_LOG(ERR, EAL, "OOM\n");
+			ret = -ENOMEM;
+			goto get_out;
+		}
+		s = devargs->data;
+	}
+
+	while (s != NULL) {
+		if (i >= RTE_DIM(layers)) {
+			RTE_LOG(ERR, EAL, "Unrecognized layer %s\n", s);
+			ret = -EINVAL;
+			goto get_out;
+		}
+		/*
+		 * The last layer is free-form.
+		 * The "driver" key is not required (but accepted).
+		 */
+		if (strncmp(layers[i].key, s, strlen(layers[i].key)) &&
+		    i != RTE_DIM(layers) - 1)
+			goto next_layer;
+		layers[i].str = s;
+		layers[i].kvlist = rte_kvargs_parse_delim(s, NULL, "/");
+		if (layers[i].kvlist == NULL) {
+			RTE_LOG(ERR, EAL, "Could not parse %s\n", s);
+			ret = -EINVAL;
+			goto get_out;
+		}
+		s = strchr(s, '/');
+		if (s != NULL)
+			s++;
+next_layer:
+		i++;
+	}
+
+	/* Parse each sub-list. */
+	for (i = 0; i < RTE_DIM(layers); i++) {
+		if (layers[i].kvlist == NULL)
+			continue;
+		kv = &layers[i].kvlist->pairs[0];
+		if (strcmp(kv->key, "bus") == 0) {
+			bus = rte_bus_find_by_name(kv->value);
+			if (bus == NULL) {
+				RTE_LOG(ERR, EAL, "Could not find bus \"%s\"\n",
+					kv->value);
+				ret = -EFAULT;
+				goto get_out;
+			}
+		} else if (strcmp(kv->key, "class") == 0) {
+			cls = rte_class_find_by_name(kv->value);
+			if (cls == NULL) {
+				RTE_LOG(ERR, EAL, "Could not find class \"%s\"\n",
+					kv->value);
+				ret = -EFAULT;
+				goto get_out;
+			}
+		} else if (strcmp(kv->key, "driver") == 0) {
+			/* Ignore */
+			continue;
+		}
+	}
+
+	/* Fill devargs fields. */
+	devargs->bus_str = layers[0].str;
+	devargs->cls_str = layers[1].str;
+	devargs->drv_str = layers[2].str;
+	devargs->bus = bus;
+	devargs->cls = cls;
+
+	/* If we own the data, clean up a bit
+	 * the several layers string, to ease
+	 * their parsing afterward.
+	 */
+	if (devargs->data != devstr) {
+		char *s = (void *)(intptr_t)(devargs->data);
+
+		while ((s = strchr(s, '/'))) {
+			*s = '\0';
+			s++;
+		}
+	}
+
+get_out:
+	for (i = 0; i < RTE_DIM(layers); i++) {
+		if (layers[i].kvlist)
+			rte_kvargs_free(layers[i].kvlist);
+	}
+	if (ret != 0)
+		rte_errno = -ret;
+	return ret;
+}
+
+static int
+bus_name_cmp(const struct rte_bus *bus, const void *name)
+{
+	return strncmp(bus->name, name, strlen(bus->name));
+}
+
+int
+rte_devargs_parse(struct rte_devargs *da, const char *dev)
+{
+	struct rte_bus *bus = NULL;
+	const char *devname;
+	const size_t maxlen = sizeof(da->name);
+	size_t i;
+
+	if (da == NULL)
+		return -EINVAL;
+
+	/* Retrieve eventual bus info */
+	do {
+		devname = dev;
+		bus = rte_bus_find(bus, bus_name_cmp, dev);
+		if (bus == NULL)
+			break;
+		devname = dev + strlen(bus->name) + 1;
+		if (rte_bus_find_by_device_name(devname) == bus)
+			break;
+	} while (1);
+	/* Store device name */
+	i = 0;
+	while (devname[i] != '\0' && devname[i] != ',') {
+		da->name[i] = devname[i];
+		i++;
+		if (i == maxlen) {
+			RTE_LOG(WARNING, EAL, "Parsing \"%s\": device name should be shorter than %zu\n",
+				dev, maxlen);
+			da->name[i - 1] = '\0';
+			return -EINVAL;
+		}
+	}
+	da->name[i] = '\0';
+	if (bus == NULL) {
+		bus = rte_bus_find_by_device_name(da->name);
+		if (bus == NULL) {
+			RTE_LOG(ERR, EAL, "failed to parse device \"%s\"\n",
+				da->name);
+			return -EFAULT;
+		}
+	}
+	da->bus = bus;
+	/* Parse eventual device arguments */
+	if (devname[i] == ',')
+		da->args = strdup(&devname[i + 1]);
+	else
+		da->args = strdup("");
+	if (da->args == NULL) {
+		RTE_LOG(ERR, EAL, "not enough memory to parse arguments\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int
+rte_devargs_parsef(struct rte_devargs *da, const char *format, ...)
+{
+	va_list ap;
+	size_t len;
+	char *dev;
+	int ret;
+
+	if (da == NULL)
+		return -EINVAL;
+
+	va_start(ap, format);
+	len = vsnprintf(NULL, 0, format, ap);
+	va_end(ap);
+
+	dev = calloc(1, len + 1);
+	if (dev == NULL) {
+		RTE_LOG(ERR, EAL, "not enough memory to parse device\n");
+		return -ENOMEM;
+	}
+
+	va_start(ap, format);
+	vsnprintf(dev, len + 1, format, ap);
+	va_end(ap);
+
+	ret = rte_devargs_parse(da, dev);
+
+	free(dev);
+	return ret;
+}
+
+int
+rte_devargs_insert(struct rte_devargs **da)
+{
+	struct rte_devargs *listed_da;
+	void *tmp;
+
+	if (*da == NULL || (*da)->bus == NULL)
+		return -1;
+
+	TAILQ_FOREACH_SAFE(listed_da, &devargs_list, next, tmp) {
+		if (listed_da == *da)
+			/* devargs already in the list */
+			return 0;
+		if (strcmp(listed_da->bus->name, (*da)->bus->name) == 0 &&
+		    strcmp(listed_da->name, (*da)->name) == 0) {
+			/* device already in devargs list, must be updated */
+			listed_da->type = (*da)->type;
+			listed_da->policy = (*da)->policy;
+			free(listed_da->args);
+			listed_da->args = (*da)->args;
+			listed_da->bus = (*da)->bus;
+			listed_da->cls = (*da)->cls;
+			listed_da->bus_str = (*da)->bus_str;
+			listed_da->cls_str = (*da)->cls_str;
+			listed_da->data = (*da)->data;
+			/* replace provided devargs with found one */
+			free(*da);
+			*da = listed_da;
+			return 0;
+		}
+	}
+	/* new device in the list */
+	TAILQ_INSERT_TAIL(&devargs_list, *da, next);
+	return 0;
+}
+
+/* store a whitelist parameter for later parsing */
+int
+rte_devargs_add(enum rte_devtype devtype, const char *devargs_str)
+{
+	struct rte_devargs *devargs = NULL;
+	struct rte_bus *bus = NULL;
+	const char *dev = devargs_str;
+
+	/* use calloc instead of rte_zmalloc as it's called early at init */
+	devargs = calloc(1, sizeof(*devargs));
+	if (devargs == NULL)
+		goto fail;
+
+	if (rte_devargs_parse(devargs, dev))
+		goto fail;
+	devargs->type = devtype;
+	bus = devargs->bus;
+	if (devargs->type == RTE_DEVTYPE_BLACKLISTED_PCI)
+		devargs->policy = RTE_DEV_BLACKLISTED;
+	if (bus->conf.scan_mode == RTE_BUS_SCAN_UNDEFINED) {
+		if (devargs->policy == RTE_DEV_WHITELISTED)
+			bus->conf.scan_mode = RTE_BUS_SCAN_WHITELIST;
+		else if (devargs->policy == RTE_DEV_BLACKLISTED)
+			bus->conf.scan_mode = RTE_BUS_SCAN_BLACKLIST;
+	}
+	TAILQ_INSERT_TAIL(&devargs_list, devargs, next);
+	return 0;
+
+fail:
+	if (devargs) {
+		free(devargs->args);
+		free(devargs);
+	}
+
+	return -1;
+}
+
+int
+rte_devargs_remove(struct rte_devargs *devargs)
+{
+	struct rte_devargs *d;
+	void *tmp;
+
+	if (devargs == NULL || devargs->bus == NULL)
+		return -1;
+
+	TAILQ_FOREACH_SAFE(d, &devargs_list, next, tmp) {
+		if (strcmp(d->bus->name, devargs->bus->name) == 0 &&
+		    strcmp(d->name, devargs->name) == 0) {
+			TAILQ_REMOVE(&devargs_list, d, next);
+			free(d->args);
+			free(d);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* count the number of devices of a specified type */
+unsigned int
+rte_devargs_type_count(enum rte_devtype devtype)
+{
+	struct rte_devargs *devargs;
+	unsigned int count = 0;
+
+	TAILQ_FOREACH(devargs, &devargs_list, next) {
+		if (devargs->type != devtype)
+			continue;
+		count++;
+	}
+	return count;
+}
+
+/* dump the user devices on the console */
+void
+rte_devargs_dump(FILE *f)
+{
+	struct rte_devargs *devargs;
+
+	fprintf(f, "User device list:\n");
+	TAILQ_FOREACH(devargs, &devargs_list, next) {
+		fprintf(f, "  [%s]: %s %s\n",
+			(devargs->bus ? devargs->bus->name : "??"),
+			devargs->name, devargs->args);
+	}
+}
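
A sketch of the parsing entry points above (not part of this patch; the PCI address and arguments are made up, and on success da.args is allocated and owned by the caller):

#include <stdlib.h>
#include <string.h>
#include <rte_devargs.h>

static int
parse_example(void)
{
	struct rte_devargs da;

	memset(&da, 0, sizeof(da));
	if (rte_devargs_parsef(&da, "%s:%s,%s", "pci", "0000:01:00.0", "rxq=4"))
		return -1;
	/* da.bus, da.name and da.args are now filled in */
	free(da.args);
	return 0;
}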
+
+/* bus-aware rte_devargs iterator. */
+struct rte_devargs *
+rte_devargs_next(const char *busname, const struct rte_devargs *start)
+{
+	struct rte_devargs *da;
+
+	if (start != NULL)
+		da = TAILQ_NEXT(start, next);
+	else
+		da = TAILQ_FIRST(&devargs_list);
+	while (da != NULL) {
+		if (busname == NULL ||
+		    (strcmp(busname, da->bus->name) == 0))
+			return da;
+		da = TAILQ_NEXT(da, next);
+	}
+	return NULL;
+}
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c
new file mode 100644
index 000000000..2a10fb823
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_errno.c
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+/* Use XSI-compliant portable version of strerror_r() */
+#undef _GNU_SOURCE
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <errno.h>
+
+#include <rte_per_lcore.h>
+#include <rte_errno.h>
+#include <rte_string_fns.h>
+
+RTE_DEFINE_PER_LCORE(int, _rte_errno);
+
+const char *
+rte_strerror(int errnum)
+{
+	/* BSD puts a colon in the "unknown error" messages, Linux doesn't */
+#ifdef RTE_EXEC_ENV_FREEBSD
+	static const char *sep = ":";
+#else
+	static const char *sep = "";
+#endif
+#define RETVAL_SZ 256
+	static RTE_DEFINE_PER_LCORE(char[RETVAL_SZ], retval);
+	char *ret = RTE_PER_LCORE(retval);
+
+	/* since some implementations of strerror_r throw an error
+	 * themselves if errnum is too big, we handle that case here */
+	if (errnum >= RTE_MAX_ERRNO)
+		snprintf(ret, RETVAL_SZ, "Unknown error%s %d", sep, errnum);
+	else
+		switch (errnum){
+		case E_RTE_SECONDARY:
+			return "Invalid call in secondary process";
+		case E_RTE_NO_CONFIG:
+			return "Missing rte_config structure";
+		default:
+			if (strerror_r(errnum, ret, RETVAL_SZ) != 0)
+				snprintf(ret, RETVAL_SZ, "Unknown error%s %d",
+					sep, errnum);
+		}
+
+	return ret;
+}
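
Usage sketch: rte_strerror() extends strerror() with the DPDK-private errno values and returns a per-lcore buffer, so the pointer is only stable until the next call on the same lcore:

#include <stdio.h>
#include <errno.h>
#include <rte_errno.h>

static void
report_errors(void)
{
	printf("%s\n", rte_strerror(E_RTE_SECONDARY)); /* DPDK-specific value */
	printf("%s\n", rte_strerror(EINVAL));          /* falls through to strerror_r() */
}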
diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c
new file mode 100644
index 000000000..4f8f1af73
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_fbarray.c
@@ -0,0 +1,1510 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <errno.h>
+#include <sys/file.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "eal_filesystem.h"
+#include "eal_private.h"
+
+#include "rte_fbarray.h"
+
+#define MASK_SHIFT 6ULL
+#define MASK_ALIGN (1ULL << MASK_SHIFT)
+#define MASK_LEN_TO_IDX(x) ((x) >> MASK_SHIFT)
+#define MASK_LEN_TO_MOD(x) ((x) - RTE_ALIGN_FLOOR(x, MASK_ALIGN))
+#define MASK_GET_IDX(idx, mod) ((idx << MASK_SHIFT) + mod)
+
+/*
+ * We use this to keep track of created/attached memory areas to prevent user
+ * errors in API usage.
+ */
+struct mem_area {
+	TAILQ_ENTRY(mem_area) next;
+	void *addr;
+	size_t len;
+	int fd;
+};
+TAILQ_HEAD(mem_area_head, mem_area);
+/* local per-process tailq */
+static struct mem_area_head mem_area_tailq =
+	TAILQ_HEAD_INITIALIZER(mem_area_tailq);
+static rte_spinlock_t mem_area_lock = RTE_SPINLOCK_INITIALIZER;
+
+/*
+ * This is a mask that is always stored at the end of array, to provide fast
+ * way of finding free/used spots without looping through each element.
+ */
+
+struct used_mask {
+	unsigned int n_masks;
+	uint64_t data[];
+};
+
+static size_t
+calc_mask_size(unsigned int len)
+{
+	/* mask must be multiple of MASK_ALIGN, even though length of array
+	 * itself may not be aligned on that boundary.
+	 */
+	len = RTE_ALIGN_CEIL(len, MASK_ALIGN);
+	return sizeof(struct used_mask) +
+			sizeof(uint64_t) * MASK_LEN_TO_IDX(len);
+}
+
+static size_t
+calc_data_size(size_t page_sz, unsigned int elt_sz, unsigned int len)
+{
+	size_t data_sz = elt_sz * len;
+	size_t msk_sz = calc_mask_size(len);
+	return RTE_ALIGN_CEIL(data_sz + msk_sz, page_sz);
+}
+
+static struct used_mask *
+get_used_mask(void *data, unsigned int elt_sz, unsigned int len)
+{
+	return (struct used_mask *) RTE_PTR_ADD(data, elt_sz * len);
+}
+
+static int
+resize_and_map(int fd, void *addr, size_t len)
+{
+	char path[PATH_MAX];
+	void *map_addr;
+
+	if (ftruncate(fd, len)) {
+		RTE_LOG(ERR, EAL, "Cannot truncate %s\n", path);
+		/* pass errno up the chain */
+		rte_errno = errno;
+		return -1;
+	}
+
+	map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_FIXED, fd, 0);
+	if (map_addr != addr) {
+		RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
+		/* pass errno up the chain */
+		rte_errno = errno;
+		return -1;
+	}
+	return 0;
+}
+
+static int
+overlap(const struct mem_area *ma, const void *start, size_t len)
+{
+	const void *end = RTE_PTR_ADD(start, len);
+	const void *ma_start = ma->addr;
+	const void *ma_end = RTE_PTR_ADD(ma->addr, ma->len);
+
+	/* start overlap? */
+	if (start >= ma_start && start < ma_end)
+		return 1;
+	/* end overlap? */
+	if (end >= ma_start && end < ma_end)
+		return 1;
+	return 0;
+}
+
+static int
+find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n,
+	    bool used)
+{
+	const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
+			arr->len);
+	unsigned int msk_idx, lookahead_idx, first, first_mod;
+	unsigned int last, last_mod;
+	uint64_t last_msk, ignore_msk;
+
+	/*
+	 * mask only has granularity of MASK_ALIGN, but start may not be aligned
+	 * on that boundary, so construct a special mask to exclude anything we
+	 * don't want to see to avoid confusing ctz.
+	 */
+	first = MASK_LEN_TO_IDX(start);
+	first_mod = MASK_LEN_TO_MOD(start);
+	ignore_msk = ~((1ULL << first_mod) - 1);
+
+	/* array length may not be aligned, so calculate ignore mask for last
+	 * mask index.
+	 */
+	last = MASK_LEN_TO_IDX(arr->len);
+	last_mod = MASK_LEN_TO_MOD(arr->len);
+	last_msk = ~(-1ULL << last_mod);
+
+	for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) {
+		uint64_t cur_msk, lookahead_msk;
+		unsigned int run_start, clz, left;
+		bool found = false;
+		/*
+		 * The process of getting n consecutive bits for arbitrary n is
+		 * a bit involved, but here it is in a nutshell:
+		 *
+		 * 1. let n be the number of consecutive bits we're looking for
+		 * 2. check if n can fit in one mask, and if so, do n-1
+		 *    rshift-ands to see if there is an appropriate run inside
+		 *    our current mask
+		 *    2a. if we found a run, bail out early
+		 *    2b. if we didn't find a run, proceed
+		 * 3. invert the mask and count leading zeroes (that is, count
+		 *    how many consecutive set bits we had starting from the
+		 *    end of current mask) as k
+		 *    3a. if k is 0, continue to next mask
+		 *    3b. if k is not 0, we have a potential run
+		 * 4. to satisfy our requirements, next mask must have n-k
+		 *    consecutive set bits right at the start, so we will do
+		 *    (n-k-1) rshift-ands and check if first bit is set.
+		 *
+		 * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until
+		 * we either run out of masks, lose the run, or find what we
+		 * were looking for.
+		 */
+		cur_msk = msk->data[msk_idx];
+		left = n;
+
+		/* if we're looking for free spaces, invert the mask */
+		if (!used)
+			cur_msk = ~cur_msk;
+
+		/* combine current ignore mask with last index ignore mask */
+		if (msk_idx == last)
+			ignore_msk |= last_msk;
+
+		/* if we have an ignore mask, ignore once */
+		if (ignore_msk) {
+			cur_msk &= ignore_msk;
+			ignore_msk = 0;
+		}
+
+		/* if n can fit in within a single mask, do a search */
+		if (n <= MASK_ALIGN) {
+			uint64_t tmp_msk = cur_msk;
+			unsigned int s_idx;
+			for (s_idx = 0; s_idx < n - 1; s_idx++)
+				tmp_msk &= tmp_msk >> 1ULL;
+			/* we found what we were looking for */
+			if (tmp_msk != 0) {
+				run_start = __builtin_ctzll(tmp_msk);
+				return MASK_GET_IDX(msk_idx, run_start);
+			}
+		}
+
+		/*
+		 * we didn't find our run within the mask, or n > MASK_ALIGN,
+		 * so we're going for plan B.
+		 */
+
+		/* count leading zeroes on inverted mask */
+		if (~cur_msk == 0)
+			clz = sizeof(cur_msk) * 8;
+		else
+			clz = __builtin_clzll(~cur_msk);
+
+		/* if there aren't any runs at the end either, just continue */
+		if (clz == 0)
+			continue;
+
+		/* we have a partial run at the end, so try looking ahead */
+		run_start = MASK_ALIGN - clz;
+		left -= clz;
+
+		for (lookahead_idx = msk_idx + 1; lookahead_idx < msk->n_masks;
+				lookahead_idx++) {
+			unsigned int s_idx, need;
+			lookahead_msk = msk->data[lookahead_idx];
+
+			/* if we're looking for free space, invert the mask */
+			if (!used)
+				lookahead_msk = ~lookahead_msk;
+
+			/* figure out how many consecutive bits we need here */
+			need = RTE_MIN(left, MASK_ALIGN);
+
+			for (s_idx = 0; s_idx < need - 1; s_idx++)
+				lookahead_msk &= lookahead_msk >> 1ULL;
+
+			/* if first bit is not set, we've lost the run */
+			if ((lookahead_msk & 1) == 0) {
+				/*
+				 * we've scanned this far, so we know there are
+				 * no runs in the space we've lookahead-scanned
+				 * as well, so skip that on next iteration.
+				 */
+				ignore_msk = ~((1ULL << need) - 1);
+				msk_idx = lookahead_idx;
+				break;
+			}
+
+			left -= need;
+
+			/* check if we've found what we were looking for */
+			if (left == 0) {
+				found = true;
+				break;
+			}
+		}
+
+		/* we didn't find anything, so continue */
+		if (!found)
+			continue;
+
+		return MASK_GET_IDX(msk_idx, run_start);
+	}
+	/* we didn't find anything */
+	rte_errno = used ? ENOENT : ENOSPC;
+	return -1;
+}
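
A standalone illustration of the rshift-and trick in step 2 above (not part of this patch): after n-1 rounds of msk &= msk >> 1, bit i survives iff bits i..i+n-1 were all set, so ctz of the result is the start of the first n-bit run:

#include <stdint.h>
#include <stdio.h>

static int
find_run(uint64_t msk, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n - 1; i++)
		msk &= msk >> 1;
	return msk != 0 ? __builtin_ctzll(msk) : -1;
}

int main(void)
{
	/* 0x76 = 0b01110110: a run of 2 at bit 1 and a run of 3 at bit 4 */
	printf("%d\n", find_run(0x76, 3)); /* prints 4 */
	return 0;
}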
+ */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + for (idx = first; idx < msk->n_masks; idx++) { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find first set bit - that will correspond to whatever it is + * that we're looking for. + */ + found = __builtin_ctzll(cur); + return MASK_GET_IDX(idx, found); + } + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int last, last_mod; + uint64_t last_msk; + unsigned int need_len, result = 0; + + /* array length may not be aligned, so calculate ignore mask for last + * mask index. + */ + last = MASK_LEN_TO_IDX(arr->len); + last_mod = MASK_LEN_TO_MOD(arr->len); + last_msk = ~(-(1ULL) << last_mod); + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + for (idx = first; idx < msk->n_masks; idx++, result += need_len) { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* if this is last mask, ignore everything after last bit */ + if (idx == last) + cur &= last_msk; + + /* ignore everything before start on first iteration */ + if (idx == first) { + cur >>= first_mod; + /* at the start, we don't need the full mask len */ + need_len -= first_mod; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + continue; + + /* + * see if current run ends before mask end. + */ + run_len = __builtin_ctzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } + } + return result; +} + +static int +find_prev_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int msk_idx, lookbehind_idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing ctz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + msk_idx = first; + do { + uint64_t cur_msk, lookbehind_msk; + unsigned int run_start, run_end, ctz, left; + bool found = false; + /* + * The process of getting n consecutive bits from the top for + * arbitrary n is a bit involved, but here it is in a nutshell: + * + * 1. let n be the number of consecutive bits we're looking for + * 2. check if n can fit in one mask, and if so, do n-1 + * lshift-ands to see if there is an appropriate run inside + * our current mask + * 2a. 
if we found a run, bail out early + * 2b. if we didn't find a run, proceed + * 3. invert the mask and count trailing zeroes (that is, count + * how many consecutive set bits we had starting from the + * start of current mask) as k + * 3a. if k is 0, continue to next mask + * 3b. if k is not 0, we have a potential run + * 4. to satisfy our requirements, next mask must have n-k + * consecutive set bits at the end, so we will do (n-k-1) + * lshift-ands and check if last bit is set. + * + * Step 4 will need to be repeated if (n-k) > MASK_ALIGN until + * we either run out of masks, lose the run, or find what we + * were looking for. + */ + cur_msk = msk->data[msk_idx]; + left = n; + + /* if we're looking for free spaces, invert the mask */ + if (!used) + cur_msk = ~cur_msk; + + /* if we have an ignore mask, ignore once */ + if (ignore_msk) { + cur_msk &= ignore_msk; + ignore_msk = 0; + } + + /* if n can fit in within a single mask, do a search */ + if (n <= MASK_ALIGN) { + uint64_t tmp_msk = cur_msk; + unsigned int s_idx; + for (s_idx = 0; s_idx < n - 1; s_idx++) + tmp_msk &= tmp_msk << 1ULL; + /* we found what we were looking for */ + if (tmp_msk != 0) { + /* clz will give us offset from end of mask, and + * we only get the end of our run, not start, + * so adjust result to point to where start + * would have been. + */ + run_start = MASK_ALIGN - + __builtin_clzll(tmp_msk) - n; + return MASK_GET_IDX(msk_idx, run_start); + } + } + + /* + * we didn't find our run within the mask, or n > MASK_ALIGN, + * so we're going for plan B. + */ + + /* count trailing zeroes on inverted mask */ + if (~cur_msk == 0) + ctz = sizeof(cur_msk) * 8; + else + ctz = __builtin_ctzll(~cur_msk); + + /* if there aren't any runs at the start either, just + * continue + */ + if (ctz == 0) + continue; + + /* we have a partial run at the start, so try looking behind */ + run_end = MASK_GET_IDX(msk_idx, ctz); + left -= ctz; + + /* go backwards, include zero */ + lookbehind_idx = msk_idx - 1; + + /* we can't lookbehind as we've run out of masks, so stop */ + if (msk_idx == 0) + break; + + do { + const uint64_t last_bit = 1ULL << (MASK_ALIGN - 1); + unsigned int s_idx, need; + + lookbehind_msk = msk->data[lookbehind_idx]; + + /* if we're looking for free space, invert the mask */ + if (!used) + lookbehind_msk = ~lookbehind_msk; + + /* figure out how many consecutive bits we need here */ + need = RTE_MIN(left, MASK_ALIGN); + + for (s_idx = 0; s_idx < need - 1; s_idx++) + lookbehind_msk &= lookbehind_msk << 1ULL; + + /* if last bit is not set, we've lost the run */ + if ((lookbehind_msk & last_bit) == 0) { + /* + * we've scanned this far, so we know there are + * no runs in the space we've lookbehind-scanned + * as well, so skip that on next iteration. + */ + ignore_msk = -1ULL << need; + msk_idx = lookbehind_idx; + break; + } + + left -= need; + + /* check if we've found what we were looking for */ + if (left == 0) { + found = true; + break; + } + } while ((lookbehind_idx--) != 0); /* decrement after check to + * include zero + */ + + /* we didn't find anything, so continue */ + if (!found) + continue; + + /* we've found what we were looking for, but we only know where + * the run ended, so calculate start position. + */ + return run_end - n; + } while (msk_idx-- != 0); /* decrement after check to include zero */ + /* we didn't find anything */ + rte_errno = used ? 
ENOENT : ENOSPC; + return -1; +} + +static int +find_prev(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + uint64_t ignore_msk; + + /* + * mask only has granularity of MASK_ALIGN, but start may not be aligned + * on that boundary, so construct a special mask to exclude anything we + * don't want to see to avoid confusing clz. + */ + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + /* we're going backwards, so mask must start from the top */ + ignore_msk = first_mod == MASK_ALIGN - 1 ? + -1ULL : /* prevent overflow */ + ~(-1ULL << (first_mod + 1)); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + int found; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything before start on first iteration */ + if (idx == first) + cur &= ignore_msk; + + /* check if we have any entries */ + if (cur == 0) + continue; + + /* + * find last set bit - that will correspond to whatever it is + * that we're looking for. we're counting trailing zeroes, thus + * the value we get is counted from end of mask, so calculate + * position from start of mask. + */ + found = MASK_ALIGN - __builtin_clzll(cur) - 1; + + return MASK_GET_IDX(idx, found); + } while (idx-- != 0); /* decrement after check to include zero*/ + + /* we didn't find anything */ + rte_errno = used ? ENOENT : ENOSPC; + return -1; +} + +static int +find_rev_contig(const struct rte_fbarray *arr, unsigned int start, bool used) +{ + const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, + arr->len); + unsigned int idx, first, first_mod; + unsigned int need_len, result = 0; + + first = MASK_LEN_TO_IDX(start); + first_mod = MASK_LEN_TO_MOD(start); + + /* go backwards, include zero */ + idx = first; + do { + uint64_t cur = msk->data[idx]; + unsigned int run_len; + + need_len = MASK_ALIGN; + + /* if we're looking for free entries, invert mask */ + if (!used) + cur = ~cur; + + /* ignore everything after start on first iteration */ + if (idx == first) { + unsigned int end_len = MASK_ALIGN - first_mod - 1; + cur <<= end_len; + /* at the start, we don't need the full mask len */ + need_len -= end_len; + } + + /* we will be looking for zeroes, so invert the mask */ + cur = ~cur; + + /* if mask is zero, we have a complete run */ + if (cur == 0) + goto endloop; + + /* + * see where run ends, starting from the end. 
+ */ + run_len = __builtin_clzll(cur); + + /* add however many zeroes we've had in the last run and quit */ + if (run_len < need_len) { + result += run_len; + break; + } +endloop: + result += need_len; + } while (idx-- != 0); /* decrement after check to include zero */ + return result; +} + +static int +set_used(struct rte_fbarray *arr, unsigned int idx, bool used) +{ + struct used_mask *msk; + uint64_t msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + unsigned int msk_idx = MASK_LEN_TO_IDX(idx); + bool already_used; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + ret = 0; + + /* prevent array from changing under us */ + rte_rwlock_write_lock(&arr->rwlock); + + already_used = (msk->data[msk_idx] & msk_bit) != 0; + + /* nothing to be done */ + if (used == already_used) + goto out; + + if (used) { + msk->data[msk_idx] |= msk_bit; + arr->count++; + } else { + msk->data[msk_idx] &= ~msk_bit; + arr->count--; + } +out: + rte_rwlock_write_unlock(&arr->rwlock); + + return ret; +} + +static int +fully_validate(const char *name, unsigned int elt_sz, unsigned int len) +{ + if (name == NULL || elt_sz == 0 || len == 0 || len > INT_MAX) { + rte_errno = EINVAL; + return -1; + } + + if (strnlen(name, RTE_FBARRAY_NAME_LEN) == RTE_FBARRAY_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +int +rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len, + unsigned int elt_sz) +{ + size_t page_sz, mmap_len; + char path[PATH_MAX]; + struct used_mask *msk; + struct mem_area *ma = NULL; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + if (fully_validate(name, elt_sz, len)) + return -1; + + /* allocate mem area before doing anything */ + ma = malloc(sizeof(*ma)); + if (ma == NULL) { + rte_errno = ENOMEM; + return -1; + } + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) { + free(ma); + return -1; + } + + /* calculate our memory limits */ + mmap_len = calc_data_size(page_sz, elt_sz, len); + + data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0); + if (data == NULL) { + free(ma); + return -1; + } + + rte_spinlock_lock(&mem_area_lock); + + fd = -1; + + if (internal_config.no_shconf) { + /* remap virtual area as writable */ + void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0); + if (new_data == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n", + __func__, strerror(errno)); + goto fail; + } + } else { + eal_get_fbarray_path(path, sizeof(path), name); + + /* + * Each fbarray is unique to process namespace, i.e. the + * filename depends on process prefix. Try to take out a lock + * and see if we succeed. If we don't, someone else is using it + * already. + */ + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = errno; + goto fail; + } else if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n", + __func__, path, strerror(errno)); + rte_errno = EBUSY; + goto fail; + } + + /* take out a non-exclusive lock, so that other processes could + * still attach to it, but no other process could reinitialize + * it. 
+ */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + } + ma->addr = data; + ma->len = mmap_len; + ma->fd = fd; + + /* do not close fd - keep it until detach/destroy */ + TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next); + + /* initialize the data */ + memset(data, 0, mmap_len); + + /* populate data structure */ + strlcpy(arr->name, name, sizeof(arr->name)); + arr->data = data; + arr->len = len; + arr->elt_sz = elt_sz; + arr->count = 0; + + msk = get_used_mask(data, elt_sz, len); + msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN)); + + rte_rwlock_init(&arr->rwlock); + + rte_spinlock_unlock(&mem_area_lock); + + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + free(ma); + + rte_spinlock_unlock(&mem_area_lock); + return -1; +} + +int +rte_fbarray_attach(struct rte_fbarray *arr) +{ + struct mem_area *ma = NULL, *tmp = NULL; + size_t page_sz, mmap_len; + char path[PATH_MAX]; + void *data = NULL; + int fd = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize attach as two values we need (element + * size and array length) are constant for the duration of life of + * the array, so the parts we care about will not race. + */ + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) + return -1; + + ma = malloc(sizeof(*ma)); + if (ma == NULL) { + rte_errno = ENOMEM; + return -1; + } + + page_sz = sysconf(_SC_PAGESIZE); + if (page_sz == (size_t)-1) { + free(ma); + return -1; + } + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + /* check the tailq - maybe user has already mapped this address space */ + rte_spinlock_lock(&mem_area_lock); + + TAILQ_FOREACH(tmp, &mem_area_tailq, next) { + if (overlap(tmp, arr->data, mmap_len)) { + rte_errno = EEXIST; + goto fail; + } + } + + /* we know this memory area is unique, so proceed */ + + data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0); + if (data == NULL) + goto fail; + + eal_get_fbarray_path(path, sizeof(path), arr->name); + + fd = open(path, O_RDWR); + if (fd < 0) { + rte_errno = errno; + goto fail; + } + + /* lock the file, to let others know we're using it */ + if (flock(fd, LOCK_SH | LOCK_NB)) { + rte_errno = errno; + goto fail; + } + + if (resize_and_map(fd, data, mmap_len)) + goto fail; + + /* store our new memory area */ + ma->addr = data; + ma->fd = fd; /* keep fd until detach/destroy */ + ma->len = mmap_len; + + TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next); + + /* we're done */ + + rte_spinlock_unlock(&mem_area_lock); + return 0; +fail: + if (data) + munmap(data, mmap_len); + if (fd >= 0) + close(fd); + free(ma); + rte_spinlock_unlock(&mem_area_lock); + return -1; +} + +int +rte_fbarray_detach(struct rte_fbarray *arr) +{ + struct mem_area *tmp = NULL; + size_t mmap_len; + int ret = -1; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize detach as two values we need (element + * size and total capacity) are constant for the duration of life of + * the array, so the parts we care about will not race. if the user is + * detaching while doing something else in the same process, we can't + * really do anything about it, things will blow up either way. + */ + + size_t page_sz = sysconf(_SC_PAGESIZE); + + if (page_sz == (size_t)-1) + return -1; + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + /* does this area exist? 
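+ * i.e. was it mapped by rte_fbarray_init() or rte_fbarray_attach()
+ * in this process? note that detach only unmaps the area and closes
+ * our fd - the backing file is left in place for other processes,
+ * unlike rte_fbarray_destroy() below, which also unlinks it.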
*/ + rte_spinlock_lock(&mem_area_lock); + + TAILQ_FOREACH(tmp, &mem_area_tailq, next) { + if (tmp->addr == arr->data && tmp->len == mmap_len) + break; + } + if (tmp == NULL) { + rte_errno = ENOENT; + ret = -1; + goto out; + } + + munmap(arr->data, mmap_len); + + /* area is unmapped, close fd and remove the tailq entry */ + if (tmp->fd >= 0) + close(tmp->fd); + TAILQ_REMOVE(&mem_area_tailq, tmp, next); + free(tmp); + + ret = 0; +out: + rte_spinlock_unlock(&mem_area_lock); + return ret; +} + +int +rte_fbarray_destroy(struct rte_fbarray *arr) +{ + struct mem_area *tmp = NULL; + size_t mmap_len; + int fd, ret; + char path[PATH_MAX]; + + if (arr == NULL) { + rte_errno = EINVAL; + return -1; + } + + /* + * we don't need to synchronize detach as two values we need (element + * size and total capacity) are constant for the duration of life of + * the array, so the parts we care about will not race. if the user is + * detaching while doing something else in the same process, we can't + * really do anything about it, things will blow up either way. + */ + + size_t page_sz = sysconf(_SC_PAGESIZE); + + if (page_sz == (size_t)-1) + return -1; + + mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len); + + /* does this area exist? */ + rte_spinlock_lock(&mem_area_lock); + + TAILQ_FOREACH(tmp, &mem_area_tailq, next) { + if (tmp->addr == arr->data && tmp->len == mmap_len) + break; + } + if (tmp == NULL) { + rte_errno = ENOENT; + ret = -1; + goto out; + } + /* with no shconf, there were never any files to begin with */ + if (!internal_config.no_shconf) { + /* + * attempt to get an exclusive lock on the file, to ensure it + * has been detached by all other processes + */ + fd = tmp->fd; + if (flock(fd, LOCK_EX | LOCK_NB)) { + RTE_LOG(DEBUG, EAL, "Cannot destroy fbarray - another process is using it\n"); + rte_errno = EBUSY; + ret = -1; + goto out; + } + + /* we're OK to destroy the file */ + eal_get_fbarray_path(path, sizeof(path), arr->name); + if (unlink(path)) { + RTE_LOG(DEBUG, EAL, "Cannot unlink fbarray: %s\n", + strerror(errno)); + rte_errno = errno; + /* + * we're still holding an exclusive lock, so drop it to + * shared. 
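+ * this restores the invariant that every process mapping the file
+ * (ourselves included) holds a shared lock, so a later destroy
+ * attempt can again use LOCK_EX to detect remaining users.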
+ */ + flock(fd, LOCK_SH | LOCK_NB); + + ret = -1; + goto out; + } + close(fd); + } + munmap(arr->data, mmap_len); + + /* area is unmapped, remove the tailq entry */ + TAILQ_REMOVE(&mem_area_tailq, tmp, next); + free(tmp); + ret = 0; + + /* reset the fbarray structure */ + memset(arr, 0, sizeof(*arr)); +out: + rte_spinlock_unlock(&mem_area_lock); + return ret; +} + +void * +rte_fbarray_get(const struct rte_fbarray *arr, unsigned int idx) +{ + void *ret = NULL; + if (arr == NULL) { + rte_errno = EINVAL; + return NULL; + } + + if (idx >= arr->len) { + rte_errno = EINVAL; + return NULL; + } + + ret = RTE_PTR_ADD(arr->data, idx * arr->elt_sz); + + return ret; +} + +int +rte_fbarray_set_used(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, true); +} + +int +rte_fbarray_set_free(struct rte_fbarray *arr, unsigned int idx) +{ + return set_used(arr, idx, false); +} + +int +rte_fbarray_is_used(struct rte_fbarray *arr, unsigned int idx) +{ + struct used_mask *msk; + int msk_idx; + uint64_t msk_bit; + int ret = -1; + + if (arr == NULL || idx >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + msk_idx = MASK_LEN_TO_IDX(idx); + msk_bit = 1ULL << MASK_LEN_TO_MOD(idx); + + ret = (msk->data[msk_idx] & msk_bit) != 0; + + rte_rwlock_read_unlock(&arr->rwlock); + + return ret; +} + +static int +fbarray_find(struct rte_fbarray *arr, unsigned int start, bool next, bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = start; + goto out; + } + } else { + if (arr->count == 0) { + rte_errno = ENOENT; + goto out; + } + if (arr->len == arr->count) { + ret = start; + goto out; + } + } + if (next) + ret = find_next(arr, start, used); + else + ret = find_prev(arr, start, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int +rte_fbarray_find_next_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, false); +} + +int +rte_fbarray_find_next_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, true, true); +} + +int +rte_fbarray_find_prev_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, false); +} + +int +rte_fbarray_find_prev_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find(arr, start, false, true); +} + +static int +fbarray_find_n(struct rte_fbarray *arr, unsigned int start, unsigned int n, + bool next, bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len || n > arr->len || n == 0) { + rte_errno = EINVAL; + return -1; + } + if (next && (arr->len - start) < n) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + if (!next && start < (n - 1)) { + rte_errno = used ? ENOENT : ENOSPC; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (!used) { + if (arr->len == arr->count || arr->len - arr->count < n) { + rte_errno = ENOSPC; + goto out; + } + if (arr->count == 0) { + ret = next ? 
start : start - n + 1; + goto out; + } + } else { + if (arr->count < n) { + rte_errno = ENOENT; + goto out; + } + if (arr->count == arr->len) { + ret = next ? start : start - n + 1; + goto out; + } + } + + if (next) + ret = find_next_n(arr, start, n, used); + else + ret = find_prev_n(arr, start, n, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +int +rte_fbarray_find_next_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, true, false); +} + +int +rte_fbarray_find_next_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, true, true); +} + +int +rte_fbarray_find_prev_n_free(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, false); +} + +int +rte_fbarray_find_prev_n_used(struct rte_fbarray *arr, unsigned int start, + unsigned int n) +{ + return fbarray_find_n(arr, start, n, false, true); +} + +static int +fbarray_find_contig(struct rte_fbarray *arr, unsigned int start, bool next, + bool used) +{ + int ret = -1; + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + /* cheap checks to prevent doing useless work */ + if (used) { + if (arr->count == 0) { + ret = 0; + goto out; + } + if (next && arr->count == arr->len) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == arr->len) { + ret = start + 1; + goto out; + } + } else { + if (arr->len == arr->count) { + ret = 0; + goto out; + } + if (next && arr->count == 0) { + ret = arr->len - start; + goto out; + } + if (!next && arr->count == 0) { + ret = start + 1; + goto out; + } + } + + if (next) + ret = find_contig(arr, start, used); + else + ret = find_rev_contig(arr, start, used); +out: + rte_rwlock_read_unlock(&arr->rwlock); + return ret; +} + +static int +fbarray_find_biggest(struct rte_fbarray *arr, unsigned int start, bool used, + bool rev) +{ + int cur_idx, next_idx, cur_len, biggest_idx, biggest_len; + /* don't stack if conditions, use function pointers instead */ + int (*find_func)(struct rte_fbarray *, unsigned int); + int (*find_contig_func)(struct rte_fbarray *, unsigned int); + + if (arr == NULL || start >= arr->len) { + rte_errno = EINVAL; + return -1; + } + /* the other API calls already do their fair share of cheap checks, so + * no need to do them here. + */ + + /* the API's called are thread-safe, but something may still happen + * between the API calls, so lock the fbarray. all other API's are + * read-locking the fbarray, so read lock here is OK. + */ + rte_rwlock_read_lock(&arr->rwlock); + + /* pick out appropriate functions */ + if (used) { + if (rev) { + find_func = rte_fbarray_find_prev_used; + find_contig_func = rte_fbarray_find_rev_contig_used; + } else { + find_func = rte_fbarray_find_next_used; + find_contig_func = rte_fbarray_find_contig_used; + } + } else { + if (rev) { + find_func = rte_fbarray_find_prev_free; + find_contig_func = rte_fbarray_find_rev_contig_free; + } else { + find_func = rte_fbarray_find_next_free; + find_contig_func = rte_fbarray_find_contig_free; + } + } + + cur_idx = start; + biggest_idx = -1; /* default is error */ + biggest_len = 0; + for (;;) { + cur_idx = find_func(arr, cur_idx); + + /* block found, check its length */ + if (cur_idx >= 0) { + cur_len = find_contig_func(arr, cur_idx); + /* decide where we go next */ + next_idx = rev ? 
cur_idx - cur_len : cur_idx + cur_len; + /* move current index to start of chunk */ + cur_idx = rev ? next_idx + 1 : cur_idx; + + if (cur_len > biggest_len) { + biggest_idx = cur_idx; + biggest_len = cur_len; + } + cur_idx = next_idx; + /* in reverse mode, next_idx may be -1 if chunk started + * at array beginning. this means there's no more work + * to do. + */ + if (cur_idx < 0) + break; + } else { + /* nothing more to find, stop. however, a failed API + * call has set rte_errno, which we want to ignore, as + * reaching the end of fbarray is not an error. + */ + rte_errno = 0; + break; + } + } + /* if we didn't find anything at all, set rte_errno */ + if (biggest_idx < 0) + rte_errno = used ? ENOENT : ENOSPC; + + rte_rwlock_read_unlock(&arr->rwlock); + return biggest_idx; +} + +int +rte_fbarray_find_biggest_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, false, false); +} + +int +rte_fbarray_find_biggest_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, true, false); +} + +int +rte_fbarray_find_rev_biggest_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, false, true); +} + +int +rte_fbarray_find_rev_biggest_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_biggest(arr, start, true, true); +} + + +int +rte_fbarray_find_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, false); +} + +int +rte_fbarray_find_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, true, true); +} + +int +rte_fbarray_find_rev_contig_free(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, false); +} + +int +rte_fbarray_find_rev_contig_used(struct rte_fbarray *arr, unsigned int start) +{ + return fbarray_find_contig(arr, start, false, true); +} + +int +rte_fbarray_find_idx(const struct rte_fbarray *arr, const void *elt) +{ + void *end; + int ret = -1; + + /* + * no need to synchronize as it doesn't matter if underlying data + * changes - we're doing pointer arithmetic here. 
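+ * the index is recovered purely from the element's address:
+ * idx = (elt - arr->data) / arr->elt_sz. e.g. with elt_sz == 64,
+ * an element at arr->data + 640 resolves to index 10.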
+ */ + + if (arr == NULL || elt == NULL) { + rte_errno = EINVAL; + return -1; + } + end = RTE_PTR_ADD(arr->data, arr->elt_sz * arr->len); + if (elt < arr->data || elt >= end) { + rte_errno = EINVAL; + return -1; + } + + ret = RTE_PTR_DIFF(elt, arr->data) / arr->elt_sz; + + return ret; +} + +void +rte_fbarray_dump_metadata(struct rte_fbarray *arr, FILE *f) +{ + struct used_mask *msk; + unsigned int i; + + if (arr == NULL || f == NULL) { + rte_errno = EINVAL; + return; + } + + if (fully_validate(arr->name, arr->elt_sz, arr->len)) { + fprintf(f, "Invalid file-backed array\n"); + goto out; + } + + /* prevent array from changing under us */ + rte_rwlock_read_lock(&arr->rwlock); + + fprintf(f, "File-backed array: %s\n", arr->name); + fprintf(f, "size: %i occupied: %i elt_sz: %i\n", + arr->len, arr->count, arr->elt_sz); + + msk = get_used_mask(arr->data, arr->elt_sz, arr->len); + + for (i = 0; i < msk->n_masks; i++) + fprintf(f, "msk idx %i: 0x%016" PRIx64 "\n", i, msk->data[i]); +out: + rte_rwlock_read_unlock(&arr->rwlock); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c new file mode 100644 index 000000000..2d2179d41 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hexdump.c @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <stdint.h> +#include <rte_hexdump.h> +#include <rte_string_fns.h> + +#define LINE_LEN 128 + +void +rte_hexdump(FILE *f, const char *title, const void *buf, unsigned int len) +{ + unsigned int i, out, ofs; + const unsigned char *data = buf; + char line[LINE_LEN]; /* space needed 8+16*3+3+16 == 75 */ + + fprintf(f, "%s at [%p], len=%u\n", + title ? : " Dump data", data, len); + ofs = 0; + while (ofs < len) { + /* format the line in the buffer */ + out = snprintf(line, LINE_LEN, "%08X:", ofs); + for (i = 0; i < 16; i++) { + if (ofs + i < len) + snprintf(line + out, LINE_LEN - out, + " %02X", (data[ofs + i] & 0xff)); + else + strcpy(line + out, " "); + out += 3; + } + + + for (; i <= 16; i++) + out += snprintf(line + out, LINE_LEN - out, " | "); + + for (i = 0; ofs < len && i < 16; i++, ofs++) { + unsigned char c = data[ofs]; + + if (c < ' ' || c > '~') + c = '.'; + out += snprintf(line + out, LINE_LEN - out, "%c", c); + } + fprintf(f, "%s\n", line); + } + fflush(f); +} + +void +rte_memdump(FILE *f, const char *title, const void *buf, unsigned int len) +{ + unsigned int i, out; + const unsigned char *data = buf; + char line[LINE_LEN]; + + if (title) + fprintf(f, "%s: ", title); + + line[0] = '\0'; + for (i = 0, out = 0; i < len; i++) { + /* Make sure we do not overrun the line buffer length. */ + if (out >= LINE_LEN - 4) { + fprintf(f, "%s", line); + out = 0; + line[out] = '\0'; + } + out += snprintf(line + out, LINE_LEN - out, "%02x%s", + (data[i] & 0xff), ((i + 1) < len) ? 
":" : ""); + } + if (out > 0) + fprintf(f, "%s", line); + fprintf(f, "\n"); + + fflush(f); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c new file mode 100644 index 000000000..5388b81a5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_hypervisor.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd + */ + +#include "rte_hypervisor.h" + +const char * +rte_hypervisor_get_name(enum rte_hypervisor id) +{ + switch (id) { + case RTE_HYPERVISOR_NONE: + return "none"; + case RTE_HYPERVISOR_KVM: + return "KVM"; + case RTE_HYPERVISOR_HYPERV: + return "Hyper-V"; + case RTE_HYPERVISOR_VMWARE: + return "VMware"; + default: + return "unknown"; + } +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c new file mode 100644 index 000000000..cf52d717f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_launch.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <sys/queue.h> + +#include <rte_launch.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_atomic.h> +#include <rte_pause.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> + +#include "eal_private.h" + +/* + * Wait until a lcore finished its job. + */ +int +rte_eal_wait_lcore(unsigned slave_id) +{ + if (lcore_config[slave_id].state == WAIT) + return 0; + + while (lcore_config[slave_id].state != WAIT && + lcore_config[slave_id].state != FINISHED) + rte_pause(); + + rte_rmb(); + + /* we are in finished state, go to wait state */ + lcore_config[slave_id].state = WAIT; + return lcore_config[slave_id].ret; +} + +/* + * Check that every SLAVE lcores are in WAIT state, then call + * rte_eal_remote_launch() for all of them. If call_master is true + * (set to CALL_MASTER), also call the function on the master lcore. + */ +int +rte_eal_mp_remote_launch(int (*f)(void *), void *arg, + enum rte_rmt_call_master_t call_master) +{ + int lcore_id; + int master = rte_get_master_lcore(); + + /* check state of lcores */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (lcore_config[lcore_id].state != WAIT) + return -EBUSY; + } + + /* send messages to cores */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_remote_launch(f, arg, lcore_id); + } + + if (call_master == CALL_MASTER) { + lcore_config[master].ret = f(arg); + lcore_config[master].state = FINISHED; + } + + return 0; +} + +/* + * Return the state of the lcore identified by slave_id. + */ +enum rte_lcore_state_t +rte_eal_get_lcore_state(unsigned lcore_id) +{ + return lcore_config[lcore_id].state; +} + +/* + * Do a rte_eal_wait_lcore() for every lcore. The return values are + * ignored. 
+ */ +void +rte_eal_mp_wait_lcore(void) +{ + unsigned lcore_id; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rte_eal_wait_lcore(lcore_id); + } +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c new file mode 100644 index 000000000..5404922a8 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_lcore.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <unistd.h> +#include <limits.h> +#include <string.h> + +#include <rte_errno.h> +#include <rte_log.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_thread.h" + +unsigned int rte_get_master_lcore(void) +{ + return rte_eal_get_configuration()->master_lcore; +} + +unsigned int rte_lcore_count(void) +{ + return rte_eal_get_configuration()->lcore_count; +} + +int rte_lcore_index(int lcore_id) +{ + if (unlikely(lcore_id >= RTE_MAX_LCORE)) + return -1; + + if (lcore_id < 0) + lcore_id = (int)rte_lcore_id(); + + return lcore_config[lcore_id].core_index; +} + +int rte_lcore_to_cpu_id(int lcore_id) +{ + if (unlikely(lcore_id >= RTE_MAX_LCORE)) + return -1; + + if (lcore_id < 0) + lcore_id = (int)rte_lcore_id(); + + return lcore_config[lcore_id].core_id; +} + +rte_cpuset_t rte_lcore_cpuset(unsigned int lcore_id) +{ + return lcore_config[lcore_id].cpuset; +} + +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned int lcore_id) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return ROLE_OFF; + return cfg->lcore_role[lcore_id]; +} + +int rte_lcore_is_enabled(unsigned int lcore_id) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return 0; + return cfg->lcore_role[lcore_id] == ROLE_RTE; +} + +unsigned int rte_get_next_lcore(unsigned int i, int skip_master, int wrap) +{ + i++; + if (wrap) + i %= RTE_MAX_LCORE; + + while (i < RTE_MAX_LCORE) { + if (!rte_lcore_is_enabled(i) || + (skip_master && (i == rte_get_master_lcore()))) { + i++; + if (wrap) + i %= RTE_MAX_LCORE; + continue; + } + break; + } + return i; +} + +unsigned int +rte_lcore_to_socket_id(unsigned int lcore_id) +{ + return lcore_config[lcore_id].socket_id; +} + +static int +socket_id_cmp(const void *a, const void *b) +{ + const int *lcore_id_a = a; + const int *lcore_id_b = b; + + if (*lcore_id_a < *lcore_id_b) + return -1; + if (*lcore_id_a > *lcore_id_b) + return 1; + return 0; +} + +/* + * Parse /sys/devices/system/cpu to get the number of physical and logical + * processors on the machine. The function will fill the cpu_info + * structure. + */ +int +rte_eal_cpu_init(void) +{ + /* pointer to global configuration */ + struct rte_config *config = rte_eal_get_configuration(); + unsigned lcore_id; + unsigned count = 0; + unsigned int socket_id, prev_socket_id; + int lcore_to_socket_id[RTE_MAX_LCORE]; + + /* + * Parse the maximum set of logical cores, detect the subset of running + * ones and enable them by default. 
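+ *
+ * Each detected core gets ROLE_RTE and a default 1:1 lcore-to-cpu
+ * cpuset; cores that are not detected get ROLE_OFF and a
+ * core_index of -1, so rte_lcore_is_enabled() and the
+ * RTE_LCORE_FOREACH iterators will skip them.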
+ */ + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + lcore_config[lcore_id].core_index = count; + + /* init cpuset for per lcore config */ + CPU_ZERO(&lcore_config[lcore_id].cpuset); + + /* find socket first */ + socket_id = eal_cpu_socket_id(lcore_id); + lcore_to_socket_id[lcore_id] = socket_id; + + if (eal_cpu_detected(lcore_id) == 0) { + config->lcore_role[lcore_id] = ROLE_OFF; + lcore_config[lcore_id].core_index = -1; + continue; + } + + /* By default, lcore 1:1 map to cpu id */ + CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset); + + /* By default, each detected core is enabled */ + config->lcore_role[lcore_id] = ROLE_RTE; + lcore_config[lcore_id].core_role = ROLE_RTE; + lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id); + lcore_config[lcore_id].socket_id = socket_id; + RTE_LOG(DEBUG, EAL, "Detected lcore %u as " + "core %u on socket %u\n", + lcore_id, lcore_config[lcore_id].core_id, + lcore_config[lcore_id].socket_id); + count++; + } + for (; lcore_id < CPU_SETSIZE; lcore_id++) { + if (eal_cpu_detected(lcore_id) == 0) + continue; + RTE_LOG(DEBUG, EAL, "Skipped lcore %u as core %u on socket %u\n", + lcore_id, eal_cpu_core_id(lcore_id), + eal_cpu_socket_id(lcore_id)); + } + + /* Set the count of enabled logical cores of the EAL configuration */ + config->lcore_count = count; + RTE_LOG(DEBUG, EAL, + "Support maximum %u logical core(s) by configuration.\n", + RTE_MAX_LCORE); + RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count); + + /* sort all socket id's in ascending order */ + qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id), + sizeof(lcore_to_socket_id[0]), socket_id_cmp); + + prev_socket_id = -1; + config->numa_node_count = 0; + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + socket_id = lcore_to_socket_id[lcore_id]; + if (socket_id != prev_socket_id) + config->numa_nodes[config->numa_node_count++] = + socket_id; + prev_socket_id = socket_id; + } + RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count); + + return 0; +} + +unsigned int +rte_socket_count(void) +{ + const struct rte_config *config = rte_eal_get_configuration(); + return config->numa_node_count; +} + +int +rte_socket_id_by_idx(unsigned int idx) +{ + const struct rte_config *config = rte_eal_get_configuration(); + if (idx >= config->numa_node_count) { + rte_errno = EINVAL; + return -1; + } + return config->numa_nodes[idx]; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c new file mode 100644 index 000000000..8835c8fff --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_log.c @@ -0,0 +1,481 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <stdint.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <regex.h> +#include <fnmatch.h> + +#include <rte_eal.h> +#include <rte_log.h> +#include <rte_per_lcore.h> + +#include "eal_private.h" + +/* global log structure */ +struct rte_logs rte_logs = { + .type = ~0, + .level = RTE_LOG_DEBUG, + .file = NULL, +}; + +struct rte_eal_opt_loglevel { + /** Next list entry */ + TAILQ_ENTRY(rte_eal_opt_loglevel) next; + /** Compiled regular expression obtained from the option */ + regex_t re_match; + /** Globbing pattern option */ + char *pattern; + /** Log level value obtained from the option */ + uint32_t level; +}; + +TAILQ_HEAD(rte_eal_opt_loglevel_list, rte_eal_opt_loglevel); + +/** List of valid EAL log level 
options */ +static struct rte_eal_opt_loglevel_list opt_loglevel_list = + TAILQ_HEAD_INITIALIZER(opt_loglevel_list); + +/* Stream to use for logging if rte_logs.file is NULL */ +static FILE *default_log_stream; + +/** + * This global structure stores some information about the message + * that is currently being processed by one lcore + */ +struct log_cur_msg { + uint32_t loglevel; /**< log level - see rte_log.h */ + uint32_t logtype; /**< log type - see rte_log.h */ +}; + +struct rte_log_dynamic_type { + const char *name; + uint32_t loglevel; +}; + + /* per core log */ +static RTE_DEFINE_PER_LCORE(struct log_cur_msg, log_cur_msg); + +/* default logs */ + +/* Change the stream that will be used by logging system */ +int +rte_openlog_stream(FILE *f) +{ + rte_logs.file = f; + return 0; +} + +FILE * +rte_log_get_stream(void) +{ + FILE *f = rte_logs.file; + + if (f == NULL) { + /* + * Grab the current value of stderr here, rather than + * just initializing default_log_stream to stderr. This + * ensures that we will always use the current value + * of stderr, even if the application closes and + * reopens it. + */ + return default_log_stream ? : stderr; + } + return f; +} + +/* Set global log level */ +void +rte_log_set_global_level(uint32_t level) +{ + rte_logs.level = (uint32_t)level; +} + +/* Get global log level */ +uint32_t +rte_log_get_global_level(void) +{ + return rte_logs.level; +} + +int +rte_log_get_level(uint32_t type) +{ + if (type >= rte_logs.dynamic_types_len) + return -1; + + return rte_logs.dynamic_types[type].loglevel; +} + +bool +rte_log_can_log(uint32_t logtype, uint32_t level) +{ + int log_level; + + if (level > rte_log_get_global_level()) + return false; + + log_level = rte_log_get_level(logtype); + if (log_level < 0) + return false; + + if (level > (uint32_t)log_level) + return false; + + return true; +} + +int +rte_log_set_level(uint32_t type, uint32_t level) +{ + if (type >= rte_logs.dynamic_types_len) + return -1; + if (level > RTE_LOG_DEBUG) + return -1; + + rte_logs.dynamic_types[type].loglevel = level; + + return 0; +} + +/* set log level by regular expression */ +int +rte_log_set_level_regexp(const char *regex, uint32_t level) +{ + regex_t r; + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + if (regcomp(&r, regex, 0) != 0) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + if (regexec(&r, rte_logs.dynamic_types[i].name, 0, + NULL, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + regfree(&r); + + return 0; +} + +/* + * Save the type string and the loglevel for later dynamic + * logtypes which may register later. 
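+ *
+ * E.g. an EAL option such as --log-level=lib.*:debug is recorded
+ * here, so that a logtype registered afterwards (say, lib.hash)
+ * still picks up the requested level via
+ * rte_log_register_type_and_pick_level().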
+ */ +static int rte_log_save_level(int priority, + const char *regex, const char *pattern) +{ + struct rte_eal_opt_loglevel *opt_ll = NULL; + + opt_ll = malloc(sizeof(*opt_ll)); + if (opt_ll == NULL) + goto fail; + + opt_ll->level = priority; + + if (regex) { + opt_ll->pattern = NULL; + if (regcomp(&opt_ll->re_match, regex, 0) != 0) + goto fail; + } else if (pattern) { + opt_ll->pattern = strdup(pattern); + if (opt_ll->pattern == NULL) + goto fail; + } else + goto fail; + + TAILQ_INSERT_HEAD(&opt_loglevel_list, opt_ll, next); + return 0; +fail: + free(opt_ll); + return -1; +} + +int rte_log_save_regexp(const char *regex, int tmp) +{ + return rte_log_save_level(tmp, regex, NULL); +} + +/* set log level based on globbing pattern */ +int +rte_log_set_level_pattern(const char *pattern, uint32_t level) +{ + size_t i; + + if (level > RTE_LOG_DEBUG) + return -1; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + + if (fnmatch(pattern, rte_logs.dynamic_types[i].name, 0) == 0) + rte_logs.dynamic_types[i].loglevel = level; + } + + return 0; +} + +int rte_log_save_pattern(const char *pattern, int priority) +{ + return rte_log_save_level(priority, NULL, pattern); +} + +/* get the current loglevel for the message being processed */ +int rte_log_cur_msg_loglevel(void) +{ + return RTE_PER_LCORE(log_cur_msg).loglevel; +} + +/* get the current logtype for the message being processed */ +int rte_log_cur_msg_logtype(void) +{ + return RTE_PER_LCORE(log_cur_msg).logtype; +} + +static int +rte_log_lookup(const char *name) +{ + size_t i; + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + if (strcmp(name, rte_logs.dynamic_types[i].name) == 0) + return i; + } + + return -1; +} + +/* register an extended log type, assuming table is large enough, and id + * is not yet registered. 
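+ * both assumptions are guaranteed by the callers: rte_log_register()
+ * grows the table with realloc() before passing in a fresh id, and
+ * the RTE_INIT_PRIO constructor below registers the legacy types
+ * into a table allocated with RTE_LOGTYPE_FIRST_EXT_ID entries.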
+ */ +static int +__rte_log_register(const char *name, int id) +{ + char *dup_name = strdup(name); + + if (dup_name == NULL) + return -ENOMEM; + + rte_logs.dynamic_types[id].name = dup_name; + rte_logs.dynamic_types[id].loglevel = RTE_LOG_INFO; + + return id; +} + +/* register an extended log type */ +int +rte_log_register(const char *name) +{ + struct rte_log_dynamic_type *new_dynamic_types; + int id, ret; + + id = rte_log_lookup(name); + if (id >= 0) + return id; + + new_dynamic_types = realloc(rte_logs.dynamic_types, + sizeof(struct rte_log_dynamic_type) * + (rte_logs.dynamic_types_len + 1)); + if (new_dynamic_types == NULL) + return -ENOMEM; + rte_logs.dynamic_types = new_dynamic_types; + + ret = __rte_log_register(name, rte_logs.dynamic_types_len); + if (ret < 0) + return ret; + + rte_logs.dynamic_types_len++; + + return ret; +} + +/* Register an extended log type and try to pick its level from EAL options */ +int +rte_log_register_type_and_pick_level(const char *name, uint32_t level_def) +{ + struct rte_eal_opt_loglevel *opt_ll; + uint32_t level = level_def; + int type; + + type = rte_log_register(name); + if (type < 0) + return type; + + TAILQ_FOREACH(opt_ll, &opt_loglevel_list, next) { + if (opt_ll->level > RTE_LOG_DEBUG) + continue; + + if (opt_ll->pattern) { + if (fnmatch(opt_ll->pattern, name, 0) == 0) + level = opt_ll->level; + } else { + if (regexec(&opt_ll->re_match, name, 0, NULL, 0) == 0) + level = opt_ll->level; + } + } + + rte_logs.dynamic_types[type].loglevel = level; + + return type; +} + +struct logtype { + uint32_t log_id; + const char *logtype; +}; + +static const struct logtype logtype_strings[] = { + {RTE_LOGTYPE_EAL, "lib.eal"}, + {RTE_LOGTYPE_MALLOC, "lib.malloc"}, + {RTE_LOGTYPE_RING, "lib.ring"}, + {RTE_LOGTYPE_MEMPOOL, "lib.mempool"}, + {RTE_LOGTYPE_TIMER, "lib.timer"}, + {RTE_LOGTYPE_PMD, "pmd"}, + {RTE_LOGTYPE_HASH, "lib.hash"}, + {RTE_LOGTYPE_LPM, "lib.lpm"}, + {RTE_LOGTYPE_KNI, "lib.kni"}, + {RTE_LOGTYPE_ACL, "lib.acl"}, + {RTE_LOGTYPE_POWER, "lib.power"}, + {RTE_LOGTYPE_METER, "lib.meter"}, + {RTE_LOGTYPE_SCHED, "lib.sched"}, + {RTE_LOGTYPE_PORT, "lib.port"}, + {RTE_LOGTYPE_TABLE, "lib.table"}, + {RTE_LOGTYPE_PIPELINE, "lib.pipeline"}, + {RTE_LOGTYPE_MBUF, "lib.mbuf"}, + {RTE_LOGTYPE_CRYPTODEV, "lib.cryptodev"}, + {RTE_LOGTYPE_EFD, "lib.efd"}, + {RTE_LOGTYPE_EVENTDEV, "lib.eventdev"}, + {RTE_LOGTYPE_GSO, "lib.gso"}, + {RTE_LOGTYPE_USER1, "user1"}, + {RTE_LOGTYPE_USER2, "user2"}, + {RTE_LOGTYPE_USER3, "user3"}, + {RTE_LOGTYPE_USER4, "user4"}, + {RTE_LOGTYPE_USER5, "user5"}, + {RTE_LOGTYPE_USER6, "user6"}, + {RTE_LOGTYPE_USER7, "user7"}, + {RTE_LOGTYPE_USER8, "user8"} +}; + +/* Logging should be first initializer (before drivers and bus) */ +RTE_INIT_PRIO(rte_log_init, LOG) +{ + uint32_t i; + + rte_log_set_global_level(RTE_LOG_DEBUG); + + rte_logs.dynamic_types = calloc(RTE_LOGTYPE_FIRST_EXT_ID, + sizeof(struct rte_log_dynamic_type)); + if (rte_logs.dynamic_types == NULL) + return; + + /* register legacy log types */ + for (i = 0; i < RTE_DIM(logtype_strings); i++) + __rte_log_register(logtype_strings[i].logtype, + logtype_strings[i].log_id); + + rte_logs.dynamic_types_len = RTE_LOGTYPE_FIRST_EXT_ID; +} + +static const char * +loglevel_to_string(uint32_t level) +{ + switch (level) { + case 0: return "disabled"; + case RTE_LOG_EMERG: return "emerg"; + case RTE_LOG_ALERT: return "alert"; + case RTE_LOG_CRIT: return "critical"; + case RTE_LOG_ERR: return "error"; + case RTE_LOG_WARNING: return "warning"; + case RTE_LOG_NOTICE: return "notice"; + case 
RTE_LOG_INFO: return "info"; + case RTE_LOG_DEBUG: return "debug"; + default: return "unknown"; + } +} + +/* dump global level and registered log types */ +void +rte_log_dump(FILE *f) +{ + size_t i; + + fprintf(f, "global log level is %s\n", + loglevel_to_string(rte_log_get_global_level())); + + for (i = 0; i < rte_logs.dynamic_types_len; i++) { + if (rte_logs.dynamic_types[i].name == NULL) + continue; + fprintf(f, "id %zu: %s, level is %s\n", + i, rte_logs.dynamic_types[i].name, + loglevel_to_string(rte_logs.dynamic_types[i].loglevel)); + } +} + +/* + * Generates a log message The message will be sent in the stream + * defined by the previous call to rte_openlog_stream(). + */ +int +rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap) +{ + FILE *f = rte_log_get_stream(); + int ret; + + if (logtype >= rte_logs.dynamic_types_len) + return -1; + if (!rte_log_can_log(logtype, level)) + return 0; + + /* save loglevel and logtype in a global per-lcore variable */ + RTE_PER_LCORE(log_cur_msg).loglevel = level; + RTE_PER_LCORE(log_cur_msg).logtype = logtype; + + ret = vfprintf(f, format, ap); + fflush(f); + return ret; +} + +/* + * Generates a log message The message will be sent in the stream + * defined by the previous call to rte_openlog_stream(). + * No need to check level here, done by rte_vlog(). + */ +int +rte_log(uint32_t level, uint32_t logtype, const char *format, ...) +{ + va_list ap; + int ret; + + va_start(ap, format); + ret = rte_vlog(level, logtype, format, ap); + va_end(ap); + return ret; +} + +/* + * Called by environment-specific initialization functions. + */ +void +eal_log_set_default(FILE *default_log) +{ + default_log_stream = default_log; + +#if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG + RTE_LOG(NOTICE, EAL, + "Debug dataplane logs available - lower performance\n"); +#endif +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_mcfg.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_mcfg.c new file mode 100644 index 000000000..49d3ed0ce --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_mcfg.c @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include <rte_eal_memconfig.h> +#include <rte_version.h> + +#include "eal_internal_cfg.h" +#include "eal_memcfg.h" +#include "eal_private.h" + +void +eal_mcfg_complete(void) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + struct rte_mem_config *mcfg = cfg->mem_config; + + /* ALL shared mem_config related INIT DONE */ + if (cfg->process_type == RTE_PROC_PRIMARY) + mcfg->magic = RTE_MAGIC; + + internal_config.init_complete = 1; +} + +void +eal_mcfg_wait_complete(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* wait until shared mem_config finish initialising */ + while (mcfg->magic != RTE_MAGIC) + rte_pause(); +} + +int +eal_mcfg_check_version(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* check if version from memconfig matches compiled in macro */ + if (mcfg->version != RTE_VERSION) + return -1; + + return 0; +} + +void +eal_mcfg_update_internal(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + internal_config.legacy_mem = mcfg->legacy_mem; + internal_config.single_file_segments = mcfg->single_file_segments; +} + +void +eal_mcfg_update_from_internal(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + mcfg->legacy_mem = internal_config.legacy_mem; + mcfg->single_file_segments = 
internal_config.single_file_segments; + /* record current DPDK version */ + mcfg->version = RTE_VERSION; +} + +void +rte_mcfg_mem_read_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_mem_read_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_mem_write_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_lock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_mem_write_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock); +} + +void +rte_mcfg_tailq_read_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_lock(&mcfg->qlock); +} + +void +rte_mcfg_tailq_read_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_unlock(&mcfg->qlock); +} + +void +rte_mcfg_tailq_write_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_lock(&mcfg->qlock); +} + +void +rte_mcfg_tailq_write_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_unlock(&mcfg->qlock); +} + +void +rte_mcfg_mempool_read_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_lock(&mcfg->mplock); +} + +void +rte_mcfg_mempool_read_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_read_unlock(&mcfg->mplock); +} + +void +rte_mcfg_mempool_write_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_lock(&mcfg->mplock); +} + +void +rte_mcfg_mempool_write_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_write_unlock(&mcfg->mplock); +} + +void +rte_mcfg_timer_lock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_spinlock_lock(&mcfg->tlock); +} + +void +rte_mcfg_timer_unlock(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_spinlock_unlock(&mcfg->tlock); +} + +bool +rte_mcfg_get_single_file_segments(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + return (bool)mcfg->single_file_segments; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c new file mode 100644 index 000000000..55189d072 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memalloc.c @@ -0,0 +1,363 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include <string.h> + +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_fbarray.h> +#include <rte_memzone.h> +#include <rte_memory.h> +#include <rte_string_fns.h> +#include <rte_rwlock.h> + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" + +struct mem_event_callback_entry { + TAILQ_ENTRY(mem_event_callback_entry) next; + char name[RTE_MEM_EVENT_CALLBACK_NAME_LEN]; + rte_mem_event_callback_t clb; + void *arg; +}; + +struct mem_alloc_validator_entry { + TAILQ_ENTRY(mem_alloc_validator_entry) next; + char name[RTE_MEM_ALLOC_VALIDATOR_NAME_LEN]; + 
rte_mem_alloc_validator_t clb; + int socket_id; + size_t limit; +}; + +/** Double linked list of actions. */ +TAILQ_HEAD(mem_event_callback_entry_list, mem_event_callback_entry); +TAILQ_HEAD(mem_alloc_validator_entry_list, mem_alloc_validator_entry); + +static struct mem_event_callback_entry_list mem_event_callback_list = + TAILQ_HEAD_INITIALIZER(mem_event_callback_list); +static rte_rwlock_t mem_event_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_alloc_validator_entry_list mem_alloc_validator_list = + TAILQ_HEAD_INITIALIZER(mem_alloc_validator_list); +static rte_rwlock_t mem_alloc_validator_rwlock = RTE_RWLOCK_INITIALIZER; + +static struct mem_event_callback_entry * +find_mem_event_callback(const char *name, void *arg) +{ + struct mem_event_callback_entry *r; + + TAILQ_FOREACH(r, &mem_event_callback_list, next) { + if (!strcmp(r->name, name) && r->arg == arg) + break; + } + return r; +} + +static struct mem_alloc_validator_entry * +find_mem_alloc_validator(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *r; + + TAILQ_FOREACH(r, &mem_alloc_validator_list, next) { + if (!strcmp(r->name, name) && r->socket_id == socket_id) + break; + } + return r; +} + +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len) +{ + void *end, *aligned_start, *aligned_end; + size_t pgsz = (size_t)msl->page_sz; + const struct rte_memseg *ms; + + /* for IOVA_VA, it's always contiguous */ + if (rte_eal_iova_mode() == RTE_IOVA_VA && !msl->external) + return true; + + /* for legacy memory, it's always contiguous */ + if (internal_config.legacy_mem) + return true; + + end = RTE_PTR_ADD(start, len); + + /* for nohuge, we check pagemap, otherwise check memseg */ + if (!rte_eal_has_hugepages()) { + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + cur = rte_mem_virt2iova(aligned_start); + expected = cur + pgsz; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + + while (aligned_start < aligned_end) { + cur = rte_mem_virt2iova(aligned_start); + if (cur != expected) + return false; + aligned_start = RTE_PTR_ADD(aligned_start, pgsz); + expected += pgsz; + } + } else { + int start_seg, end_seg, cur_seg; + rte_iova_t cur, expected; + + aligned_start = RTE_PTR_ALIGN_FLOOR(start, pgsz); + aligned_end = RTE_PTR_ALIGN_CEIL(end, pgsz); + + start_seg = RTE_PTR_DIFF(aligned_start, msl->base_va) / + pgsz; + end_seg = RTE_PTR_DIFF(aligned_end, msl->base_va) / + pgsz; + + /* if start and end are on the same page, bail out early */ + if (RTE_PTR_DIFF(aligned_end, aligned_start) == pgsz) + return true; + + /* skip first iteration */ + ms = rte_fbarray_get(&msl->memseg_arr, start_seg); + cur = ms->iova; + expected = cur + pgsz; + + /* if we can't access IOVA addresses, assume non-contiguous */ + if (cur == RTE_BAD_IOVA) + return false; + + for (cur_seg = start_seg + 1; cur_seg < end_seg; + cur_seg++, expected += pgsz) { + ms = rte_fbarray_get(&msl->memseg_arr, cur_seg); + + if (ms->iova != expected) + return false; + } + } + return true; +} + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + if (name == NULL || clb == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, 
RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->arg = arg; + strlcpy(entry->name, name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_event_callback_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' registered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg) +{ + struct mem_event_callback_entry *entry; + int ret, len; + + if (name == NULL) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_EVENT_CALLBACK_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_EVENT_CALLBACK_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_event_rwlock); + + entry = find_mem_event_callback(name, arg); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_event_callback_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem event callback '%s:%p' unregistered\n", + name, arg); + +unlock: + rte_rwlock_write_unlock(&mem_event_rwlock); + return ret; +} + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len) +{ + struct mem_event_callback_entry *entry; + + rte_rwlock_read_lock(&mem_event_rwlock); + + TAILQ_FOREACH(entry, &mem_event_callback_list, next) { + RTE_LOG(DEBUG, EAL, "Calling mem event callback '%s:%p'\n", + entry->name, entry->arg); + entry->clb(event, start, len, entry->arg); + } + + rte_rwlock_read_unlock(&mem_event_rwlock); +} + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + struct mem_alloc_validator_entry *entry; + int ret, len; + if (name == NULL || clb == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + ret = -1; + goto unlock; + } + + /* callback successfully created and is valid, add it to the list */ + entry->clb = clb; + entry->socket_id = socket_id; + entry->limit = limit; + strlcpy(entry->name, name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + TAILQ_INSERT_TAIL(&mem_alloc_validator_list, entry, next); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i with limit %zu registered\n", + name, socket_id, limit); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + struct mem_alloc_validator_entry *entry; + int ret, 
len; + + if (name == NULL || socket_id < 0) { + rte_errno = EINVAL; + return -1; + } + len = strnlen(name, RTE_MEM_ALLOC_VALIDATOR_NAME_LEN); + if (len == 0) { + rte_errno = EINVAL; + return -1; + } else if (len == RTE_MEM_ALLOC_VALIDATOR_NAME_LEN) { + rte_errno = ENAMETOOLONG; + return -1; + } + rte_rwlock_write_lock(&mem_alloc_validator_rwlock); + + entry = find_mem_alloc_validator(name, socket_id); + if (entry == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + TAILQ_REMOVE(&mem_alloc_validator_list, entry, next); + free(entry); + + ret = 0; + + RTE_LOG(DEBUG, EAL, "Mem alloc validator '%s' on socket %i unregistered\n", + name, socket_id); + +unlock: + rte_rwlock_write_unlock(&mem_alloc_validator_rwlock); + return ret; +} + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len) +{ + struct mem_alloc_validator_entry *entry; + int ret = 0; + + rte_rwlock_read_lock(&mem_alloc_validator_rwlock); + + TAILQ_FOREACH(entry, &mem_alloc_validator_list, next) { + if (entry->socket_id != socket_id || entry->limit > new_len) + continue; + RTE_LOG(DEBUG, EAL, "Calling mem alloc validator '%s' on socket %i\n", + entry->name, entry->socket_id); + if (entry->clb(socket_id, entry->limit, new_len) < 0) + ret = -1; + } + + rte_rwlock_read_unlock(&mem_alloc_validator_rwlock); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c new file mode 100644 index 000000000..4c897a13f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memory.c @@ -0,0 +1,939 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <fcntl.h> +#include <errno.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/mman.h> +#include <sys/queue.h> + +#include <rte_fbarray.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_errno.h> +#include <rte_log.h> + +#include "eal_memalloc.h" +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_memcfg.h" +#include "malloc_heap.h" + +/* + * Try to mmap *size bytes in /dev/zero. If it is successful, return the + * pointer to the mmap'd area and keep *size unmodified. Else, retry + * with a smaller zone: decrease *size by hugepage_sz until it reaches + * 0. In this case, return NULL. Note: this function returns an address + * which is a multiple of hugepage size. 
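+ * (Despite the wording above, the current implementation reserves
+ * the area with an anonymous PROT_NONE mapping rather than by
+ * mapping /dev/zero - see eal_get_virtual_area() below.)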
+ */ + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" + +static void *next_baseaddr; +static uint64_t system_page_sz; + +#ifdef RTE_EXEC_ENV_LINUX +#define RTE_DONTDUMP MADV_DONTDUMP +#elif defined RTE_EXEC_ENV_FREEBSD +#define RTE_DONTDUMP MADV_NOCORE +#else +#error "madvise doesn't support this OS" +#endif + +#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5 +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags) +{ + bool addr_is_hint, allow_shrink, unmap, no_align; + uint64_t map_sz; + void *mapped_addr, *aligned_addr; + uint8_t try = 0; + + if (system_page_sz == 0) + system_page_sz = sysconf(_SC_PAGESIZE); + + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + + addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0; + allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0; + unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0; + + if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) internal_config.base_virtaddr; + +#ifdef RTE_ARCH_64 + if (next_baseaddr == NULL && internal_config.base_virtaddr == 0 && + rte_eal_process_type() == RTE_PROC_PRIMARY) + next_baseaddr = (void *) eal_get_baseaddr(); +#endif + if (requested_addr == NULL && next_baseaddr != NULL) { + requested_addr = next_baseaddr; + requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz); + addr_is_hint = true; + } + + /* we don't need alignment of resulting pointer in the following cases: + * + * 1. page size is equal to system size + * 2. we have a requested address, and it is page-aligned, and we will + * be discarding the address if we get a different one. + * + * for all other cases, alignment is potentially necessary. + */ + no_align = (requested_addr != NULL && + requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) && + !addr_is_hint) || + page_sz == system_page_sz; + + do { + map_sz = no_align ? *size : *size + page_sz; + if (map_sz > SIZE_MAX) { + RTE_LOG(ERR, EAL, "Map size too big\n"); + rte_errno = E2BIG; + return NULL; + } + + mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE, + mmap_flags, -1, 0); + if (mapped_addr == MAP_FAILED && allow_shrink) + *size -= page_sz; + + if (mapped_addr != MAP_FAILED && addr_is_hint && + mapped_addr != requested_addr) { + try++; + next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz); + if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) { + /* hint was not used. Try with another offset */ + munmap(mapped_addr, map_sz); + mapped_addr = MAP_FAILED; + requested_addr = next_baseaddr; + } + } + } while ((allow_shrink || addr_is_hint) && + mapped_addr == MAP_FAILED && *size > 0); + + /* align resulting address - if map failed, we will ignore the value + * anyway, so no need to add additional checks. + */ + aligned_addr = no_align ? 
mapped_addr : + RTE_PTR_ALIGN(mapped_addr, page_sz); + + if (*size == 0) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n", + strerror(errno)); + rte_errno = errno; + return NULL; + } else if (mapped_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); + /* pass errno up the call chain */ + rte_errno = errno; + return NULL; + } else if (requested_addr != NULL && !addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n", + requested_addr, aligned_addr); + munmap(mapped_addr, map_sz); + rte_errno = EADDRNOTAVAIL; + return NULL; + } else if (requested_addr != NULL && addr_is_hint && + aligned_addr != requested_addr) { + RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n", + requested_addr, aligned_addr); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n"); + } else if (next_baseaddr != NULL) { + next_baseaddr = RTE_PTR_ADD(aligned_addr, *size); + } + + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", + aligned_addr, *size); + + if (unmap) { + munmap(mapped_addr, map_sz); + } else if (!no_align) { + void *map_end, *aligned_end; + size_t before_len, after_len; + + /* when we reserve space with alignment, we add alignment to + * mapping size. On 32-bit, if 1GB alignment was requested, this + * would waste 1GB of address space, which is a luxury we cannot + * afford. so, if alignment was performed, check if any unneeded + * address space can be unmapped back. + */ + + map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz); + aligned_end = RTE_PTR_ADD(aligned_addr, *size); + + /* unmap space before aligned mmap address */ + before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr); + if (before_len > 0) + munmap(mapped_addr, before_len); + + /* unmap space after aligned end mmap address */ + after_len = RTE_PTR_DIFF(map_end, aligned_end); + if (after_len > 0) + munmap(aligned_end, after_len); + } + + if (!unmap) { + /* Exclude these pages from a core dump. 
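+ * RTE_DONTDUMP is MADV_DONTDUMP on Linux and MADV_NOCORE on
+ * FreeBSD (see the #ifdef near the top of this file); reserved
+ * areas can be very large and would otherwise bloat core dumps.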
*/ + if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0) + RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", + strerror(errno)); + } + + return aligned_addr; +} + +static struct rte_memseg * +virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + const struct rte_fbarray *arr; + void *start, *end; + int ms_idx; + + if (msl == NULL) + return NULL; + + /* a memseg list was specified, check if it's the right one */ + start = msl->base_va; + end = RTE_PTR_ADD(start, msl->len); + + if (addr < start || addr >= end) + return NULL; + + /* now, calculate index */ + arr = &msl->memseg_arr; + ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz; + return rte_fbarray_get(arr, ms_idx); +} + +static struct rte_memseg_list * +virt2memseg_list(const void *addr) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + int msl_idx; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + void *start, *end; + msl = &mcfg->memsegs[msl_idx]; + + start = msl->base_va; + end = RTE_PTR_ADD(start, msl->len); + if (addr >= start && addr < end) + break; + } + /* if we didn't find our memseg list */ + if (msl_idx == RTE_MAX_MEMSEG_LISTS) + return NULL; + return msl; +} + +struct rte_memseg_list * +rte_mem_virt2memseg_list(const void *addr) +{ + return virt2memseg_list(addr); +} + +struct virtiova { + rte_iova_t iova; + void *virt; +}; +static int +find_virt(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} +static int +find_virt_legacy(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, size_t len, void *arg) +{ + struct virtiova *vi = arg; + if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) { + size_t offset = vi->iova - ms->iova; + vi->virt = RTE_PTR_ADD(ms->addr, offset); + /* stop the walk */ + return 1; + } + return 0; +} + +void * +rte_mem_iova2virt(rte_iova_t iova) +{ + struct virtiova vi; + + memset(&vi, 0, sizeof(vi)); + + vi.iova = iova; + /* for legacy mem, we can get away with scanning VA-contiguous segments, + * as we know they are PA-contiguous as well + */ + if (internal_config.legacy_mem) + rte_memseg_contig_walk(find_virt_legacy, &vi); + else + rte_memseg_walk(find_virt, &vi); + + return vi.virt; +} + +struct rte_memseg * +rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl) +{ + return virt2memseg(addr, msl != NULL ? 
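+/* [Editor's note] Round-tripping a virtual address and its IOVA with the
+ * helpers defined in this file; an illustrative sketch, not upstream code.
+ * 'buf' stands for any address inside DPDK-managed memory.
+ *
+ *   static rte_iova_t
+ *   va_to_iova(const void *buf)
+ *   {
+ *       // A NULL memseg list means "find the right list for me".
+ *       const struct rte_memseg *ms = rte_mem_virt2memseg(buf, NULL);
+ *       if (ms == NULL)
+ *           return RTE_BAD_IOVA;
+ *       // Segment IOVA plus the offset of 'buf' within the segment.
+ *       return ms->iova + RTE_PTR_DIFF(buf, ms->addr);
+ *   }
+ *
+ *   // The reverse direction: rte_mem_iova2virt(va_to_iova(buf)) == buf
+ *   // whenever 'buf' lies inside a registered memseg.
+ */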
msl : + rte_mem_virt2memseg_list(addr)); +} + +static int +physmem_size(const struct rte_memseg_list *msl, void *arg) +{ + uint64_t *total_len = arg; + + if (msl->external) + return 0; + + *total_len += msl->memseg_arr.count * msl->page_sz; + + return 0; +} + +/* get the total size of memory */ +uint64_t +rte_eal_get_physmem_size(void) +{ + uint64_t total_len = 0; + + rte_memseg_list_walk(physmem_size, &total_len); + + return total_len; +} + +static int +dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx, ms_idx, fd; + FILE *f = arg; + + msl_idx = msl - mcfg->memsegs; + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) + return -1; + + fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx); + fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, " + "virt:%p, socket_id:%"PRId32", " + "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", " + "nrank:%"PRIx32" fd:%i\n", + msl_idx, ms_idx, + ms->iova, + ms->len, + ms->addr, + ms->socket_id, + ms->hugepage_sz, + ms->nchannel, + ms->nrank, + fd); + + return 0; +} + +/* + * Defining here because declared in rte_memory.h, but the actual implementation + * is in eal_common_memalloc.c, like all other memalloc internals. + */ +int +rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb, + void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_register(name, clb, arg); +} + +int +rte_mem_event_callback_unregister(const char *name, void *arg) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_event_callback_unregister(name, arg); +} + +int +rte_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id, + limit); +} + +int +rte_mem_alloc_validator_unregister(const char *name, int socket_id) +{ + /* FreeBSD boots with legacy mem enabled by default */ + if (internal_config.legacy_mem) { + RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n"); + rte_errno = ENOTSUP; + return -1; + } + return eal_memalloc_mem_alloc_validator_unregister(name, socket_id); +} + +/* Dump the physical memory layout on console */ +void +rte_dump_physmem_layout(FILE *f) +{ + rte_memseg_walk(dump_memseg, f); +} + +static int +check_iova(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + uint64_t *mask = arg; + rte_iova_t iova; + + /* higher address within segment */ + iova = (ms->iova + ms->len) - 1; + if (!(iova & *mask)) + return 0; + + RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n", + ms->iova, ms->len); + + RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask); + return 1; +} + +#define MAX_DMA_MASK_BITS 63 + +/* check memseg iovas are within the required 
range based on dma mask */ +static int +check_dma_mask(uint8_t maskbits, bool thread_unsafe) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t mask; + int ret; + + /* Sanity check. We only check width can be managed with 64 bits + * variables. Indeed any higher value is likely wrong. */ + if (maskbits > MAX_DMA_MASK_BITS) { + RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n", + maskbits, MAX_DMA_MASK_BITS); + return -1; + } + + /* create dma mask */ + mask = ~((1ULL << maskbits) - 1); + + if (thread_unsafe) + ret = rte_memseg_walk_thread_unsafe(check_iova, &mask); + else + ret = rte_memseg_walk(check_iova, &mask); + + if (ret) + /* + * Dma mask precludes hugepage usage. + * This device can not be used and we do not need to keep + * the dma mask. + */ + return 1; + + /* + * we need to keep the more restricted maskbit for checking + * potential dynamic memory allocation in the future. + */ + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); + + return 0; +} + +int +rte_mem_check_dma_mask(uint8_t maskbits) +{ + return check_dma_mask(maskbits, false); +} + +int +rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits) +{ + return check_dma_mask(maskbits, true); +} + +/* + * Set dma mask to use when memory initialization is done. + * + * This function should ONLY be used by code executed before the memory + * initialization. PMDs should use rte_mem_check_dma_mask if addressing + * limitations by the device. + */ +void +rte_mem_set_dma_mask(uint8_t maskbits) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits : + RTE_MIN(mcfg->dma_maskbits, maskbits); +} + +/* return the number of memory channels */ +unsigned rte_memory_get_nchannel(void) +{ + return rte_eal_get_configuration()->mem_config->nchannel; +} + +/* return the number of memory rank */ +unsigned rte_memory_get_nrank(void) +{ + return rte_eal_get_configuration()->mem_config->nrank; +} + +static int +rte_eal_memdevice_init(void) +{ + struct rte_config *config; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + return 0; + + config = rte_eal_get_configuration(); + config->mem_config->nchannel = internal_config.force_nchannel; + config->mem_config->nrank = internal_config.force_nrank; + + return 0; +} + +/* Lock page in physical memory and prevent from swapping. */ +int +rte_mem_lock_page(const void *virt) +{ + unsigned long virtual = (unsigned long)virt; + int page_size = getpagesize(); + unsigned long aligned = (virtual & ~(page_size - 1)); + return mlock((void *)aligned, page_size); +} + +int +rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + int n_segs; + size_t len; + + ms = rte_fbarray_get(arr, ms_idx); + + /* find how many more segments there are, starting with + * this one. 
+ */ + n_segs = rte_fbarray_find_contig_used(arr, ms_idx); + len = n_segs * msl->page_sz; + + ret = func(msl, ms, len, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, + ms_idx + n_segs); + } + } + return 0; +} + +int +rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) +{ + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_mcfg_mem_read_lock(); + ret = rte_memseg_contig_walk_thread_unsafe(func, arg); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ms_idx, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + const struct rte_memseg *ms; + struct rte_fbarray *arr; + + if (msl->memseg_arr.count == 0) + continue; + + arr = &msl->memseg_arr; + + ms_idx = rte_fbarray_find_next_used(arr, 0); + while (ms_idx >= 0) { + ms = rte_fbarray_get(arr, ms_idx); + ret = func(msl, ms, arg); + if (ret) + return ret; + ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1); + } + } + return 0; +} + +int +rte_memseg_walk(rte_memseg_walk_t func, void *arg) +{ + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_mcfg_mem_read_lock(); + ret = rte_memseg_walk_thread_unsafe(func, arg); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret) + return ret; + } + return 0; +} + +int +rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg) +{ + int ret = 0; + + /* do not allow allocations/frees/init while we iterate */ + rte_mcfg_mem_read_lock(); + ret = rte_memseg_list_walk_thread_unsafe(func, arg); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + /* segment fd API is not supported for external segments */ + if (msl->external) { + rte_errno = ENOTSUP; + return -1; + } + + ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int +rte_memseg_get_fd(const struct rte_memseg *ms) +{ + int ret; + + rte_mcfg_mem_read_lock(); + ret = rte_memseg_get_fd_thread_unsafe(ms); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms, + size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int msl_idx, seg_idx, ret; + + if (ms == NULL || offset == NULL) { + rte_errno = EINVAL; + return -1; + } + + msl = rte_mem_virt2memseg_list(ms->addr); + if (msl == NULL) { + 
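+/* [Editor's note] Minimal use of the walk API above; an illustrative
+ * sketch, not upstream code. A callback returns 0 to continue and non-zero
+ * to stop; a positive value is passed through to the walk's caller.
+ *
+ *   static int
+ *   sum_seg_len(const struct rte_memseg_list *msl __rte_unused,
+ *               const struct rte_memseg *ms, void *arg)
+ *   {
+ *       size_t *total = arg;
+ *       *total += ms->len;          // accumulate each used segment
+ *       return 0;                   // keep walking
+ *   }
+ *
+ *   // From application code:
+ *   size_t total = 0;
+ *   rte_memseg_walk(sum_seg_len, &total);
+ */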
rte_errno = EINVAL; + return -1; + } + arr = &msl->memseg_arr; + + msl_idx = msl - mcfg->memsegs; + seg_idx = rte_fbarray_find_idx(arr, ms); + + if (!rte_fbarray_is_used(arr, seg_idx)) { + rte_errno = ENOENT; + return -1; + } + + /* segment fd API is not supported for external segments */ + if (msl->external) { + rte_errno = ENOTSUP; + return -1; + } + + ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset); + if (ret < 0) { + rte_errno = -ret; + ret = -1; + } + return ret; +} + +int +rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset) +{ + int ret; + + rte_mcfg_mem_read_lock(); + ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset); + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int socket_id, n; + int ret = 0; + + if (va_addr == NULL || page_sz == 0 || len == 0 || + !rte_is_power_of_2(page_sz) || + RTE_ALIGN(len, page_sz) != len || + ((len / page_sz) != n_pages && iova_addrs != NULL) || + !rte_is_aligned(va_addr, page_sz)) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* make sure the segment doesn't already exist */ + if (malloc_heap_find_external_seg(va_addr, len) != NULL) { + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + + /* get next available socket ID */ + socket_id = mcfg->next_socket_id; + if (socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we can create a new memseg */ + n = len / page_sz; + if (malloc_heap_create_external_seg(va_addr, iova_addrs, n, + page_sz, "extmem", socket_id) == NULL) { + ret = -1; + goto unlock; + } + + /* memseg list successfully created - increment next socket ID */ + mcfg->next_socket_id++; +unlock: + rte_mcfg_mem_write_unlock(); + return ret; +} + +int +rte_extmem_unregister(void *va_addr, size_t len) +{ + struct rte_memseg_list *msl; + int ret = 0; + + if (va_addr == NULL || len == 0) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* find our segment */ + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + + ret = malloc_heap_destroy_external_seg(msl); +unlock: + rte_mcfg_mem_write_unlock(); + return ret; +} + +static int +sync_memory(void *va_addr, size_t len, bool attach) +{ + struct rte_memseg_list *msl; + int ret = 0; + + if (va_addr == NULL || len == 0) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* find our segment */ + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (attach) + ret = rte_fbarray_attach(&msl->memseg_arr); + else + ret = rte_fbarray_detach(&msl->memseg_arr); + +unlock: + rte_mcfg_mem_write_unlock(); + return ret; +} + +int +rte_extmem_attach(void *va_addr, size_t len) +{ + return sync_memory(va_addr, len, true); +} + +int +rte_extmem_detach(void *va_addr, size_t len) +{ + return sync_memory(va_addr, len, false); +} + +/* init memory subsystem */ +int +rte_eal_memory_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int retval; + RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n"); + + if (!mcfg) + return -1; + + /* lock mem hotplug here, to prevent races while we init */ + rte_mcfg_mem_read_lock(); + + if 
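+/* [Editor's note] Registering externally allocated memory through the API
+ * above; a minimal sketch, not upstream code. 'ext_mem' and 'len' are
+ * assumed names: a page-aligned buffer whose length is a multiple of
+ * 'page_sz'. Error handling is omitted for brevity.
+ *
+ *   size_t page_sz = RTE_PGSIZE_2M;
+ *   unsigned int n_pages = len / page_sz;
+ *
+ *   // iova_addrs may be NULL when the memory is only ever used via VA.
+ *   rte_extmem_register(ext_mem, len, NULL, n_pages, page_sz);
+ *   // ... use the memory (e.g. attach it to a malloc heap) ...
+ *   rte_extmem_unregister(ext_mem, len);
+ */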
(rte_eal_memseg_init() < 0) + goto fail; + + if (eal_memalloc_init() < 0) + goto fail; + + retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? + rte_eal_hugepage_init() : + rte_eal_hugepage_attach(); + if (retval < 0) + goto fail; + + if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0) + goto fail; + + return 0; +fail: + rte_mcfg_mem_read_unlock(); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c new file mode 100644 index 000000000..7c21aa921 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_memzone.c @@ -0,0 +1,420 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stdarg.h> +#include <inttypes.h> +#include <string.h> +#include <errno.h> +#include <sys/queue.h> + +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_errno.h> +#include <rte_string_fns.h> +#include <rte_common.h> +#include <rte_eal_trace.h> + +#include "malloc_heap.h" +#include "malloc_elem.h" +#include "eal_private.h" +#include "eal_memcfg.h" + +static inline const struct rte_memzone * +memzone_lookup_thread_unsafe(const char *name) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + const struct rte_memzone *mz; + int i = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + /* + * the algorithm is not optimal (linear), but there are few + * zones and this function should be called at init only + */ + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + mz = rte_fbarray_get(arr, i); + if (mz->addr != NULL && + !strncmp(name, mz->name, RTE_MEMZONE_NAMESIZE)) + return mz; + i = rte_fbarray_find_next_used(arr, i + 1); + } + return NULL; +} + +static const struct rte_memzone * +memzone_reserve_aligned_thread_unsafe(const char *name, size_t len, + int socket_id, unsigned int flags, unsigned int align, + unsigned int bound) +{ + struct rte_memzone *mz; + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + void *mz_addr; + size_t requested_len; + int mz_idx; + bool contig; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + /* no more room in config */ + if (arr->count >= arr->len) { + RTE_LOG(ERR, EAL, + "%s(): Number of requested memzone segments exceeds RTE_MAX_MEMZONE\n", + __func__); + rte_errno = ENOSPC; + return NULL; + } + + if (strlen(name) > sizeof(mz->name) - 1) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s>: name too long\n", + __func__, name); + rte_errno = ENAMETOOLONG; + return NULL; + } + + /* zone already exist */ + if ((memzone_lookup_thread_unsafe(name)) != NULL) { + RTE_LOG(DEBUG, EAL, "%s(): memzone <%s> already exists\n", + __func__, name); + rte_errno = EEXIST; + return NULL; + } + + /* if alignment is not a power of two */ + if (align && !rte_is_power_of_2(align)) { + RTE_LOG(ERR, EAL, "%s(): Invalid alignment: %u\n", __func__, + align); + rte_errno = EINVAL; + return NULL; + } + + /* alignment less than cache size is not allowed */ + if (align < RTE_CACHE_LINE_SIZE) + align = RTE_CACHE_LINE_SIZE; + + /* align length on cache boundary. 
Check for overflow before doing so */ + if (len > SIZE_MAX - RTE_CACHE_LINE_MASK) { + rte_errno = EINVAL; /* requested size too big */ + return NULL; + } + + len = RTE_ALIGN_CEIL(len, RTE_CACHE_LINE_SIZE); + + /* save minimal requested length */ + requested_len = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, len); + + /* check that boundary condition is valid */ + if (bound != 0 && (requested_len > bound || !rte_is_power_of_2(bound))) { + rte_errno = EINVAL; + return NULL; + } + + if ((socket_id != SOCKET_ID_ANY) && socket_id < 0) { + rte_errno = EINVAL; + return NULL; + } + + /* only set socket to SOCKET_ID_ANY if we aren't allocating for an + * external heap. + */ + if (!rte_eal_has_hugepages() && socket_id < RTE_MAX_NUMA_NODES) + socket_id = SOCKET_ID_ANY; + + contig = (flags & RTE_MEMZONE_IOVA_CONTIG) != 0; + /* malloc only cares about size flags, remove contig flag from flags */ + flags &= ~RTE_MEMZONE_IOVA_CONTIG; + + if (len == 0 && bound == 0) { + /* no size constraints were placed, so use malloc elem len */ + requested_len = 0; + mz_addr = malloc_heap_alloc_biggest(NULL, socket_id, flags, + align, contig); + } else { + if (len == 0) + requested_len = bound; + /* allocate memory on heap */ + mz_addr = malloc_heap_alloc(NULL, requested_len, socket_id, + flags, align, bound, contig); + } + if (mz_addr == NULL) { + rte_errno = ENOMEM; + return NULL; + } + + struct malloc_elem *elem = malloc_elem_from_data(mz_addr); + + /* fill the zone in config */ + mz_idx = rte_fbarray_find_next_free(arr, 0); + + if (mz_idx < 0) { + mz = NULL; + } else { + rte_fbarray_set_used(arr, mz_idx); + mz = rte_fbarray_get(arr, mz_idx); + } + + if (mz == NULL) { + RTE_LOG(ERR, EAL, "%s(): Cannot find free memzone\n", __func__); + malloc_heap_free(elem); + rte_errno = ENOSPC; + return NULL; + } + + strlcpy(mz->name, name, sizeof(mz->name)); + mz->iova = rte_malloc_virt2iova(mz_addr); + mz->addr = mz_addr; + mz->len = requested_len == 0 ? + elem->size - elem->pad - MALLOC_ELEM_OVERHEAD : + requested_len; + mz->hugepage_sz = elem->msl->page_sz; + mz->socket_id = elem->msl->socket_id; + mz->flags = 0; + + return mz; +} + +static const struct rte_memzone * +rte_memzone_reserve_thread_safe(const char *name, size_t len, int socket_id, + unsigned int flags, unsigned int align, unsigned int bound) +{ + struct rte_mem_config *mcfg; + const struct rte_memzone *mz = NULL; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_write_lock(&mcfg->mlock); + + mz = memzone_reserve_aligned_thread_unsafe( + name, len, socket_id, flags, align, bound); + + rte_eal_trace_memzone_reserve(name, len, socket_id, flags, align, + bound, mz); + + rte_rwlock_write_unlock(&mcfg->mlock); + + return mz; +} + +/* + * Return a pointer to a correctly filled memzone descriptor (with a + * specified alignment and boundary). If the allocation cannot be done, + * return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve_bounded(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align, unsigned bound) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, flags, + align, bound); +} + +/* + * Return a pointer to a correctly filled memzone descriptor (with a + * specified alignment). If the allocation cannot be done, return NULL. 
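+ *
+ * [Editor's note] An illustrative lifecycle, not part of the upstream
+ * comment; "rx_ring" is an arbitrary example name:
+ *
+ *   const struct rte_memzone *mz =
+ *       rte_memzone_reserve_aligned("rx_ring", 16384, rte_socket_id(),
+ *                                   0, 4096);     // 4 KiB alignment
+ *   const struct rte_memzone *same = rte_memzone_lookup("rx_ring");
+ *   rte_memzone_free(mz);                         // 'same' == 'mz'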
+ */ +const struct rte_memzone * +rte_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, flags, + align, 0); +} + +/* + * Return a pointer to a correctly filled memzone descriptor. If the + * allocation cannot be done, return NULL. + */ +const struct rte_memzone * +rte_memzone_reserve(const char *name, size_t len, int socket_id, + unsigned flags) +{ + return rte_memzone_reserve_thread_safe(name, len, socket_id, + flags, RTE_CACHE_LINE_SIZE, 0); +} + +int +rte_memzone_free(const struct rte_memzone *mz) +{ + char name[RTE_MEMZONE_NAMESIZE]; + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + struct rte_memzone *found_mz; + int ret = 0; + void *addr = NULL; + unsigned idx; + + if (mz == NULL) + return -EINVAL; + + rte_strlcpy(name, mz->name, RTE_MEMZONE_NAMESIZE); + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + rte_rwlock_write_lock(&mcfg->mlock); + + idx = rte_fbarray_find_idx(arr, mz); + found_mz = rte_fbarray_get(arr, idx); + + if (found_mz == NULL) { + ret = -EINVAL; + } else if (found_mz->addr == NULL) { + RTE_LOG(ERR, EAL, "Memzone is not allocated\n"); + ret = -EINVAL; + } else { + addr = found_mz->addr; + memset(found_mz, 0, sizeof(*found_mz)); + rte_fbarray_set_free(arr, idx); + } + + rte_rwlock_write_unlock(&mcfg->mlock); + + if (addr != NULL) + rte_free(addr); + + rte_eal_trace_memzone_free(name, addr, ret); + return ret; +} + +/* + * Lookup for the memzone identified by the given name + */ +const struct rte_memzone * +rte_memzone_lookup(const char *name) +{ + struct rte_mem_config *mcfg; + const struct rte_memzone *memzone = NULL; + + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_read_lock(&mcfg->mlock); + + memzone = memzone_lookup_thread_unsafe(name); + + rte_rwlock_read_unlock(&mcfg->mlock); + + rte_eal_trace_memzone_lookup(name, memzone); + return memzone; +} + +static void +dump_memzone(const struct rte_memzone *mz, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl = NULL; + void *cur_addr, *mz_end; + struct rte_memseg *ms; + int mz_idx, ms_idx; + size_t page_sz; + FILE *f = arg; + + mz_idx = rte_fbarray_find_idx(&mcfg->memzones, mz); + + fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, " + "socket_id:%"PRId32", flags:%"PRIx32"\n", + mz_idx, + mz->name, + mz->len, + mz->addr, + mz->socket_id, + mz->flags); + + /* go through each page occupied by this memzone */ + msl = rte_mem_virt2memseg_list(mz->addr); + if (!msl) { + RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n"); + return; + } + page_sz = (size_t)mz->hugepage_sz; + cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz); + mz_end = RTE_PTR_ADD(cur_addr, mz->len); + + fprintf(f, "physical segments used:\n"); + ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz; + ms = rte_fbarray_get(&msl->memseg_arr, ms_idx); + + do { + fprintf(f, " addr: %p iova: 0x%" PRIx64 " " + "len: 0x%zx " + "pagesz: 0x%zx\n", + cur_addr, ms->iova, ms->len, page_sz); + + /* advance VA to next page */ + cur_addr = RTE_PTR_ADD(cur_addr, page_sz); + + /* memzones occupy contiguous segments */ + ++ms; + } while (cur_addr < mz_end); +} + +/* Dump all reserved memory zones on console */ +void +rte_memzone_dump(FILE *f) +{ + rte_memzone_walk(dump_memzone, f); +} + +/* + * Init the memzone subsystem + */ +int +rte_eal_memzone_init(void) +{ + struct rte_mem_config *mcfg; + int ret = 0; + + /* get pointer to global 
configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + rte_rwlock_write_lock(&mcfg->mlock); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + rte_fbarray_init(&mcfg->memzones, "memzone", + RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) { + RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n"); + ret = -1; + } else if (rte_eal_process_type() == RTE_PROC_SECONDARY && + rte_fbarray_attach(&mcfg->memzones)) { + RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n"); + ret = -1; + } + + rte_rwlock_write_unlock(&mcfg->mlock); + + return ret; +} + +/* Walk all reserved memory zones */ +void rte_memzone_walk(void (*func)(const struct rte_memzone *, void *), + void *arg) +{ + struct rte_mem_config *mcfg; + struct rte_fbarray *arr; + int i; + + mcfg = rte_eal_get_configuration()->mem_config; + arr = &mcfg->memzones; + + rte_rwlock_read_lock(&mcfg->mlock); + i = rte_fbarray_find_next_used(arr, 0); + while (i >= 0) { + struct rte_memzone *mz = rte_fbarray_get(arr, i); + (*func)(mz, arg); + i = rte_fbarray_find_next_used(arr, i + 1); + } + rte_rwlock_read_unlock(&mcfg->mlock); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c new file mode 100644 index 000000000..8f2cbd1c6 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_options.c @@ -0,0 +1,1861 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. + */ + +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <syslog.h> +#endif +#include <ctype.h> +#include <limits.h> +#include <errno.h> +#include <getopt.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <dlfcn.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <dirent.h> + +#include <rte_string_fns.h> +#include <rte_eal.h> +#include <rte_log.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_tailq.h> +#include <rte_version.h> +#include <rte_devargs.h> +#include <rte_memcpy.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <rte_telemetry.h> +#endif + +#include "eal_internal_cfg.h" +#include "eal_options.h" +#include "eal_filesystem.h" +#include "eal_private.h" +#ifndef RTE_EXEC_ENV_WINDOWS +#include "eal_trace.h" +#endif + +#define BITS_PER_HEX 4 +#define LCORE_OPT_LST 1 +#define LCORE_OPT_MSK 2 +#define LCORE_OPT_MAP 3 + +const char +eal_short_options[] = + "b:" /* pci-blacklist */ + "c:" /* coremask */ + "s:" /* service coremask */ + "d:" /* driver */ + "h" /* help */ + "l:" /* corelist */ + "S:" /* service corelist */ + "m:" /* memory size */ + "n:" /* memory channels */ + "r:" /* memory ranks */ + "v" /* version */ + "w:" /* pci-whitelist */ + ; + +const struct option +eal_long_options[] = { + {OPT_BASE_VIRTADDR, 1, NULL, OPT_BASE_VIRTADDR_NUM }, + {OPT_CREATE_UIO_DEV, 0, NULL, OPT_CREATE_UIO_DEV_NUM }, + {OPT_FILE_PREFIX, 1, NULL, OPT_FILE_PREFIX_NUM }, + {OPT_HELP, 0, NULL, OPT_HELP_NUM }, + {OPT_HUGE_DIR, 1, NULL, OPT_HUGE_DIR_NUM }, + {OPT_HUGE_UNLINK, 0, NULL, OPT_HUGE_UNLINK_NUM }, + {OPT_IOVA_MODE, 1, NULL, OPT_IOVA_MODE_NUM }, + {OPT_LCORES, 1, NULL, OPT_LCORES_NUM }, + {OPT_LOG_LEVEL, 1, NULL, OPT_LOG_LEVEL_NUM }, + {OPT_TRACE, 1, NULL, OPT_TRACE_NUM }, + {OPT_TRACE_DIR, 1, NULL, OPT_TRACE_DIR_NUM }, + {OPT_TRACE_BUF_SIZE, 1, NULL, OPT_TRACE_BUF_SIZE_NUM }, + {OPT_TRACE_MODE, 1, NULL, OPT_TRACE_MODE_NUM }, + {OPT_MASTER_LCORE, 1, NULL, OPT_MASTER_LCORE_NUM }, + {OPT_MBUF_POOL_OPS_NAME, 1, NULL, OPT_MBUF_POOL_OPS_NAME_NUM}, + {OPT_NO_HPET, 
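+/* [Editor's note] Illustrative reading of this table, not upstream code:
+ * each entry pairs an option name with getopt_long()'s has_arg flag
+ * (1 = required argument, 0 = none), so "--no-hpet" takes no value while
+ * "--huge-dir" requires one. The mount point below is made up:
+ *
+ *   ./app -l 0-1 --no-hpet --huge-dir /mnt/huge
+ */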
0, NULL, OPT_NO_HPET_NUM }, + {OPT_NO_HUGE, 0, NULL, OPT_NO_HUGE_NUM }, + {OPT_NO_PCI, 0, NULL, OPT_NO_PCI_NUM }, + {OPT_NO_SHCONF, 0, NULL, OPT_NO_SHCONF_NUM }, + {OPT_IN_MEMORY, 0, NULL, OPT_IN_MEMORY_NUM }, + {OPT_PCI_BLACKLIST, 1, NULL, OPT_PCI_BLACKLIST_NUM }, + {OPT_PCI_WHITELIST, 1, NULL, OPT_PCI_WHITELIST_NUM }, + {OPT_PROC_TYPE, 1, NULL, OPT_PROC_TYPE_NUM }, + {OPT_SOCKET_MEM, 1, NULL, OPT_SOCKET_MEM_NUM }, + {OPT_SOCKET_LIMIT, 1, NULL, OPT_SOCKET_LIMIT_NUM }, + {OPT_SYSLOG, 1, NULL, OPT_SYSLOG_NUM }, + {OPT_VDEV, 1, NULL, OPT_VDEV_NUM }, + {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM }, + {OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM }, + {OPT_LEGACY_MEM, 0, NULL, OPT_LEGACY_MEM_NUM }, + {OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM}, + {OPT_MATCH_ALLOCATIONS, 0, NULL, OPT_MATCH_ALLOCATIONS_NUM}, + {OPT_TELEMETRY, 0, NULL, OPT_TELEMETRY_NUM }, + {OPT_NO_TELEMETRY, 0, NULL, OPT_NO_TELEMETRY_NUM }, + {0, 0, NULL, 0 } +}; + +TAILQ_HEAD(shared_driver_list, shared_driver); + +/* Definition for shared object drivers. */ +struct shared_driver { + TAILQ_ENTRY(shared_driver) next; + + char name[PATH_MAX]; + void* lib_handle; +}; + +/* List of external loadable drivers */ +static struct shared_driver_list solib_list = +TAILQ_HEAD_INITIALIZER(solib_list); + +/* Default path of external loadable drivers */ +static const char *default_solib_dir = RTE_EAL_PMD_PATH; + +/* + * Stringified version of solib path used by dpdk-pmdinfo.py + * Note: PLEASE DO NOT ALTER THIS without making a corresponding + * change to usertools/dpdk-pmdinfo.py + */ +static const char dpdk_solib_path[] __rte_used = +"DPDK_PLUGIN_PATH=" RTE_EAL_PMD_PATH; + +TAILQ_HEAD(device_option_list, device_option); + +struct device_option { + TAILQ_ENTRY(device_option) next; + + enum rte_devtype type; + char arg[]; +}; + +static struct device_option_list devopt_list = +TAILQ_HEAD_INITIALIZER(devopt_list); + +static int master_lcore_parsed; +static int mem_parsed; +static int core_parsed; + +#ifndef RTE_EXEC_ENV_WINDOWS +static char **eal_args; +static char **eal_app_args; + +#define EAL_PARAM_REQ "/eal/params" +#define EAL_APP_PARAM_REQ "/eal/app_params" + +/* callback handler for telemetry library to report out EAL flags */ +int +handle_eal_info_request(const char *cmd, const char *params __rte_unused, + struct rte_tel_data *d) +{ + char **args; + int used = 0; + int i = 0; + + if (strcmp(cmd, EAL_PARAM_REQ) == 0) + args = eal_args; + else + args = eal_app_args; + + rte_tel_data_start_array(d, RTE_TEL_STRING_VAL); + if (args == NULL || args[0] == NULL) + return 0; + + for ( ; args[i] != NULL; i++) + used = rte_tel_data_add_array_string(d, args[i]); + return used; +} + +int +eal_save_args(int argc, char **argv) +{ + int i, j; + + rte_telemetry_register_cmd(EAL_PARAM_REQ, handle_eal_info_request, + "Returns EAL commandline parameters used. Takes no parameters"); + rte_telemetry_register_cmd(EAL_APP_PARAM_REQ, handle_eal_info_request, + "Returns app commandline parameters used. Takes no parameters"); + + /* clone argv to report out later. 
We overprovision, but + * this does not waste huge amounts of memory + */ + eal_args = calloc(argc + 1, sizeof(*eal_args)); + if (eal_args == NULL) + return -1; + + for (i = 0; i < argc; i++) { + eal_args[i] = strdup(argv[i]); + if (strcmp(argv[i], "--") == 0) + break; + } + eal_args[i++] = NULL; /* always finish with NULL */ + + /* allow reporting of any app args we know about too */ + if (i >= argc) + return 0; + + eal_app_args = calloc(argc - i + 1, sizeof(*eal_args)); + if (eal_app_args == NULL) + return -1; + + for (j = 0; i < argc; j++, i++) + eal_app_args[j] = strdup(argv[i]); + eal_app_args[j] = NULL; + + return 0; +} +#endif + +static int +eal_option_device_add(enum rte_devtype type, const char *optarg) +{ + struct device_option *devopt; + size_t optlen; + int ret; + + optlen = strlen(optarg) + 1; + devopt = calloc(1, sizeof(*devopt) + optlen); + if (devopt == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate device option\n"); + return -ENOMEM; + } + + devopt->type = type; + ret = strlcpy(devopt->arg, optarg, optlen); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Unable to copy device option\n"); + free(devopt); + return -EINVAL; + } + TAILQ_INSERT_TAIL(&devopt_list, devopt, next); + return 0; +} + +int +eal_option_device_parse(void) +{ + struct device_option *devopt; + void *tmp; + int ret = 0; + + TAILQ_FOREACH_SAFE(devopt, &devopt_list, next, tmp) { + if (ret == 0) { + ret = rte_devargs_add(devopt->type, devopt->arg); + if (ret) + RTE_LOG(ERR, EAL, "Unable to parse device '%s'\n", + devopt->arg); + } + TAILQ_REMOVE(&devopt_list, devopt, next); + free(devopt); + } + return ret; +} + +const char * +eal_get_hugefile_prefix(void) +{ + if (internal_config.hugefile_prefix != NULL) + return internal_config.hugefile_prefix; + return HUGEFILE_PREFIX_DEFAULT; +} + +void +eal_reset_internal_config(struct internal_config *internal_cfg) +{ + int i; + + internal_cfg->memory = 0; + internal_cfg->force_nrank = 0; + internal_cfg->force_nchannel = 0; + internal_cfg->hugefile_prefix = NULL; + internal_cfg->hugepage_dir = NULL; + internal_cfg->force_sockets = 0; + /* zero out the NUMA config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_mem[i] = 0; + internal_cfg->force_socket_limits = 0; + /* zero out the NUMA limits config */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->socket_limit[i] = 0; + /* zero out hugedir descriptors */ + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) { + memset(&internal_cfg->hugepage_info[i], 0, + sizeof(internal_cfg->hugepage_info[0])); + internal_cfg->hugepage_info[i].lock_descriptor = -1; + } + internal_cfg->base_virtaddr = 0; + +#ifdef LOG_DAEMON + internal_cfg->syslog_facility = LOG_DAEMON; +#endif + + /* if set to NONE, interrupt mode is determined automatically */ + internal_cfg->vfio_intr_mode = RTE_INTR_MODE_NONE; + +#ifdef RTE_LIBEAL_USE_HPET + internal_cfg->no_hpet = 0; +#else + internal_cfg->no_hpet = 1; +#endif + internal_cfg->vmware_tsc_map = 0; + internal_cfg->create_uio_dev = 0; + internal_cfg->iova_mode = RTE_IOVA_DC; + internal_cfg->user_mbuf_pool_ops_name = NULL; + CPU_ZERO(&internal_cfg->ctrl_cpuset); + internal_cfg->init_complete = 0; +} + +static int +eal_plugin_add(const char *path) +{ + struct shared_driver *solib; + + solib = malloc(sizeof(*solib)); + if (solib == NULL) { + RTE_LOG(ERR, EAL, "malloc(solib) failed\n"); + return -1; + } + memset(solib, 0, sizeof(*solib)); + strlcpy(solib->name, path, PATH_MAX-1); + solib->name[PATH_MAX-1] = 0; + TAILQ_INSERT_TAIL(&solib_list, solib, next); + + return 0; +} + +static int 
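+/* [Editor's note] How the argv split in eal_save_args() above plays out;
+ * an illustrative example (paths and flags are made up), not upstream
+ * code. Everything before "--" is kept in eal_args and served via the
+ * /eal/params telemetry command; everything after it goes to eal_app_args
+ * (/eal/app_params):
+ *
+ *   ./app -l 0-3 -n 4 -d /path/to/plugins -- --app-flag=1
+ *         (EAL arguments)                    (application arguments)
+ */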
+eal_plugindir_init(const char *path) +{ + DIR *d = NULL; + struct dirent *dent = NULL; + char sopath[PATH_MAX]; + + if (path == NULL || *path == '\0') + return 0; + + d = opendir(path); + if (d == NULL) { + RTE_LOG(ERR, EAL, "failed to open directory %s: %s\n", + path, strerror(errno)); + return -1; + } + + while ((dent = readdir(d)) != NULL) { + struct stat sb; + + snprintf(sopath, sizeof(sopath), "%s/%s", path, dent->d_name); + + if (!(stat(sopath, &sb) == 0 && S_ISREG(sb.st_mode))) + continue; + + if (eal_plugin_add(sopath) == -1) + break; + } + + closedir(d); + /* XXX this ignores failures from readdir() itself */ + return (dent == NULL) ? 0 : -1; +} + +int +eal_plugins_init(void) +{ +#ifndef RTE_EXEC_ENV_WINDOWS + struct shared_driver *solib = NULL; + struct stat sb; + + if (*default_solib_dir != '\0' && stat(default_solib_dir, &sb) == 0 && + S_ISDIR(sb.st_mode)) + eal_plugin_add(default_solib_dir); + + TAILQ_FOREACH(solib, &solib_list, next) { + + if (stat(solib->name, &sb) == 0 && S_ISDIR(sb.st_mode)) { + if (eal_plugindir_init(solib->name) == -1) { + RTE_LOG(ERR, EAL, + "Cannot init plugin directory %s\n", + solib->name); + return -1; + } + } else { + RTE_LOG(DEBUG, EAL, "open shared lib %s\n", + solib->name); + solib->lib_handle = dlopen(solib->name, RTLD_NOW); + if (solib->lib_handle == NULL) { + RTE_LOG(ERR, EAL, "%s\n", dlerror()); + return -1; + } + } + + } + return 0; +#endif +} + +/* + * Parse the coremask given as argument (hexadecimal string) and fill + * the global configuration (core role and core count) with the parsed + * value. + */ +static int xdigit2val(unsigned char c) +{ + int val; + + if (isdigit(c)) + val = c - '0'; + else if (isupper(c)) + val = c - 'A' + 10; + else + val = c - 'a' + 10; + return val; +} + +static int +eal_parse_service_coremask(const char *coremask) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, j, idx = 0; + unsigned int count = 0; + char c; + int val; + uint32_t taken_lcore_count = 0; + + if (coremask == NULL) + return -1; + /* Remove all blank characters ahead and after . + * Remove 0x/0X if exists. + */ + while (isblank(*coremask)) + coremask++; + if (coremask[0] == '0' && ((coremask[1] == 'x') + || (coremask[1] == 'X'))) + coremask += 2; + i = strlen(coremask); + while ((i > 0) && isblank(coremask[i - 1])) + i--; + + if (i == 0) + return -1; + + for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE; i--) { + c = coremask[i]; + if (isxdigit(c) == 0) { + /* invalid characters */ + return -1; + } + val = xdigit2val(c); + for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE; + j++, idx++) { + if ((1 << j) & val) { + /* handle master lcore already parsed */ + uint32_t lcore = idx; + if (master_lcore_parsed && + cfg->master_lcore == lcore) { + RTE_LOG(ERR, EAL, + "lcore %u is master lcore, cannot use as service core\n", + idx); + return -1; + } + + if (eal_cpu_detected(idx) == 0) { + RTE_LOG(ERR, EAL, + "lcore %u unavailable\n", idx); + return -1; + } + + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + + lcore_config[idx].core_role = ROLE_SERVICE; + count++; + } + } + } + + for (; i >= 0; i--) + if (coremask[i] != '0') + return -1; + + for (; idx < RTE_MAX_LCORE; idx++) + lcore_config[idx].core_index = -1; + + if (count == 0) + return -1; + + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores are in the coremask. 
" + "Please ensure -c or -l includes service cores\n"); + } + + cfg->service_lcore_count = count; + return 0; +} + +static int +eal_service_cores_parsed(void) +{ + int idx; + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (lcore_config[idx].core_role == ROLE_SERVICE) + return 1; + } + return 0; +} + +static int +update_lcore_config(int *cores) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int count = 0; + unsigned int i; + int ret = 0; + + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (cores[i] != -1) { + if (eal_cpu_detected(i) == 0) { + RTE_LOG(ERR, EAL, "lcore %u unavailable\n", i); + ret = -1; + continue; + } + cfg->lcore_role[i] = ROLE_RTE; + count++; + } else { + cfg->lcore_role[i] = ROLE_OFF; + } + lcore_config[i].core_index = cores[i]; + } + if (!ret) + cfg->lcore_count = count; + return ret; +} + +static int +eal_parse_coremask(const char *coremask, int *cores) +{ + unsigned count = 0; + int i, j, idx; + int val; + char c; + + for (idx = 0; idx < RTE_MAX_LCORE; idx++) + cores[idx] = -1; + idx = 0; + + /* Remove all blank characters ahead and after . + * Remove 0x/0X if exists. + */ + while (isblank(*coremask)) + coremask++; + if (coremask[0] == '0' && ((coremask[1] == 'x') + || (coremask[1] == 'X'))) + coremask += 2; + i = strlen(coremask); + while ((i > 0) && isblank(coremask[i - 1])) + i--; + if (i == 0) + return -1; + + for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE; i--) { + c = coremask[i]; + if (isxdigit(c) == 0) { + /* invalid characters */ + return -1; + } + val = xdigit2val(c); + for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE; j++, idx++) + { + if ((1 << j) & val) { + cores[idx] = count; + count++; + } + } + } + for (; i >= 0; i--) + if (coremask[i] != '0') + return -1; + if (count == 0) + return -1; + return 0; +} + +static int +eal_parse_service_corelist(const char *corelist) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + int i, idx = 0; + unsigned count = 0; + char *end = NULL; + int min, max; + uint32_t taken_lcore_count = 0; + + if (corelist == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*corelist)) + corelist++; + i = strlen(corelist); + while ((i > 0) && isblank(corelist[i - 1])) + i--; + + /* Get list of cores */ + min = RTE_MAX_LCORE; + do { + while (isblank(*corelist)) + corelist++; + if (*corelist == '\0') + return -1; + errno = 0; + idx = strtoul(corelist, &end, 10); + if (errno || end == NULL) + return -1; + while (isblank(*end)) + end++; + if (*end == '-') { + min = idx; + } else if ((*end == ',') || (*end == '\0')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = min; idx <= max; idx++) { + if (cfg->lcore_role[idx] != ROLE_SERVICE) { + /* handle master lcore already parsed */ + uint32_t lcore = idx; + if (cfg->master_lcore == lcore && + master_lcore_parsed) { + RTE_LOG(ERR, EAL, + "Error: lcore %u is master lcore, cannot use as service core\n", + idx); + return -1; + } + if (cfg->lcore_role[idx] == ROLE_RTE) + taken_lcore_count++; + + lcore_config[idx].core_role = + ROLE_SERVICE; + count++; + } + } + min = RTE_MAX_LCORE; + } else + return -1; + corelist = end + 1; + } while (*end != '\0'); + + if (count == 0) + return -1; + + if (core_parsed && taken_lcore_count != count) { + RTE_LOG(WARNING, EAL, + "Not all service cores were in the coremask. 
" + "Please ensure -c or -l includes service cores\n"); + } + + return 0; +} + +static int +eal_parse_corelist(const char *corelist, int *cores) +{ + unsigned count = 0; + char *end = NULL; + int min, max; + int idx; + + for (idx = 0; idx < RTE_MAX_LCORE; idx++) + cores[idx] = -1; + + /* Remove all blank characters ahead */ + while (isblank(*corelist)) + corelist++; + + /* Get list of cores */ + min = RTE_MAX_LCORE; + do { + while (isblank(*corelist)) + corelist++; + if (*corelist == '\0') + return -1; + errno = 0; + idx = strtol(corelist, &end, 10); + if (errno || end == NULL) + return -1; + if (idx < 0 || idx >= RTE_MAX_LCORE) + return -1; + while (isblank(*end)) + end++; + if (*end == '-') { + min = idx; + } else if ((*end == ',') || (*end == '\0')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = min; idx <= max; idx++) { + if (cores[idx] == -1) { + cores[idx] = count; + count++; + } + } + min = RTE_MAX_LCORE; + } else + return -1; + corelist = end + 1; + } while (*end != '\0'); + + if (count == 0) + return -1; + return 0; +} + +/* Changes the lcore id of the master thread */ +static int +eal_parse_master_lcore(const char *arg) +{ + char *parsing_end; + struct rte_config *cfg = rte_eal_get_configuration(); + + errno = 0; + cfg->master_lcore = (uint32_t) strtol(arg, &parsing_end, 0); + if (errno || parsing_end[0] != 0) + return -1; + if (cfg->master_lcore >= RTE_MAX_LCORE) + return -1; + master_lcore_parsed = 1; + + /* ensure master core is not used as service core */ + if (lcore_config[cfg->master_lcore].core_role == ROLE_SERVICE) { + RTE_LOG(ERR, EAL, + "Error: Master lcore is used as a service core\n"); + return -1; + } + + return 0; +} + +/* + * Parse elem, the elem could be single number/range or '(' ')' group + * 1) A single number elem, it's just a simple digit. e.g. 9 + * 2) A single range elem, two digits with a '-' between. e.g. 2-6 + * 3) A group elem, combines multiple 1) or 2) with '( )'. e.g (0,2-4,6) + * Within group elem, '-' used for a range separator; + * ',' used for a single number. 
+ */ +static int +eal_parse_set(const char *input, rte_cpuset_t *set) +{ + unsigned idx; + const char *str = input; + char *end = NULL; + unsigned min, max; + + CPU_ZERO(set); + + while (isblank(*str)) + str++; + + /* only digit or left bracket is qualify for start point */ + if ((!isdigit(*str) && *str != '(') || *str == '\0') + return -1; + + /* process single number or single range of number */ + if (*str != '(') { + errno = 0; + idx = strtoul(str, &end, 10); + if (errno || end == NULL || idx >= CPU_SETSIZE) + return -1; + else { + while (isblank(*end)) + end++; + + min = idx; + max = idx; + if (*end == '-') { + /* process single <number>-<number> */ + end++; + while (isblank(*end)) + end++; + if (!isdigit(*end)) + return -1; + + errno = 0; + idx = strtoul(end, &end, 10); + if (errno || end == NULL || idx >= CPU_SETSIZE) + return -1; + max = idx; + while (isblank(*end)) + end++; + if (*end != ',' && *end != '\0') + return -1; + } + + if (*end != ',' && *end != '\0' && + *end != '@') + return -1; + + for (idx = RTE_MIN(min, max); + idx <= RTE_MAX(min, max); idx++) + CPU_SET(idx, set); + + return end - input; + } + } + + /* process set within bracket */ + str++; + while (isblank(*str)) + str++; + if (*str == '\0') + return -1; + + min = RTE_MAX_LCORE; + do { + + /* go ahead to the first digit */ + while (isblank(*str)) + str++; + if (!isdigit(*str)) + return -1; + + /* get the digit value */ + errno = 0; + idx = strtoul(str, &end, 10); + if (errno || end == NULL || idx >= CPU_SETSIZE) + return -1; + + /* go ahead to separator '-',',' and ')' */ + while (isblank(*end)) + end++; + if (*end == '-') { + if (min == RTE_MAX_LCORE) + min = idx; + else /* avoid continuous '-' */ + return -1; + } else if ((*end == ',') || (*end == ')')) { + max = idx; + if (min == RTE_MAX_LCORE) + min = idx; + for (idx = RTE_MIN(min, max); + idx <= RTE_MAX(min, max); idx++) + CPU_SET(idx, set); + + min = RTE_MAX_LCORE; + } else + return -1; + + str = end + 1; + } while (*end != '\0' && *end != ')'); + + /* + * to avoid failure that tail blank makes end character check fail + * in eal_parse_lcores( ) + */ + while (isblank(*str)) + str++; + + return str - input; +} + +static int +check_cpuset(rte_cpuset_t *set) +{ + unsigned int idx; + + for (idx = 0; idx < CPU_SETSIZE; idx++) { + if (!CPU_ISSET(idx, set)) + continue; + + if (eal_cpu_detected(idx) == 0) { + RTE_LOG(ERR, EAL, "core %u " + "unavailable\n", idx); + return -1; + } + } + return 0; +} + +/* + * The format pattern: --lcores='<lcores[@cpus]>[<,lcores[@cpus]>...]' + * lcores, cpus could be a single digit/range or a group. + * '(' and ')' are necessary if it's a group. + * If not supply '@cpus', the value of cpus uses the same as lcores. + * e.g. 
'1,2@(5-7),(3-5)@(0,2),(0,6),7-8' means start 9 EAL thread as below + * lcore 0 runs on cpuset 0x41 (cpu 0,6) + * lcore 1 runs on cpuset 0x2 (cpu 1) + * lcore 2 runs on cpuset 0xe0 (cpu 5,6,7) + * lcore 3,4,5 runs on cpuset 0x5 (cpu 0,2) + * lcore 6 runs on cpuset 0x41 (cpu 0,6) + * lcore 7 runs on cpuset 0x80 (cpu 7) + * lcore 8 runs on cpuset 0x100 (cpu 8) + */ +static int +eal_parse_lcores(const char *lcores) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + rte_cpuset_t lcore_set; + unsigned int set_count; + unsigned idx = 0; + unsigned count = 0; + const char *lcore_start = NULL; + const char *end = NULL; + int offset; + rte_cpuset_t cpuset; + int lflags; + int ret = -1; + + if (lcores == NULL) + return -1; + + /* Remove all blank characters ahead and after */ + while (isblank(*lcores)) + lcores++; + + CPU_ZERO(&cpuset); + + /* Reset lcore config */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + cfg->lcore_role[idx] = ROLE_OFF; + lcore_config[idx].core_index = -1; + CPU_ZERO(&lcore_config[idx].cpuset); + } + + /* Get list of cores */ + do { + while (isblank(*lcores)) + lcores++; + if (*lcores == '\0') + goto err; + + lflags = 0; + + /* record lcore_set start point */ + lcore_start = lcores; + + /* go across a complete bracket */ + if (*lcore_start == '(') { + lcores += strcspn(lcores, ")"); + if (*lcores++ == '\0') + goto err; + } + + /* scan the separator '@', ','(next) or '\0'(finish) */ + lcores += strcspn(lcores, "@,"); + + if (*lcores == '@') { + /* explicit assign cpuset and update the end cursor */ + offset = eal_parse_set(lcores + 1, &cpuset); + if (offset < 0) + goto err; + end = lcores + 1 + offset; + } else { /* ',' or '\0' */ + /* haven't given cpuset, current loop done */ + end = lcores; + + /* go back to check <number>-<number> */ + offset = strcspn(lcore_start, "(-"); + if (offset < (end - lcore_start) && + *(lcore_start + offset) != '(') + lflags = 1; + } + + if (*end != ',' && *end != '\0') + goto err; + + /* parse lcore_set from start point */ + if (eal_parse_set(lcore_start, &lcore_set) < 0) + goto err; + + /* without '@', by default using lcore_set as cpuset */ + if (*lcores != '@') + rte_memcpy(&cpuset, &lcore_set, sizeof(cpuset)); + + set_count = CPU_COUNT(&lcore_set); + /* start to update lcore_set */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (!CPU_ISSET(idx, &lcore_set)) + continue; + set_count--; + + if (cfg->lcore_role[idx] != ROLE_RTE) { + lcore_config[idx].core_index = count; + cfg->lcore_role[idx] = ROLE_RTE; + count++; + } + + if (lflags) { + CPU_ZERO(&cpuset); + CPU_SET(idx, &cpuset); + } + + if (check_cpuset(&cpuset) < 0) + goto err; + rte_memcpy(&lcore_config[idx].cpuset, &cpuset, + sizeof(rte_cpuset_t)); + } + + /* some cores from the lcore_set can't be handled by EAL */ + if (set_count != 0) + goto err; + + lcores = end + 1; + } while (*end != '\0'); + + if (count == 0) + goto err; + + cfg->lcore_count = count; + ret = 0; + +err: + + return ret; +} + +#ifndef RTE_EXEC_ENV_WINDOWS +static int +eal_parse_syslog(const char *facility, struct internal_config *conf) +{ + int i; + static const struct { + const char *name; + int value; + } map[] = { + { "auth", LOG_AUTH }, + { "cron", LOG_CRON }, + { "daemon", LOG_DAEMON }, + { "ftp", LOG_FTP }, + { "kern", LOG_KERN }, + { "lpr", LOG_LPR }, + { "mail", LOG_MAIL }, + { "news", LOG_NEWS }, + { "syslog", LOG_SYSLOG }, + { "user", LOG_USER }, + { "uucp", LOG_UUCP }, + { "local0", LOG_LOCAL0 }, + { "local1", LOG_LOCAL1 }, + { "local2", LOG_LOCAL2 }, + { "local3", LOG_LOCAL3 }, + { "local4", 
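+/* [Editor's note] Illustrative logging options, not upstream code. This
+ * table backs "--syslog" (e.g. "--syslog local4" tags EAL syslog output
+ * with LOG_LOCAL4), and eal_parse_log_level() further below accepts:
+ *
+ *   --log-level=8               // numeric: RTE_LOG_DEBUG, set globally
+ *   --log-level=debug           // the same level, by name
+ *   --log-level=lib.eal:info    // "pattern:level" (glob-style match)
+ *   --log-level=pmd.*,notice    // "regex,level" (POSIX regex match)
+ */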
LOG_LOCAL4 }, + { "local5", LOG_LOCAL5 }, + { "local6", LOG_LOCAL6 }, + { "local7", LOG_LOCAL7 }, + { NULL, 0 } + }; + + for (i = 0; map[i].name; i++) { + if (!strcmp(facility, map[i].name)) { + conf->syslog_facility = map[i].value; + return 0; + } + } + return -1; +} +#endif + +static int +eal_parse_log_priority(const char *level) +{ + static const char * const levels[] = { + [RTE_LOG_EMERG] = "emergency", + [RTE_LOG_ALERT] = "alert", + [RTE_LOG_CRIT] = "critical", + [RTE_LOG_ERR] = "error", + [RTE_LOG_WARNING] = "warning", + [RTE_LOG_NOTICE] = "notice", + [RTE_LOG_INFO] = "info", + [RTE_LOG_DEBUG] = "debug", + }; + size_t len = strlen(level); + unsigned long tmp; + char *end; + unsigned int i; + + if (len == 0) + return -1; + + /* look for named values, skip 0 which is not a valid level */ + for (i = 1; i < RTE_DIM(levels); i++) { + if (strncmp(levels[i], level, len) == 0) + return i; + } + + /* not a string, maybe it is numeric */ + errno = 0; + tmp = strtoul(level, &end, 0); + + /* check for errors */ + if (errno != 0 || end == NULL || *end != '\0' || + tmp >= UINT32_MAX) + return -1; + + return tmp; +} + +static int +eal_parse_log_level(const char *arg) +{ + const char *pattern = NULL; + const char *regex = NULL; + char *str, *level; + int priority; + + str = strdup(arg); + if (str == NULL) + return -1; + + if ((level = strchr(str, ','))) { + regex = str; + *level++ = '\0'; + } else if ((level = strchr(str, ':'))) { + pattern = str; + *level++ = '\0'; + } else { + level = str; + } + + priority = eal_parse_log_priority(level); + if (priority < 0) { + fprintf(stderr, "invalid log priority: %s\n", level); + goto fail; + } + + if (regex) { + if (rte_log_set_level_regexp(regex, priority) < 0) { + fprintf(stderr, "cannot set log level %s,%d\n", + regex, priority); + goto fail; + } + if (rte_log_save_regexp(regex, priority) < 0) + goto fail; + } else if (pattern) { + if (rte_log_set_level_pattern(pattern, priority) < 0) { + fprintf(stderr, "cannot set log level %s:%d\n", + pattern, priority); + goto fail; + } + if (rte_log_save_pattern(pattern, priority) < 0) + goto fail; + } else { + rte_log_set_global_level(priority); + } + + free(str); + return 0; + +fail: + free(str); + return -1; +} + +static enum rte_proc_type_t +eal_parse_proc_type(const char *arg) +{ + if (strncasecmp(arg, "primary", sizeof("primary")) == 0) + return RTE_PROC_PRIMARY; + if (strncasecmp(arg, "secondary", sizeof("secondary")) == 0) + return RTE_PROC_SECONDARY; + if (strncasecmp(arg, "auto", sizeof("auto")) == 0) + return RTE_PROC_AUTO; + + return RTE_PROC_INVALID; +} + +static int +eal_parse_iova_mode(const char *name) +{ + int mode; + + if (name == NULL) + return -1; + + if (!strcmp("pa", name)) + mode = RTE_IOVA_PA; + else if (!strcmp("va", name)) + mode = RTE_IOVA_VA; + else + return -1; + + internal_config.iova_mode = mode; + return 0; +} + +static int +eal_parse_base_virtaddr(const char *arg) +{ + char *end; + uint64_t addr; + + errno = 0; + addr = strtoull(arg, &end, 16); + + /* check for errors */ + if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) + return -1; + + /* make sure we don't exceed 32-bit boundary on 32-bit target */ +#ifndef RTE_ARCH_64 + if (addr >= UINTPTR_MAX) + return -1; +#endif + + /* align the addr on 16M boundary, 16MB is the minimum huge page + * size on IBM Power architecture. If the addr is aligned to 16MB, + * it can align to 2MB for x86. So this alignment can also be used + * on x86 and other architectures. 
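+ *
+ * [Editor's note] Illustrative values, not part of the upstream comment:
+ * "--base-virtaddr=0x4200000000" is kept as-is (already 16M-aligned),
+ * while "--base-virtaddr=0x4200100000" is rounded up to 0x4201000000.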
+ */ + internal_config.base_virtaddr = + RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); + + return 0; +} + +/* caller is responsible for freeing the returned string */ +static char * +available_cores(void) +{ + char *str = NULL; + int previous; + int sequence; + char *tmp; + int idx; + + /* find the first available cpu */ + for (idx = 0; idx < RTE_MAX_LCORE; idx++) { + if (eal_cpu_detected(idx) == 0) + continue; + break; + } + if (idx >= RTE_MAX_LCORE) + return NULL; + + /* first sequence */ + if (asprintf(&str, "%d", idx) < 0) + return NULL; + previous = idx; + sequence = 0; + + for (idx++ ; idx < RTE_MAX_LCORE; idx++) { + if (eal_cpu_detected(idx) == 0) + continue; + + if (idx == previous + 1) { + previous = idx; + sequence = 1; + continue; + } + + /* finish current sequence */ + if (sequence) { + if (asprintf(&tmp, "%s-%d", str, previous) < 0) { + free(str); + return NULL; + } + free(str); + str = tmp; + } + + /* new sequence */ + if (asprintf(&tmp, "%s,%d", str, idx) < 0) { + free(str); + return NULL; + } + free(str); + str = tmp; + previous = idx; + sequence = 0; + } + + /* finish last sequence */ + if (sequence) { + if (asprintf(&tmp, "%s-%d", str, previous) < 0) { + free(str); + return NULL; + } + free(str); + str = tmp; + } + + return str; +} + +int +eal_parse_common_option(int opt, const char *optarg, + struct internal_config *conf) +{ + static int b_used; + static int w_used; + + switch (opt) { + /* blacklist */ + case 'b': + if (w_used) + goto bw_used; + if (eal_option_device_add(RTE_DEVTYPE_BLACKLISTED_PCI, + optarg) < 0) { + return -1; + } + b_used = 1; + break; + /* whitelist */ + case 'w': + if (b_used) + goto bw_used; + if (eal_option_device_add(RTE_DEVTYPE_WHITELISTED_PCI, + optarg) < 0) { + return -1; + } + w_used = 1; + break; + /* coremask */ + case 'c': { + int lcore_indexes[RTE_MAX_LCORE]; + + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. Please ensure -c is before -s or -S\n"); + if (eal_parse_coremask(optarg, lcore_indexes) < 0) { + RTE_LOG(ERR, EAL, "invalid coremask syntax\n"); + return -1; + } + if (update_lcore_config(lcore_indexes) < 0) { + char *available = available_cores(); + + RTE_LOG(ERR, EAL, + "invalid coremask, please check specified cores are part of %s\n", + available); + free(available); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option -c is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_LST) ? "-l" : + (core_parsed == LCORE_OPT_MAP) ? "--lcore" : + "-c"); + return -1; + } + + core_parsed = LCORE_OPT_MSK; + break; + } + /* corelist */ + case 'l': { + int lcore_indexes[RTE_MAX_LCORE]; + + if (eal_service_cores_parsed()) + RTE_LOG(WARNING, EAL, + "Service cores parsed before dataplane cores. Please ensure -l is before -s or -S\n"); + + if (eal_parse_corelist(optarg, lcore_indexes) < 0) { + RTE_LOG(ERR, EAL, "invalid core list syntax\n"); + return -1; + } + if (update_lcore_config(lcore_indexes) < 0) { + char *available = available_cores(); + + RTE_LOG(ERR, EAL, + "invalid core list, please check specified cores are part of %s\n", + available); + free(available); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option -l is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_MSK) ? "-c" : + (core_parsed == LCORE_OPT_MAP) ? 
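+/* [Editor's note] Equivalent selections under the -c and -l parsers above;
+ * illustrative only, not upstream code:
+ *
+ *   -c 0x2c     // coremask: hex bits 2, 3 and 5 set -> lcores 2, 3, 5
+ *   -l 2-3,5    // corelist: the same three lcores, by index
+ *
+ * Both assign core_index 0, 1, 2 in ascending lcore order, and -c, -l and
+ * --lcores are mutually exclusive: the later option is rejected.
+ */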
"--lcore" : + "-l"); + return -1; + } + + core_parsed = LCORE_OPT_LST; + break; + } + /* service coremask */ + case 's': + if (eal_parse_service_coremask(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid service coremask\n"); + return -1; + } + break; + /* service corelist */ + case 'S': + if (eal_parse_service_corelist(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid service core list\n"); + return -1; + } + break; + /* size of memory */ + case 'm': + conf->memory = atoi(optarg); + conf->memory *= 1024ULL; + conf->memory *= 1024ULL; + mem_parsed = 1; + break; + /* force number of channels */ + case 'n': + conf->force_nchannel = atoi(optarg); + if (conf->force_nchannel == 0) { + RTE_LOG(ERR, EAL, "invalid channel number\n"); + return -1; + } + break; + /* force number of ranks */ + case 'r': + conf->force_nrank = atoi(optarg); + if (conf->force_nrank == 0 || + conf->force_nrank > 16) { + RTE_LOG(ERR, EAL, "invalid rank number\n"); + return -1; + } + break; + /* force loading of external driver */ + case 'd': + if (eal_plugin_add(optarg) == -1) + return -1; + break; + case 'v': + /* since message is explicitly requested by user, we + * write message at highest log level so it can always + * be seen + * even if info or warning messages are disabled */ + RTE_LOG(CRIT, EAL, "RTE Version: '%s'\n", rte_version()); + break; + + /* long options */ + case OPT_HUGE_UNLINK_NUM: + conf->hugepage_unlink = 1; + break; + + case OPT_NO_HUGE_NUM: + conf->no_hugetlbfs = 1; + /* no-huge is legacy mem */ + conf->legacy_mem = 1; + break; + + case OPT_NO_PCI_NUM: + conf->no_pci = 1; + break; + + case OPT_NO_HPET_NUM: + conf->no_hpet = 1; + break; + + case OPT_VMWARE_TSC_MAP_NUM: + conf->vmware_tsc_map = 1; + break; + + case OPT_NO_SHCONF_NUM: + conf->no_shconf = 1; + break; + + case OPT_IN_MEMORY_NUM: + conf->in_memory = 1; + /* in-memory is a superset of noshconf and huge-unlink */ + conf->no_shconf = 1; + conf->hugepage_unlink = 1; + break; + + case OPT_PROC_TYPE_NUM: + conf->process_type = eal_parse_proc_type(optarg); + break; + + case OPT_MASTER_LCORE_NUM: + if (eal_parse_master_lcore(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_MASTER_LCORE "\n"); + return -1; + } + break; + + case OPT_VDEV_NUM: + if (eal_option_device_add(RTE_DEVTYPE_VIRTUAL, + optarg) < 0) { + return -1; + } + break; + +#ifndef RTE_EXEC_ENV_WINDOWS + case OPT_SYSLOG_NUM: + if (eal_parse_syslog(optarg, conf) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SYSLOG "\n"); + return -1; + } + break; +#endif + + case OPT_LOG_LEVEL_NUM: { + if (eal_parse_log_level(optarg) < 0) { + RTE_LOG(ERR, EAL, + "invalid parameters for --" + OPT_LOG_LEVEL "\n"); + return -1; + } + break; + } + +#ifndef RTE_EXEC_ENV_WINDOWS + case OPT_TRACE_NUM: { + if (eal_trace_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE "\n"); + return -1; + } + break; + } + + case OPT_TRACE_DIR_NUM: { + if (eal_trace_dir_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE_DIR "\n"); + return -1; + } + break; + } + + case OPT_TRACE_BUF_SIZE_NUM: { + if (eal_trace_bufsz_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE_BUF_SIZE "\n"); + return -1; + } + break; + } + + case OPT_TRACE_MODE_NUM: { + if (eal_trace_mode_args_save(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_TRACE_MODE "\n"); + return -1; + } + break; + } +#endif /* !RTE_EXEC_ENV_WINDOWS */ + + case OPT_LCORES_NUM: + if (eal_parse_lcores(optarg) < 0) { + RTE_LOG(ERR, EAL, 
"invalid parameter for --" + OPT_LCORES "\n"); + return -1; + } + + if (core_parsed) { + RTE_LOG(ERR, EAL, "Option --lcore is ignored, because (%s) is set!\n", + (core_parsed == LCORE_OPT_LST) ? "-l" : + (core_parsed == LCORE_OPT_MSK) ? "-c" : + "--lcore"); + return -1; + } + + core_parsed = LCORE_OPT_MAP; + break; + case OPT_LEGACY_MEM_NUM: + conf->legacy_mem = 1; + break; + case OPT_SINGLE_FILE_SEGMENTS_NUM: + conf->single_file_segments = 1; + break; + case OPT_IOVA_MODE_NUM: + if (eal_parse_iova_mode(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_IOVA_MODE "\n"); + return -1; + } + break; + case OPT_BASE_VIRTADDR_NUM: + if (eal_parse_base_virtaddr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_BASE_VIRTADDR "\n"); + return -1; + } + break; + case OPT_TELEMETRY_NUM: + break; + case OPT_NO_TELEMETRY_NUM: + conf->no_telemetry = 1; + break; + + /* don't know what to do, leave this to caller */ + default: + return 1; + + } + + return 0; +bw_used: + RTE_LOG(ERR, EAL, "Options blacklist (-b) and whitelist (-w) " + "cannot be used at the same time\n"); + return -1; +} + +static void +eal_auto_detect_cores(struct rte_config *cfg) +{ + unsigned int lcore_id; + unsigned int removed = 0; + rte_cpuset_t affinity_set; + + if (pthread_getaffinity_np(pthread_self(), sizeof(rte_cpuset_t), + &affinity_set)) + CPU_ZERO(&affinity_set); + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (cfg->lcore_role[lcore_id] == ROLE_RTE && + !CPU_ISSET(lcore_id, &affinity_set)) { + cfg->lcore_role[lcore_id] = ROLE_OFF; + removed++; + } + } + + cfg->lcore_count -= removed; +} + +static void +compute_ctrl_threads_cpuset(struct internal_config *internal_cfg) +{ + rte_cpuset_t *cpuset = &internal_cfg->ctrl_cpuset; + rte_cpuset_t default_set; + unsigned int lcore_id; + + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { + if (rte_lcore_has_role(lcore_id, ROLE_OFF)) + continue; + RTE_CPU_OR(cpuset, cpuset, &lcore_config[lcore_id].cpuset); + } + RTE_CPU_NOT(cpuset, cpuset); + + if (pthread_getaffinity_np(pthread_self(), sizeof(rte_cpuset_t), + &default_set)) + CPU_ZERO(&default_set); + + RTE_CPU_AND(cpuset, cpuset, &default_set); + + /* if no remaining cpu, use master lcore cpu affinity */ + if (!CPU_COUNT(cpuset)) { + memcpy(cpuset, &lcore_config[rte_get_master_lcore()].cpuset, + sizeof(*cpuset)); + } +} + +int +eal_cleanup_config(struct internal_config *internal_cfg) +{ + if (internal_cfg->hugefile_prefix != NULL) + free(internal_cfg->hugefile_prefix); + if (internal_cfg->hugepage_dir != NULL) + free(internal_cfg->hugepage_dir); + if (internal_cfg->user_mbuf_pool_ops_name != NULL) + free(internal_cfg->user_mbuf_pool_ops_name); + + return 0; +} + +int +eal_adjust_config(struct internal_config *internal_cfg) +{ + int i; + struct rte_config *cfg = rte_eal_get_configuration(); + + if (!core_parsed) + eal_auto_detect_cores(cfg); + + if (internal_config.process_type == RTE_PROC_AUTO) + internal_config.process_type = eal_proc_type_detect(); + + /* default master lcore is the first one */ + if (!master_lcore_parsed) { + cfg->master_lcore = rte_get_next_lcore(-1, 0, 0); + if (cfg->master_lcore >= RTE_MAX_LCORE) + return -1; + lcore_config[cfg->master_lcore].core_role = ROLE_RTE; + } + + compute_ctrl_threads_cpuset(internal_cfg); + + /* if no memory amounts were requested, this will result in 0 and + * will be overridden later, right after eal_hugepage_info_init() */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + internal_cfg->memory += internal_cfg->socket_mem[i]; + 
+ return 0; +} + +int +eal_check_common_options(struct internal_config *internal_cfg) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (cfg->lcore_role[cfg->master_lcore] != ROLE_RTE) { + RTE_LOG(ERR, EAL, "Master lcore is not enabled for DPDK\n"); + return -1; + } + + if (internal_cfg->process_type == RTE_PROC_INVALID) { + RTE_LOG(ERR, EAL, "Invalid process type specified\n"); + return -1; + } + if (internal_cfg->hugefile_prefix != NULL && + strlen(internal_cfg->hugefile_prefix) < 1) { + RTE_LOG(ERR, EAL, "Invalid length of --" OPT_FILE_PREFIX " option\n"); + return -1; + } + if (internal_cfg->hugepage_dir != NULL && + strlen(internal_cfg->hugepage_dir) < 1) { + RTE_LOG(ERR, EAL, "Invalid length of --" OPT_HUGE_DIR" option\n"); + return -1; + } + if (internal_cfg->user_mbuf_pool_ops_name != NULL && + strlen(internal_cfg->user_mbuf_pool_ops_name) < 1) { + RTE_LOG(ERR, EAL, "Invalid length of --" OPT_MBUF_POOL_OPS_NAME" option\n"); + return -1; + } + if (index(eal_get_hugefile_prefix(), '%') != NULL) { + RTE_LOG(ERR, EAL, "Invalid char, '%%', in --"OPT_FILE_PREFIX" " + "option\n"); + return -1; + } + if (mem_parsed && internal_cfg->force_sockets == 1) { + RTE_LOG(ERR, EAL, "Options -m and --"OPT_SOCKET_MEM" cannot " + "be specified at the same time\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->force_sockets == 1) { + RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_MEM" cannot " + "be specified together with --"OPT_NO_HUGE"\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_HUGE_UNLINK" cannot " + "be specified together with --"OPT_NO_HUGE"\n"); + return -1; + } + if (internal_config.force_socket_limits && internal_config.legacy_mem) { + RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_LIMIT + " is only supported in non-legacy memory mode\n"); + } + if (internal_cfg->single_file_segments && + internal_cfg->hugepage_unlink && + !internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is " + "not compatible with --"OPT_HUGE_UNLINK"\n"); + return -1; + } + if (internal_cfg->legacy_mem && + internal_cfg->in_memory) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_IN_MEMORY"\n"); + return -1; + } + if (internal_cfg->legacy_mem && internal_cfg->match_allocations) { + RTE_LOG(ERR, EAL, "Option --"OPT_LEGACY_MEM" is not compatible " + "with --"OPT_MATCH_ALLOCATIONS"\n"); + return -1; + } + if (internal_cfg->no_hugetlbfs && internal_cfg->match_allocations) { + RTE_LOG(ERR, EAL, "Option --"OPT_NO_HUGE" is not compatible " + "with --"OPT_MATCH_ALLOCATIONS"\n"); + return -1; + } + if (internal_cfg->legacy_mem && internal_cfg->memory == 0) { + RTE_LOG(NOTICE, EAL, "Static memory layout is selected, " + "amount of reserved memory can be adjusted with " + "-m or --"OPT_SOCKET_MEM"\n"); + } + + return 0; +} + +void +eal_common_usage(void) +{ + printf("[options]\n\n" + "EAL common options:\n" + " -c COREMASK Hexadecimal bitmask of cores to run on\n" + " -l CORELIST List of cores to run on\n" + " The argument format is <c1>[-c2][,c3[-c4],...]\n" + " where c1, c2, etc are core indexes between 0 and %d\n" + " --"OPT_LCORES" COREMAP Map lcore set to physical cpu set\n" + " The argument format is\n" + " '<lcores[@cpus]>[<,lcores[@cpus]>...]'\n" + " lcores and cpus list are grouped by '(' and ')'\n" + " Within the group, '-' is used for range separator,\n" + " ',' is used for single number separator.\n" + " '( )' can be 
omitted for single element group,\n" + " '@' can be omitted if cpus and lcores have the same value\n" + " -s SERVICE COREMASK Hexadecimal bitmask of cores to be used as service cores\n" + " --"OPT_MASTER_LCORE" ID Core ID that is used as master\n" + " --"OPT_MBUF_POOL_OPS_NAME" Pool ops name for mbuf to use\n" + " -n CHANNELS Number of memory channels\n" + " -m MB Memory to allocate (see also --"OPT_SOCKET_MEM")\n" + " -r RANKS Force number of memory ranks (don't detect)\n" + " -b, --"OPT_PCI_BLACKLIST" Add a PCI device in black list.\n" + " Prevent EAL from using this PCI device. The argument\n" + " format is <domain:bus:devid.func>.\n" + " -w, --"OPT_PCI_WHITELIST" Add a PCI device in white list.\n" + " Only use the specified PCI devices. The argument format\n" + " is <[domain:]bus:devid.func>. This option can be present\n" + " several times (once per device).\n" + " [NOTE: PCI whitelist cannot be used with -b option]\n" + " --"OPT_VDEV" Add a virtual device.\n" + " The argument format is <driver><id>[,key=val,...]\n" + " (ex: --vdev=net_pcap0,iface=eth2).\n" + " --"OPT_IOVA_MODE" Set IOVA mode. 'pa' for IOVA_PA\n" + " 'va' for IOVA_VA\n" + " -d LIB.so|DIR Add a driver or driver directory\n" + " (can be used multiple times)\n" + " --"OPT_VMWARE_TSC_MAP" Use VMware TSC map instead of native RDTSC\n" + " --"OPT_PROC_TYPE" Type of this process (primary|secondary|auto)\n" +#ifndef RTE_EXEC_ENV_WINDOWS + " --"OPT_SYSLOG" Set syslog facility\n" +#endif + " --"OPT_LOG_LEVEL"=<int> Set global log level\n" + " --"OPT_LOG_LEVEL"=<type-match>:<int>\n" + " Set specific log level\n" +#ifndef RTE_EXEC_ENV_WINDOWS + " --"OPT_TRACE"=<regex-match>\n" + " Enable trace based on regular expression trace name.\n" + " By default, the trace is disabled.\n" + " User must specify this option to enable trace.\n" + " --"OPT_TRACE_DIR"=<directory path>\n" + " Specify trace directory for trace output.\n" + " By default, trace output will created at\n" + " $HOME directory and parameter must be\n" + " specified once only.\n" + " --"OPT_TRACE_BUF_SIZE"=<int>\n" + " Specify maximum size of allocated memory\n" + " for trace output for each thread. Valid\n" + " unit can be either 'B|K|M' for 'Bytes',\n" + " 'KBytes' and 'MBytes' respectively.\n" + " Default is 1MB and parameter must be\n" + " specified once only.\n" + " --"OPT_TRACE_MODE"=<o[verwrite] | d[iscard]>\n" + " Specify the mode of update of trace\n" + " output file. Either update on a file can\n" + " be wrapped or discarded when file size\n" + " reaches its maximum limit.\n" + " Default mode is 'overwrite' and parameter\n" + " must be specified once only.\n" +#endif /* !RTE_EXEC_ENV_WINDOWS */ + " -v Display version information on startup\n" + " -h, --help This help\n" + " --"OPT_IN_MEMORY" Operate entirely in memory. 
This will\n" + " disable secondary process support\n" + " --"OPT_BASE_VIRTADDR" Base virtual address\n" + " --"OPT_TELEMETRY" Enable telemetry support (on by default)\n" + " --"OPT_NO_TELEMETRY" Disable telemetry support\n" + "\nEAL options for DEBUG use only:\n" + " --"OPT_HUGE_UNLINK" Unlink hugepage files after init\n" + " --"OPT_NO_HUGE" Use malloc instead of hugetlbfs\n" + " --"OPT_NO_PCI" Disable PCI\n" + " --"OPT_NO_HPET" Disable HPET\n" + " --"OPT_NO_SHCONF" No shared config (mmap'd files)\n" + "\n", RTE_MAX_LCORE); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c new file mode 100644 index 000000000..935e8fefe --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_proc.c @@ -0,0 +1,1217 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2016-2018 Intel Corporation + */ + +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <fnmatch.h> +#include <inttypes.h> +#include <libgen.h> +#include <limits.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/file.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <unistd.h> + +#include <rte_alarm.h> +#include <rte_common.h> +#include <rte_cycles.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_tailq.h> + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" + +static int mp_fd = -1; +static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */ +static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */ +static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER; +static char peer_name[PATH_MAX]; + +struct action_entry { + TAILQ_ENTRY(action_entry) next; + char action_name[RTE_MP_MAX_NAME_LEN]; + rte_mp_t action; +}; + +/** Double linked list of actions. 
*/ +TAILQ_HEAD(action_entry_list, action_entry); + +static struct action_entry_list action_entry_list = + TAILQ_HEAD_INITIALIZER(action_entry_list); + +enum mp_type { + MP_MSG, /* Share message with peers, will not block */ + MP_REQ, /* Request for information, Will block for a reply */ + MP_REP, /* Response to previously-received request */ + MP_IGN, /* Response telling requester to ignore this response */ +}; + +struct mp_msg_internal { + int type; + struct rte_mp_msg msg; +}; + +struct async_request_param { + rte_mp_async_reply_t clb; + struct rte_mp_reply user_reply; + struct timespec end; + int n_responses_processed; +}; + +struct pending_request { + TAILQ_ENTRY(pending_request) next; + enum { + REQUEST_TYPE_SYNC, + REQUEST_TYPE_ASYNC + } type; + char dst[PATH_MAX]; + struct rte_mp_msg *request; + struct rte_mp_msg *reply; + int reply_received; + RTE_STD_C11 + union { + struct { + struct async_request_param *param; + } async; + struct { + pthread_cond_t cond; + } sync; + }; +}; + +TAILQ_HEAD(pending_request_list, pending_request); + +static struct { + struct pending_request_list requests; + pthread_mutex_t lock; +} pending_requests = { + .requests = TAILQ_HEAD_INITIALIZER(pending_requests.requests), + .lock = PTHREAD_MUTEX_INITIALIZER, + /**< used in async requests only */ +}; + +/* forward declarations */ +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type); + +/* for use with alarm callback */ +static void +async_reply_handle(void *arg); + +/* for use with process_msg */ +static struct pending_request * +async_reply_handle_thread_unsafe(void *arg); + +static void +trigger_async_action(struct pending_request *req); + +static struct pending_request * +find_pending_request(const char *dst, const char *act_name) +{ + struct pending_request *r; + + TAILQ_FOREACH(r, &pending_requests.requests, next) { + if (!strcmp(r->dst, dst) && + !strcmp(r->request->name, act_name)) + break; + } + + return r; +} + +static void +create_socket_path(const char *name, char *buf, int len) +{ + const char *prefix = eal_mp_socket_path(); + + if (strlen(name) > 0) + snprintf(buf, len, "%s_%s", prefix, name); + else + strlcpy(buf, prefix, len); +} + +int +rte_eal_primary_proc_alive(const char *config_file_path) +{ + int config_fd; + + if (config_file_path) + config_fd = open(config_file_path, O_RDONLY); + else { + const char *path; + + path = eal_runtime_config_path(); + config_fd = open(path, O_RDONLY); + } + if (config_fd < 0) + return 0; + + int ret = lockf(config_fd, F_TEST, 0); + close(config_fd); + + return !!ret; +} + +static struct action_entry * +find_action_entry_by_name(const char *name) +{ + struct action_entry *entry; + + TAILQ_FOREACH(entry, &action_entry_list, next) { + if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0) + break; + } + + return entry; +} + +static int +validate_action_name(const char *name) +{ + if (name == NULL) { + RTE_LOG(ERR, EAL, "Action name cannot be NULL\n"); + rte_errno = EINVAL; + return -1; + } + if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) { + RTE_LOG(ERR, EAL, "Length of action name is zero\n"); + rte_errno = EINVAL; + return -1; + } + if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) { + rte_errno = E2BIG; + return -1; + } + return 0; +} + +int +rte_mp_action_register(const char *name, rte_mp_t action) +{ + struct action_entry *entry; + + if (validate_action_name(name) != 0) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + 
return -1; + } + + entry = malloc(sizeof(struct action_entry)); + if (entry == NULL) { + rte_errno = ENOMEM; + return -1; + } + strlcpy(entry->action_name, name, sizeof(entry->action_name)); + entry->action = action; + + pthread_mutex_lock(&mp_mutex_action); + if (find_action_entry_by_name(name) != NULL) { + pthread_mutex_unlock(&mp_mutex_action); + rte_errno = EEXIST; + free(entry); + return -1; + } + TAILQ_INSERT_TAIL(&action_entry_list, entry, next); + pthread_mutex_unlock(&mp_mutex_action); + return 0; +} + +void +rte_mp_action_unregister(const char *name) +{ + struct action_entry *entry; + + if (validate_action_name(name) != 0) + return; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return; + } + + pthread_mutex_lock(&mp_mutex_action); + entry = find_action_entry_by_name(name); + if (entry == NULL) { + pthread_mutex_unlock(&mp_mutex_action); + return; + } + TAILQ_REMOVE(&action_entry_list, entry, next); + pthread_mutex_unlock(&mp_mutex_action); + free(entry); +} + +static int +read_msg(struct mp_msg_internal *m, struct sockaddr_un *s) +{ + int msglen; + struct iovec iov; + struct msghdr msgh; + char control[CMSG_SPACE(sizeof(m->msg.fds))]; + struct cmsghdr *cmsg; + int buflen = sizeof(*m) - sizeof(m->msg.fds); + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = m; + iov.iov_len = buflen; + + msgh.msg_name = s; + msgh.msg_namelen = sizeof(*s); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + msglen = recvmsg(mp_fd, &msgh, 0); + if (msglen < 0) { + RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno)); + return -1; + } + + if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + RTE_LOG(ERR, EAL, "truncated msg\n"); + return -1; + } + + /* read auxiliary FDs if any */ + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds)); + break; + } + } + /* sanity-check the response */ + if (m->msg.num_fds < 0 || m->msg.num_fds > RTE_MP_MAX_FD_NUM) { + RTE_LOG(ERR, EAL, "invalid number of fd's received\n"); + return -1; + } + if (m->msg.len_param < 0 || m->msg.len_param > RTE_MP_MAX_PARAM_LEN) { + RTE_LOG(ERR, EAL, "invalid received data length\n"); + return -1; + } + return 0; +} + +static void +process_msg(struct mp_msg_internal *m, struct sockaddr_un *s) +{ + struct pending_request *pending_req; + struct action_entry *entry; + struct rte_mp_msg *msg = &m->msg; + rte_mp_t action = NULL; + + RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name); + + if (m->type == MP_REP || m->type == MP_IGN) { + struct pending_request *req = NULL; + + pthread_mutex_lock(&pending_requests.lock); + pending_req = find_pending_request(s->sun_path, msg->name); + if (pending_req) { + memcpy(pending_req->reply, msg, sizeof(*msg)); + /* -1 indicates that we've been asked to ignore */ + pending_req->reply_received = + m->type == MP_REP ? 
1 : -1; + + if (pending_req->type == REQUEST_TYPE_SYNC) + pthread_cond_signal(&pending_req->sync.cond); + else if (pending_req->type == REQUEST_TYPE_ASYNC) + req = async_reply_handle_thread_unsafe( + pending_req); + } else + RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name); + pthread_mutex_unlock(&pending_requests.lock); + + if (req != NULL) + trigger_async_action(req); + return; + } + + pthread_mutex_lock(&mp_mutex_action); + entry = find_action_entry_by_name(msg->name); + if (entry != NULL) + action = entry->action; + pthread_mutex_unlock(&mp_mutex_action); + + if (!action) { + if (m->type == MP_REQ && !internal_config.init_complete) { + /* if this is a request, and init is not yet complete, + * and callback wasn't registered, we should tell the + * requester to ignore our existence because we're not + * yet ready to process this request. + */ + struct rte_mp_msg dummy; + + memset(&dummy, 0, sizeof(dummy)); + strlcpy(dummy.name, msg->name, sizeof(dummy.name)); + mp_send(&dummy, s->sun_path, MP_IGN); + } else { + RTE_LOG(ERR, EAL, "Cannot find action: %s\n", + msg->name); + } + } else if (action(msg, s->sun_path) < 0) { + RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name); + } +} + +static void * +mp_handle(void *arg __rte_unused) +{ + struct mp_msg_internal msg; + struct sockaddr_un sa; + + while (1) { + if (read_msg(&msg, &sa) == 0) + process_msg(&msg, &sa); + } + + return NULL; +} + +static int +timespec_cmp(const struct timespec *a, const struct timespec *b) +{ + if (a->tv_sec < b->tv_sec) + return -1; + if (a->tv_sec > b->tv_sec) + return 1; + if (a->tv_nsec < b->tv_nsec) + return -1; + if (a->tv_nsec > b->tv_nsec) + return 1; + return 0; +} + +enum async_action { + ACTION_FREE, /**< free the action entry, but don't trigger callback */ + ACTION_TRIGGER /**< trigger callback, then free action entry */ +}; + +static enum async_action +process_async_request(struct pending_request *sr, const struct timespec *now) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + bool timeout, last_msg; + + param = sr->async.param; + reply = ¶m->user_reply; + + /* did we timeout? */ + timeout = timespec_cmp(¶m->end, now) <= 0; + + /* if we received a response, adjust relevant data and copy mesasge. */ + if (sr->reply_received == 1 && sr->reply) { + struct rte_mp_msg *msg, *user_msgs, *tmp; + + msg = sr->reply; + user_msgs = reply->msgs; + + tmp = realloc(user_msgs, sizeof(*msg) * + (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + sr->dst, sr->request->name); + /* this entry is going to be removed and its message + * dropped, but we don't want to leak memory, so + * continue. + */ + } else { + user_msgs = tmp; + reply->msgs = user_msgs; + memcpy(&user_msgs[reply->nb_received], + msg, sizeof(*msg)); + reply->nb_received++; + } + + /* mark this request as processed */ + param->n_responses_processed++; + } else if (sr->reply_received == -1) { + /* we were asked to ignore this process */ + reply->nb_sent--; + } else if (timeout) { + /* count it as processed response, but don't increment + * nb_received. + */ + param->n_responses_processed++; + } + + free(sr->reply); + + last_msg = param->n_responses_processed == reply->nb_sent; + + return last_msg ? 
ACTION_TRIGGER : ACTION_FREE; +} + +static void +trigger_async_action(struct pending_request *sr) +{ + struct async_request_param *param; + struct rte_mp_reply *reply; + + param = sr->async.param; + reply = ¶m->user_reply; + + param->clb(sr->request, reply); + + /* clean up */ + free(sr->async.param->user_reply.msgs); + free(sr->async.param); + free(sr->request); + free(sr); +} + +static struct pending_request * +async_reply_handle_thread_unsafe(void *arg) +{ + struct pending_request *req = (struct pending_request *)arg; + enum async_action action; + struct timespec ts_now; + struct timeval now; + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto no_trigger; + } + ts_now.tv_nsec = now.tv_usec * 1000; + ts_now.tv_sec = now.tv_sec; + + action = process_async_request(req, &ts_now); + + TAILQ_REMOVE(&pending_requests.requests, req, next); + + if (rte_eal_alarm_cancel(async_reply_handle, req) < 0) { + /* if we failed to cancel the alarm because it's already in + * progress, don't proceed because otherwise we will end up + * handling the same message twice. + */ + if (rte_errno == EINPROGRESS) { + RTE_LOG(DEBUG, EAL, "Request handling is already in progress\n"); + goto no_trigger; + } + RTE_LOG(ERR, EAL, "Failed to cancel alarm\n"); + } + + if (action == ACTION_TRIGGER) + return req; +no_trigger: + free(req); + return NULL; +} + +static void +async_reply_handle(void *arg) +{ + struct pending_request *req; + + pthread_mutex_lock(&pending_requests.lock); + req = async_reply_handle_thread_unsafe(arg); + pthread_mutex_unlock(&pending_requests.lock); + + if (req != NULL) + trigger_async_action(req); +} + +static int +open_socket_fd(void) +{ + struct sockaddr_un un; + + peer_name[0] = '\0'; + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + snprintf(peer_name, sizeof(peer_name), + "%d_%"PRIx64, getpid(), rte_rdtsc()); + + mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (mp_fd < 0) { + RTE_LOG(ERR, EAL, "failed to create unix socket\n"); + return -1; + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + + create_socket_path(peer_name, un.sun_path, sizeof(un.sun_path)); + + unlink(un.sun_path); /* May still exist since last run */ + + if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + RTE_LOG(ERR, EAL, "failed to bind %s: %s\n", + un.sun_path, strerror(errno)); + close(mp_fd); + return -1; + } + + RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path); + return mp_fd; +} + +static void +close_socket_fd(void) +{ + char path[PATH_MAX]; + + if (mp_fd < 0) + return; + + close(mp_fd); + create_socket_path(peer_name, path, sizeof(path)); + unlink(path); +} + +int +rte_mp_channel_init(void) +{ + char path[PATH_MAX]; + int dir_fd; + pthread_t mp_handle_tid; + + /* in no shared files mode, we do not have secondary processes support, + * so no need to initialize IPC. 
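+ * For reference: the primary binds the bare socket path while each
+ * secondary appends "_<pid>_<rdtsc>" (see open_socket_fd() above), so
+ * the "*" filter built below matches every peer socket in the runtime
+ * directory, e.g. (hypothetical paths) .../dpdk/rte/mp_socket and
+ * .../dpdk/rte/mp_socket_42_1a2b3c.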
+ */ + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC will be disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + /* create filter path */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_filter, basename(path), sizeof(mp_filter)); + + /* path may have been modified, so recreate it */ + create_socket_path("*", path, sizeof(path)); + strlcpy(mp_dir_path, dirname(path), sizeof(mp_dir_path)); + + /* lock the directory */ + dir_fd = open(mp_dir_path, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "failed to open %s: %s\n", + mp_dir_path, strerror(errno)); + return -1; + } + + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "failed to lock %s: %s\n", + mp_dir_path, strerror(errno)); + close(dir_fd); + return -1; + } + + if (open_socket_fd() < 0) { + close(dir_fd); + return -1; + } + + if (rte_ctrl_thread_create(&mp_handle_tid, "rte_mp_handle", + NULL, mp_handle, NULL) < 0) { + RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", + strerror(errno)); + close(mp_fd); + close(dir_fd); + mp_fd = -1; + return -1; + } + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + close(dir_fd); + + return 0; +} + +void +rte_mp_channel_cleanup(void) +{ + close_socket_fd(); +} + +/** + * Return -1, as fail to send message and it's caused by the local side. + * Return 0, as fail to send message and it's caused by the remote side. + * Return 1, as succeed to send message. + * + */ +static int +send_msg(const char *dst_path, struct rte_mp_msg *msg, int type) +{ + int snd; + struct iovec iov; + struct msghdr msgh; + struct cmsghdr *cmsg; + struct sockaddr_un dst; + struct mp_msg_internal m; + int fd_size = msg->num_fds * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + + m.type = type; + memcpy(&m.msg, msg, sizeof(*msg)); + + memset(&dst, 0, sizeof(dst)); + dst.sun_family = AF_UNIX; + strlcpy(dst.sun_path, dst_path, sizeof(dst.sun_path)); + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = &m; + iov.iov_len = sizeof(m) - sizeof(msg->fds); + + msgh.msg_name = &dst; + msgh.msg_namelen = sizeof(dst); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), msg->fds, fd_size); + + do { + snd = sendmsg(mp_fd, &msgh, 0); + } while (snd < 0 && errno == EINTR); + + if (snd < 0) { + rte_errno = errno; + /* Check if it caused by peer process exits */ + if (errno == ECONNREFUSED && + rte_eal_process_type() == RTE_PROC_PRIMARY) { + unlink(dst_path); + return 0; + } + RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n", + dst_path, strerror(errno)); + return -1; + } + + return 1; +} + +static int +mp_send(struct rte_mp_msg *msg, const char *peer, int type) +{ + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + + if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY)) + peer = eal_mp_socket_path(); + + if (peer) { + if (send_msg(peer, msg, type) < 0) + return -1; + else + return 0; + } + + /* broadcast to all secondary processes */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", + mp_dir_path); + rte_errno = errno; + return -1; + } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + 
mp_dir_path); + rte_errno = errno; + closedir(mp_dir); + return -1; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + if (send_msg(path, msg, type) < 0) + ret = -1; + } + /* unlock the dir */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + return ret; +} + +static int +check_input(const struct rte_mp_msg *msg) +{ + if (msg == NULL) { + RTE_LOG(ERR, EAL, "Msg cannot be NULL\n"); + rte_errno = EINVAL; + return -1; + } + + if (validate_action_name(msg->name) != 0) + return -1; + + if (msg->len_param < 0) { + RTE_LOG(ERR, EAL, "Message data length is negative\n"); + rte_errno = EINVAL; + return -1; + } + + if (msg->num_fds < 0) { + RTE_LOG(ERR, EAL, "Number of fd's is negative\n"); + rte_errno = EINVAL; + return -1; + } + + if (msg->len_param > RTE_MP_MAX_PARAM_LEN) { + RTE_LOG(ERR, EAL, "Message data is too long\n"); + rte_errno = E2BIG; + return -1; + } + + if (msg->num_fds > RTE_MP_MAX_FD_NUM) { + RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", + RTE_MP_MAX_FD_NUM); + rte_errno = E2BIG; + return -1; + } + + return 0; +} + +int +rte_mp_sendmsg(struct rte_mp_msg *msg) +{ + if (check_input(msg) != 0) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name); + return mp_send(msg, NULL, MP_MSG); +} + +static int +mp_request_async(const char *dst, struct rte_mp_msg *req, + struct async_request_param *param, const struct timespec *ts) +{ + struct rte_mp_msg *reply_msg; + struct pending_request *pending_req, *exist; + int ret = -1; + + pending_req = calloc(1, sizeof(*pending_req)); + reply_msg = calloc(1, sizeof(*reply_msg)); + if (pending_req == NULL || reply_msg == NULL) { + RTE_LOG(ERR, EAL, "Could not allocate space for sync request\n"); + rte_errno = ENOMEM; + ret = -1; + goto fail; + } + + pending_req->type = REQUEST_TYPE_ASYNC; + strlcpy(pending_req->dst, dst, sizeof(pending_req->dst)); + pending_req->request = req; + pending_req->reply = reply_msg; + pending_req->async.param = param; + + /* queue already locked by caller */ + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + ret = -1; + goto fail; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + ret = -1; + goto fail; + } else if (ret == 0) { + ret = 0; + goto fail; + } + param->user_reply.nb_sent++; + + /* if alarm set fails, we simply ignore the reply */ + if (rte_eal_alarm_set(ts->tv_sec * 1000000 + ts->tv_nsec / 1000, + async_reply_handle, pending_req) < 0) { + RTE_LOG(ERR, EAL, "Fail to set alarm for request %s:%s\n", + dst, req->name); + ret = -1; + goto fail; + } + TAILQ_INSERT_TAIL(&pending_requests.requests, pending_req, next); + + return 0; +fail: + free(pending_req); + free(reply_msg); + return ret; +} + +static int +mp_request_sync(const char *dst, struct rte_mp_msg *req, + struct rte_mp_reply *reply, const struct timespec *ts) +{ + int ret; + struct rte_mp_msg msg, *tmp; + struct pending_request pending_req, *exist; + + pending_req.type = REQUEST_TYPE_SYNC; + pending_req.reply_received = 0; + strlcpy(pending_req.dst, dst, sizeof(pending_req.dst)); + pending_req.request = req; + 
pending_req.reply = &msg; + pthread_cond_init(&pending_req.sync.cond, NULL); + + exist = find_pending_request(dst, req->name); + if (exist) { + RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name); + rte_errno = EEXIST; + return -1; + } + + ret = send_msg(dst, req, MP_REQ); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n", + dst, req->name); + return -1; + } else if (ret == 0) + return 0; + + TAILQ_INSERT_TAIL(&pending_requests.requests, &pending_req, next); + + reply->nb_sent++; + + do { + ret = pthread_cond_timedwait(&pending_req.sync.cond, + &pending_requests.lock, ts); + } while (ret != 0 && ret != ETIMEDOUT); + + TAILQ_REMOVE(&pending_requests.requests, &pending_req, next); + + if (pending_req.reply_received == 0) { + RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n", + dst, req->name); + rte_errno = ETIMEDOUT; + return -1; + } + if (pending_req.reply_received == -1) { + RTE_LOG(DEBUG, EAL, "Asked to ignore response\n"); + /* not receiving this message is not an error, so decrement + * number of sent messages + */ + reply->nb_sent--; + return 0; + } + + tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1)); + if (!tmp) { + RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n", + dst, req->name); + rte_errno = ENOMEM; + return -1; + } + memcpy(&tmp[reply->nb_received], &msg, sizeof(msg)); + reply->msgs = tmp; + reply->nb_received++; + return 0; +} + +int +rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply, + const struct timespec *ts) +{ + int dir_fd, ret = -1; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec end; + + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + if (check_input(req) != 0) + goto end; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Failed to get current time\n"); + rte_errno = errno; + goto end; + } + + end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end.tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + pthread_mutex_lock(&pending_requests.lock); + ret = mp_request_sync(eal_mp_socket_path(), req, reply, &end); + pthread_mutex_unlock(&pending_requests.lock); + goto end; + } + + /* for primary process, broadcast request, and collect reply 1 by 1 */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + goto end; + } + + dir_fd = dirfd(mp_dir); + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + goto close_end; + } + + pthread_mutex_lock(&pending_requests.lock); + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + /* unlocks the mutex while waiting for response, + * locks on receive + */ + if (mp_request_sync(path, req, reply, &end)) + goto unlock_end; + } + ret = 0; + +unlock_end: + pthread_mutex_unlock(&pending_requests.lock); + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + 
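+ /* Cleanup note on the label chain here: unlock_end drops the
+  * request-queue mutex and the directory lock, close_end closes the
+  * directory stream, and end frees any partially collected replies
+  * when ret is nonzero, so each exit path unwinds exactly what it
+  * acquired.
+  */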
+close_end: + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + +end: + if (ret) { + free(reply->msgs); + reply->nb_received = 0; + reply->msgs = NULL; + } + return ret; +} + +int +rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts, + rte_mp_async_reply_t clb) +{ + struct rte_mp_msg *copy; + struct pending_request *dummy; + struct async_request_param *param; + struct rte_mp_reply *reply; + int dir_fd, ret = 0; + DIR *mp_dir; + struct dirent *ent; + struct timeval now; + struct timespec *end; + bool dummy_used = false; + + RTE_LOG(DEBUG, EAL, "request: %s\n", req->name); + + if (check_input(req) != 0) + return -1; + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + rte_errno = ENOTSUP; + return -1; + } + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Failed to get current time\n"); + rte_errno = errno; + return -1; + } + copy = calloc(1, sizeof(*copy)); + dummy = calloc(1, sizeof(*dummy)); + param = calloc(1, sizeof(*param)); + if (copy == NULL || dummy == NULL || param == NULL) { + RTE_LOG(ERR, EAL, "Failed to allocate memory for async reply\n"); + rte_errno = ENOMEM; + goto fail; + } + + /* copy message */ + memcpy(copy, req, sizeof(*copy)); + + param->n_responses_processed = 0; + param->clb = clb; + end = ¶m->end; + reply = ¶m->user_reply; + + end->tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000; + end->tv_sec = now.tv_sec + ts->tv_sec + + (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000; + reply->nb_sent = 0; + reply->nb_received = 0; + reply->msgs = NULL; + + /* we have to lock the request queue here, as we will be adding a bunch + * of requests to the queue at once, and some of the replies may arrive + * before we add all of the requests to the queue. + */ + pthread_mutex_lock(&pending_requests.lock); + + /* we have to ensure that callback gets triggered even if we don't send + * anything, therefore earlier we have allocated a dummy request. fill + * it, and put it on the queue if we don't send any requests. 
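+ * Rationale sketch: reply_received is pre-set to 1 below so the dummy
+ * counts as already answered; with nb_sent == 0, the completion check
+ * in process_async_request() (n_responses_processed == nb_sent) holds
+ * immediately, so the user callback still fires even though nothing
+ * was actually sent.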
+ */ + dummy->type = REQUEST_TYPE_ASYNC; + dummy->request = copy; + dummy->reply = NULL; + dummy->async.param = param; + dummy->reply_received = 1; /* short-circuit the timeout */ + + /* for secondary process, send request to the primary process only */ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + ret = mp_request_async(eal_mp_socket_path(), copy, param, ts); + + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_TAIL(&pending_requests.requests, dummy, + next); + dummy_used = true; + } + + pthread_mutex_unlock(&pending_requests.lock); + + /* if we couldn't send anything, clean up */ + if (ret != 0) + goto fail; + return 0; + } + + /* for primary process, broadcast request */ + mp_dir = opendir(mp_dir_path); + if (!mp_dir) { + RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path); + rte_errno = errno; + goto unlock_fail; + } + dir_fd = dirfd(mp_dir); + + /* lock the directory to prevent processes spinning up while we send */ + if (flock(dir_fd, LOCK_SH)) { + RTE_LOG(ERR, EAL, "Unable to lock directory %s\n", + mp_dir_path); + rte_errno = errno; + goto closedir_fail; + } + + while ((ent = readdir(mp_dir))) { + char path[PATH_MAX]; + + if (fnmatch(mp_filter, ent->d_name, 0) != 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", mp_dir_path, + ent->d_name); + + if (mp_request_async(path, copy, param, ts)) + ret = -1; + } + /* if we didn't send anything, put dummy request on the queue */ + if (ret == 0 && reply->nb_sent == 0) { + TAILQ_INSERT_HEAD(&pending_requests.requests, dummy, next); + dummy_used = true; + } + + /* finally, unlock the queue */ + pthread_mutex_unlock(&pending_requests.lock); + + /* unlock the directory */ + flock(dir_fd, LOCK_UN); + + /* dir_fd automatically closed on closedir */ + closedir(mp_dir); + + /* if dummy was unused, free it */ + if (!dummy_used) + free(dummy); + + return ret; +closedir_fail: + closedir(mp_dir); +unlock_fail: + pthread_mutex_unlock(&pending_requests.lock); +fail: + free(dummy); + free(param); + free(copy); + return -1; +} + +int +rte_mp_reply(struct rte_mp_msg *msg, const char *peer) +{ + RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name); + + if (check_input(msg) != 0) + return -1; + + if (peer == NULL) { + RTE_LOG(ERR, EAL, "peer is not specified\n"); + rte_errno = EINVAL; + return -1; + } + + if (internal_config.no_shconf) { + RTE_LOG(DEBUG, EAL, "No shared files mode enabled, IPC is disabled\n"); + return 0; + } + + return mp_send(msg, peer, MP_REP); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c new file mode 100644 index 000000000..60c5dd66f --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_string_fns.c @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <string.h> +#include <stdio.h> +#include <stdarg.h> +#include <errno.h> + +#include <rte_string_fns.h> + +/* split string into tokens */ +int +rte_strsplit(char *string, int stringlen, + char **tokens, int maxtokens, char delim) +{ + int i, tok = 0; + int tokstart = 1; /* first token is right at start of string */ + + if (string == NULL || tokens == NULL) + goto einval_error; + + for (i = 0; i < stringlen; i++) { + if (string[i] == '\0' || tok >= maxtokens) + break; + if (tokstart) { + tokstart = 0; + tokens[tok++] = &string[i]; + } + if (string[i] == delim) { + string[i] = '\0'; + tokstart = 1; + } + } + return tok; + 
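+ /* Worked example: rte_strsplit("ab,cd,ef", 9, toks, 3, ',') rewrites
+  * the buffer in place to "ab\0cd\0ef", returns 3, and leaves toks[0..2]
+  * pointing at "ab", "cd" and "ef".
+  */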
+einval_error: + errno = EINVAL; + return -1; +} + +/* Copy src string into dst. + * + * Return negative value and NUL-terminate if dst is too short, + * Otherwise return number of bytes copied. + */ +ssize_t +rte_strscpy(char *dst, const char *src, size_t dsize) +{ + size_t nleft = dsize; + size_t res = 0; + + /* Copy as many bytes as will fit. */ + while (nleft != 0) { + dst[res] = src[res]; + if (src[res] == '\0') + return res; + res++; + nleft--; + } + + /* Not enough room in dst, set NUL and return error. */ + if (res != 0) + dst[res - 1] = '\0'; + return -E2BIG; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c new file mode 100644 index 000000000..ead06897b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_tailqs.c @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <sys/queue.h> +#include <stdint.h> +#include <errno.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <inttypes.h> + +#include <rte_memory.h> +#include <rte_launch.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_atomic.h> +#include <rte_branch_prediction.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_debug.h> + +#include "eal_private.h" +#include "eal_memcfg.h" + +TAILQ_HEAD(rte_tailq_elem_head, rte_tailq_elem); +/* local tailq list */ +static struct rte_tailq_elem_head rte_tailq_elem_head = + TAILQ_HEAD_INITIALIZER(rte_tailq_elem_head); + +/* number of tailqs registered, -1 before call to rte_eal_tailqs_init */ +static int rte_tailqs_count = -1; + +struct rte_tailq_head * +rte_eal_tailq_lookup(const char *name) +{ + unsigned i; + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (name == NULL) + return NULL; + + for (i = 0; i < RTE_MAX_TAILQ; i++) { + if (!strncmp(name, mcfg->tailq_head[i].name, + RTE_TAILQ_NAMESIZE-1)) + return &mcfg->tailq_head[i]; + } + + return NULL; +} + +void +rte_dump_tailq(FILE *f) +{ + struct rte_mem_config *mcfg; + unsigned i = 0; + + mcfg = rte_eal_get_configuration()->mem_config; + + rte_mcfg_tailq_read_lock(); + for (i = 0; i < RTE_MAX_TAILQ; i++) { + const struct rte_tailq_head *tailq = &mcfg->tailq_head[i]; + const struct rte_tailq_entry_head *head = &tailq->tailq_head; + + fprintf(f, "Tailq %u: qname:<%s>, tqh_first:%p, tqh_last:%p\n", + i, tailq->name, head->tqh_first, head->tqh_last); + } + rte_mcfg_tailq_read_unlock(); +} + +static struct rte_tailq_head * +rte_eal_tailq_create(const char *name) +{ + struct rte_tailq_head *head = NULL; + + if (!rte_eal_tailq_lookup(name) && + (rte_tailqs_count + 1 < RTE_MAX_TAILQ)) { + struct rte_mem_config *mcfg; + + mcfg = rte_eal_get_configuration()->mem_config; + head = &mcfg->tailq_head[rte_tailqs_count]; + strlcpy(head->name, name, sizeof(head->name) - 1); + TAILQ_INIT(&head->tailq_head); + rte_tailqs_count++; + } + + return head; +} + +/* local register, used to store "early" tailqs before rte_eal_init() and to + * ensure secondary process only registers tailqs once. 
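+ * Registration normally goes through the EAL_REGISTER_TAILQ()
+ * constructor macro; a minimal sketch (my_elem is a hypothetical
+ * element, not defined here):
+ *
+ *   static struct rte_tailq_elem my_elem = {
+ *           .name = "MY_TAILQ",
+ *   };
+ *   EAL_REGISTER_TAILQ(my_elem)
+ *
+ * Constructors run before rte_eal_init(), so entries wait on this
+ * local list until rte_eal_tailqs_init() attaches them to the shared
+ * rte_mem_config tailq heads.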
*/ +static int +rte_eal_tailq_local_register(struct rte_tailq_elem *t) +{ + struct rte_tailq_elem *temp; + + TAILQ_FOREACH(temp, &rte_tailq_elem_head, next) { + if (!strncmp(t->name, temp->name, sizeof(temp->name))) + return -1; + } + + TAILQ_INSERT_TAIL(&rte_tailq_elem_head, t, next); + return 0; +} + +static void +rte_eal_tailq_update(struct rte_tailq_elem *t) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* primary process is the only one that creates */ + t->head = rte_eal_tailq_create(t->name); + } else { + t->head = rte_eal_tailq_lookup(t->name); + } +} + +int +rte_eal_tailq_register(struct rte_tailq_elem *t) +{ + if (rte_eal_tailq_local_register(t) < 0) { + RTE_LOG(ERR, EAL, + "%s tailq is already registered\n", t->name); + goto error; + } + + /* if a register happens after rte_eal_tailqs_init(), then we can update + * tailq head */ + if (rte_tailqs_count >= 0) { + rte_eal_tailq_update(t); + if (t->head == NULL) { + RTE_LOG(ERR, EAL, + "Cannot initialize tailq: %s\n", t->name); + TAILQ_REMOVE(&rte_tailq_elem_head, t, next); + goto error; + } + } + + return 0; + +error: + t->head = NULL; + return -1; +} + +int +rte_eal_tailqs_init(void) +{ + struct rte_tailq_elem *t; + + rte_tailqs_count = 0; + + TAILQ_FOREACH(t, &rte_tailq_elem_head, next) { + /* second part of register job for "early" tailqs, see + * rte_eal_tailq_register and EAL_REGISTER_TAILQ */ + rte_eal_tailq_update(t); + if (t->head == NULL) { + RTE_LOG(ERR, EAL, + "Cannot initialize tailq: %s\n", t->name); + /* TAILQ_REMOVE not needed, error is already fatal */ + goto fail; + } + } + + return 0; + +fail: + rte_dump_tailq(stderr); + return -1; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c new file mode 100644 index 000000000..f9f588c17 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_thread.c @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <pthread.h> +#include <signal.h> +#include <sched.h> +#include <assert.h> +#include <string.h> + +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_log.h> +#ifndef RTE_EXEC_ENV_WINDOWS +#include <rte_trace_point.h> +#endif + +#include "eal_internal_cfg.h" +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DECLARE_PER_LCORE(unsigned , _socket_id); + +unsigned rte_socket_id(void) +{ + return RTE_PER_LCORE(_socket_id); +} + +int +rte_lcore_has_role(unsigned int lcore_id, enum rte_lcore_role_t role) +{ + struct rte_config *cfg = rte_eal_get_configuration(); + + if (lcore_id >= RTE_MAX_LCORE) + return -EINVAL; + + return cfg->lcore_role[lcore_id] == role; +} + +static int +eal_cpuset_socket_id(rte_cpuset_t *cpusetp) +{ + unsigned cpu = 0; + int socket_id = SOCKET_ID_ANY; + int sid; + + if (cpusetp == NULL) + return SOCKET_ID_ANY; + + do { + if (!CPU_ISSET(cpu, cpusetp)) + continue; + + if (socket_id == SOCKET_ID_ANY) + socket_id = eal_cpu_socket_id(cpu); + + sid = eal_cpu_socket_id(cpu); + if (socket_id != sid) { + socket_id = SOCKET_ID_ANY; + break; + } + + } while (++cpu < CPU_SETSIZE); + + return socket_id; +} + +int +rte_thread_set_affinity(rte_cpuset_t *cpusetp) +{ + int s; + unsigned lcore_id; + pthread_t tid; + + tid = pthread_self(); + + s = pthread_setaffinity_np(tid, sizeof(rte_cpuset_t), cpusetp); + if (s != 0) { + RTE_LOG(ERR, EAL, "pthread_setaffinity_np failed\n"); + return -1; + } + + /* store 
socket_id in TLS for quick access */ + RTE_PER_LCORE(_socket_id) = + eal_cpuset_socket_id(cpusetp); + + /* store cpuset in TLS for quick access */ + memmove(&RTE_PER_LCORE(_cpuset), cpusetp, + sizeof(rte_cpuset_t)); + + lcore_id = rte_lcore_id(); + if (lcore_id != (unsigned)LCORE_ID_ANY) { + /* EAL thread will update lcore_config */ + lcore_config[lcore_id].socket_id = RTE_PER_LCORE(_socket_id); + memmove(&lcore_config[lcore_id].cpuset, cpusetp, + sizeof(rte_cpuset_t)); + } + + return 0; +} + +void +rte_thread_get_affinity(rte_cpuset_t *cpusetp) +{ + assert(cpusetp); + memmove(cpusetp, &RTE_PER_LCORE(_cpuset), + sizeof(rte_cpuset_t)); +} + +int +eal_thread_dump_affinity(char *str, unsigned size) +{ + rte_cpuset_t cpuset; + unsigned cpu; + int ret; + unsigned int out = 0; + + rte_thread_get_affinity(&cpuset); + + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { + if (!CPU_ISSET(cpu, &cpuset)) + continue; + + ret = snprintf(str + out, + size - out, "%u,", cpu); + if (ret < 0 || (unsigned)ret >= size - out) { + /* string will be truncated */ + ret = -1; + goto exit; + } + + out += ret; + } + + ret = 0; +exit: + /* remove the last separator */ + if (out > 0) + str[out - 1] = '\0'; + + return ret; +} + + +struct rte_thread_ctrl_params { + void *(*start_routine)(void *); + void *arg; + pthread_barrier_t configured; +}; + +static void *rte_thread_init(void *arg) +{ + int ret; + rte_cpuset_t *cpuset = &internal_config.ctrl_cpuset; + struct rte_thread_ctrl_params *params = arg; + void *(*start_routine)(void *) = params->start_routine; + void *routine_arg = params->arg; + + /* Store cpuset in TLS for quick access */ + memmove(&RTE_PER_LCORE(_cpuset), cpuset, sizeof(rte_cpuset_t)); + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + +#ifndef RTE_EXEC_ENV_WINDOWS + __rte_trace_mem_per_thread_alloc(); +#endif + return start_routine(routine_arg); +} + +int +rte_ctrl_thread_create(pthread_t *thread, const char *name, + const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + rte_cpuset_t *cpuset = &internal_config.ctrl_cpuset; + struct rte_thread_ctrl_params *params; + int ret; + + params = malloc(sizeof(*params)); + if (!params) + return -ENOMEM; + + params->start_routine = start_routine; + params->arg = arg; + + pthread_barrier_init(¶ms->configured, NULL, 2); + + ret = pthread_create(thread, attr, rte_thread_init, (void *)params); + if (ret != 0) { + free(params); + return -ret; + } + + if (name != NULL) { + ret = rte_thread_setname(*thread, name); + if (ret < 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for ctrl thread\n"); + } + + ret = pthread_setaffinity_np(*thread, sizeof(*cpuset), cpuset); + if (ret) + goto fail; + + ret = pthread_barrier_wait(¶ms->configured); + if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + + return 0; + +fail: + if (PTHREAD_BARRIER_SERIAL_THREAD == + pthread_barrier_wait(¶ms->configured)) { + pthread_barrier_destroy(¶ms->configured); + free(params); + } + pthread_cancel(*thread); + pthread_join(*thread, NULL); + return -ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c new file mode 100644 index 000000000..fa9ee1b22 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_timer.c @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include 
<string.h> +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/types.h> +#include <time.h> +#include <errno.h> + +#include <rte_common.h> +#include <rte_compat.h> +#include <rte_log.h> +#include <rte_cycles.h> +#include <rte_pause.h> +#include <rte_eal.h> + +#include "eal_private.h" +#include "eal_memcfg.h" + +/* The frequency of the RDTSC timer resolution */ +static uint64_t eal_tsc_resolution_hz; + +/* Pointer to user delay function */ +void (*rte_delay_us)(unsigned int) = NULL; + +void +rte_delay_us_block(unsigned int us) +{ + const uint64_t start = rte_get_timer_cycles(); + const uint64_t ticks = (uint64_t)us * rte_get_timer_hz() / 1E6; + while ((rte_get_timer_cycles() - start) < ticks) + rte_pause(); +} + +void +rte_delay_us_sleep(unsigned int us) +{ + struct timespec wait[2]; + int ind = 0; + + wait[0].tv_sec = 0; + if (us >= US_PER_S) { + wait[0].tv_sec = us / US_PER_S; + us -= wait[0].tv_sec * US_PER_S; + } + wait[0].tv_nsec = 1000 * us; + + while (nanosleep(&wait[ind], &wait[1 - ind]) && errno == EINTR) { + /* + * Sleep was interrupted. Flip the index, so the 'remainder' + * will become the 'request' for a next call. + */ + ind = 1 - ind; + } +} + +uint64_t +rte_get_tsc_hz(void) +{ + return eal_tsc_resolution_hz; +} + +static uint64_t +estimate_tsc_freq(void) +{ +#define CYC_PER_10MHZ 1E7 + RTE_LOG(WARNING, EAL, "WARNING: TSC frequency estimated roughly" + " - clock timings may be less accurate.\n"); + /* assume that the sleep(1) will sleep for 1 second */ + uint64_t start = rte_rdtsc(); + sleep(1); + /* Round up to 10Mhz. 1E7 ~ 10Mhz */ + return RTE_ALIGN_MUL_NEAR(rte_rdtsc() - start, CYC_PER_10MHZ); +} + +void +set_tsc_freq(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint64_t freq; + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { + /* + * Just use the primary process calculated TSC rate in any + * secondary process. It avoids any unnecessary overhead on + * systems where arch-specific frequency detection is not + * available. + */ + eal_tsc_resolution_hz = mcfg->tsc_hz; + return; + } + + freq = get_tsc_freq_arch(); + if (!freq) + freq = get_tsc_freq(); + if (!freq) + freq = estimate_tsc_freq(); + + RTE_LOG(DEBUG, EAL, "TSC frequency is ~%" PRIu64 " KHz\n", freq / 1000); + eal_tsc_resolution_hz = freq; + mcfg->tsc_hz = freq; +} + +void rte_delay_us_callback_register(void (*userfunc)(unsigned int)) +{ + rte_delay_us = userfunc; +} + +RTE_INIT(rte_timer_init) +{ + /* set rte_delay_us_block as a delay function */ + rte_delay_us_callback_register(rte_delay_us_block); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace.c new file mode 100644 index 000000000..875553d7e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace.c @@ -0,0 +1,498 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. 
+ */ + +#include <fnmatch.h> +#include <inttypes.h> +#include <sys/queue.h> +#include <regex.h> + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_string_fns.h> + +#include "eal_trace.h" + +RTE_DEFINE_PER_LCORE(volatile int, trace_point_sz); +RTE_DEFINE_PER_LCORE(void *, trace_mem); +static RTE_DEFINE_PER_LCORE(char, ctf_field[TRACE_CTF_FIELD_SIZE]); +static RTE_DEFINE_PER_LCORE(int, ctf_count); + +static struct trace_point_head tp_list = STAILQ_HEAD_INITIALIZER(tp_list); +static struct trace trace = { .args = STAILQ_HEAD_INITIALIZER(trace.args), }; + +struct trace * +trace_obj_get(void) +{ + return &trace; +} + +struct trace_point_head * +trace_list_head_get(void) +{ + return &tp_list; +} + +int +eal_trace_init(void) +{ + struct trace_arg *arg; + + /* Trace memory should start with 8B aligned for natural alignment */ + RTE_BUILD_BUG_ON((offsetof(struct __rte_trace_header, mem) % 8) != 0); + + /* One of the trace point registration failed */ + if (trace.register_errno) { + rte_errno = trace.register_errno; + goto fail; + } + + if (!STAILQ_EMPTY(&trace.args)) + trace.status = true; + + if (!rte_trace_is_enabled()) + return 0; + + rte_spinlock_init(&trace.lock); + + /* Is duplicate trace name registered */ + if (trace_has_duplicate_entry()) + goto fail; + + /* Generate UUID ver 4 with total size of events and number of + * events + */ + trace_uuid_generate(); + + /* Apply buffer size configuration for trace output */ + trace_bufsz_args_apply(); + + /* Generate CTF TDSL metadata */ + if (trace_metadata_create() < 0) + goto fail; + + /* Create trace directory */ + if (trace_mkdir()) + goto free_meta; + + /* Save current epoch timestamp for future use */ + if (trace_epoch_time_save() < 0) + goto fail; + + /* Apply global configurations */ + STAILQ_FOREACH(arg, &trace.args, next) + trace_args_apply(arg->val); + + rte_trace_mode_set(trace.mode); + + return 0; + +free_meta: + trace_metadata_destroy(); +fail: + trace_err("failed to initialize trace [%s]", rte_strerror(rte_errno)); + return -rte_errno; +} + +void +eal_trace_fini(void) +{ + if (!rte_trace_is_enabled()) + return; + trace_mem_per_thread_free(); + trace_metadata_destroy(); + eal_trace_args_free(); +} + +bool +rte_trace_is_enabled(void) +{ + return trace.status; +} + +static void +trace_mode_set(rte_trace_point_t *trace, enum rte_trace_mode mode) +{ + if (mode == RTE_TRACE_MODE_OVERWRITE) + __atomic_and_fetch(trace, ~__RTE_TRACE_FIELD_ENABLE_DISCARD, + __ATOMIC_RELEASE); + else + __atomic_or_fetch(trace, __RTE_TRACE_FIELD_ENABLE_DISCARD, + __ATOMIC_RELEASE); +} + +void +rte_trace_mode_set(enum rte_trace_mode mode) +{ + struct trace_point *tp; + + if (!rte_trace_is_enabled()) + return; + + STAILQ_FOREACH(tp, &tp_list, next) + trace_mode_set(tp->handle, mode); + + trace.mode = mode; +} + +enum +rte_trace_mode rte_trace_mode_get(void) +{ + return trace.mode; +} + +static bool +trace_point_is_invalid(rte_trace_point_t *t) +{ + return (t == NULL) || (trace_id_get(t) >= trace.nb_trace_points); +} + +bool +rte_trace_point_is_enabled(rte_trace_point_t *trace) +{ + uint64_t val; + + if (trace_point_is_invalid(trace)) + return false; + + val = __atomic_load_n(trace, __ATOMIC_ACQUIRE); + return (val & __RTE_TRACE_FIELD_ENABLE_MASK) != 0; +} + +int +rte_trace_point_enable(rte_trace_point_t *trace) +{ + if (trace_point_is_invalid(trace)) + return -ERANGE; + + __atomic_or_fetch(trace, __RTE_TRACE_FIELD_ENABLE_MASK, + __ATOMIC_RELEASE); + return 0; +} + +int 
+rte_trace_point_disable(rte_trace_point_t *trace) +{ + if (trace_point_is_invalid(trace)) + return -ERANGE; + + __atomic_and_fetch(trace, ~__RTE_TRACE_FIELD_ENABLE_MASK, + __ATOMIC_RELEASE); + return 0; +} + +int +rte_trace_pattern(const char *pattern, bool enable) +{ + struct trace_point *tp; + int rc = 0, found = 0; + + STAILQ_FOREACH(tp, &tp_list, next) { + if (fnmatch(pattern, tp->name, 0) == 0) { + if (enable) + rc = rte_trace_point_enable(tp->handle); + else + rc = rte_trace_point_disable(tp->handle); + found = 1; + } + if (rc < 0) + return rc; + } + + return rc | found; +} + +int +rte_trace_regexp(const char *regex, bool enable) +{ + struct trace_point *tp; + int rc = 0, found = 0; + regex_t r; + + if (regcomp(&r, regex, 0) != 0) + return -EINVAL; + + STAILQ_FOREACH(tp, &tp_list, next) { + if (regexec(&r, tp->name, 0, NULL, 0) == 0) { + if (enable) + rc = rte_trace_point_enable(tp->handle); + else + rc = rte_trace_point_disable(tp->handle); + found = 1; + } + if (rc < 0) + return rc; + } + regfree(&r); + + return rc | found; +} + +rte_trace_point_t * +rte_trace_point_lookup(const char *name) +{ + struct trace_point *tp; + + if (name == NULL) + return NULL; + + STAILQ_FOREACH(tp, &tp_list, next) + if (strncmp(tp->name, name, TRACE_POINT_NAME_SIZE) == 0) + return tp->handle; + + return NULL; +} + +static void +trace_point_dump(FILE *f, struct trace_point *tp) +{ + rte_trace_point_t *handle = tp->handle; + + fprintf(f, "\tid %d, %s, size is %d, %s\n", + trace_id_get(handle), tp->name, + (uint16_t)(*handle & __RTE_TRACE_FIELD_SIZE_MASK), + rte_trace_point_is_enabled(handle) ? "enabled" : "disabled"); +} + +static void +trace_lcore_mem_dump(FILE *f) +{ + struct trace *trace = trace_obj_get(); + struct __rte_trace_header *header; + uint32_t count; + + if (trace->nb_trace_mem_list == 0) + return; + + rte_spinlock_lock(&trace->lock); + fprintf(f, "nb_trace_mem_list = %d\n", trace->nb_trace_mem_list); + fprintf(f, "\nTrace mem info\n--------------\n"); + for (count = 0; count < trace->nb_trace_mem_list; count++) { + header = trace->lcore_meta[count].mem; + fprintf(f, "\tid %d, mem=%p, area=%s, lcore_id=%d, name=%s\n", + count, header, + trace_area_to_string(trace->lcore_meta[count].area), + header->stream_header.lcore_id, + header->stream_header.thread_name); + } + rte_spinlock_unlock(&trace->lock); +} + +void +rte_trace_dump(FILE *f) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace *trace = trace_obj_get(); + struct trace_point *tp; + + fprintf(f, "\nGlobal info\n-----------\n"); + fprintf(f, "status = %s\n", + rte_trace_is_enabled() ? 
"enabled" : "disabled"); + fprintf(f, "mode = %s\n", + trace_mode_to_string(rte_trace_mode_get())); + fprintf(f, "dir = %s\n", trace->dir); + fprintf(f, "buffer len = %d\n", trace->buff_len); + fprintf(f, "number of trace points = %d\n", trace->nb_trace_points); + + trace_lcore_mem_dump(f); + fprintf(f, "\nTrace point info\n----------------\n"); + STAILQ_FOREACH(tp, tp_list, next) + trace_point_dump(f, tp); +} + +void +__rte_trace_mem_per_thread_alloc(void) +{ + struct trace *trace = trace_obj_get(); + struct __rte_trace_header *header; + uint32_t count; + + if (!rte_trace_is_enabled()) + return; + + if (RTE_PER_LCORE(trace_mem)) + return; + + rte_spinlock_lock(&trace->lock); + + count = trace->nb_trace_mem_list; + + /* Allocate room for storing the thread trace mem meta */ + trace->lcore_meta = realloc(trace->lcore_meta, + sizeof(trace->lcore_meta[0]) * (count + 1)); + + /* Provide dummy space for fast path to consume */ + if (trace->lcore_meta == NULL) { + trace_crit("trace mem meta memory realloc failed"); + header = NULL; + goto fail; + } + + /* First attempt from huge page */ + header = eal_malloc_no_trace(NULL, trace_mem_sz(trace->buff_len), 8); + if (header) { + trace->lcore_meta[count].area = TRACE_AREA_HUGEPAGE; + goto found; + } + + /* Second attempt from heap */ + header = malloc(trace_mem_sz(trace->buff_len)); + if (header == NULL) { + trace_crit("trace mem malloc attempt failed"); + header = NULL; + goto fail; + + } + + /* Second attempt from heap is success */ + trace->lcore_meta[count].area = TRACE_AREA_HEAP; + + /* Initialize the trace header */ +found: + header->offset = 0; + header->len = trace->buff_len; + header->stream_header.magic = TRACE_CTF_MAGIC; + rte_uuid_copy(header->stream_header.uuid, trace->uuid); + header->stream_header.lcore_id = rte_lcore_id(); + + /* Store the thread name */ + char *name = header->stream_header.thread_name; + memset(name, 0, __RTE_TRACE_EMIT_STRING_LEN_MAX); + rte_thread_getname(pthread_self(), name, + __RTE_TRACE_EMIT_STRING_LEN_MAX); + + trace->lcore_meta[count].mem = header; + trace->nb_trace_mem_list++; +fail: + RTE_PER_LCORE(trace_mem) = header; + rte_spinlock_unlock(&trace->lock); +} + +void +trace_mem_per_thread_free(void) +{ + struct trace *trace = trace_obj_get(); + uint32_t count; + void *mem; + + if (!rte_trace_is_enabled()) + return; + + rte_spinlock_lock(&trace->lock); + for (count = 0; count < trace->nb_trace_mem_list; count++) { + mem = trace->lcore_meta[count].mem; + if (trace->lcore_meta[count].area == TRACE_AREA_HUGEPAGE) + eal_free_no_trace(mem); + else if (trace->lcore_meta[count].area == TRACE_AREA_HEAP) + free(mem); + } + rte_spinlock_unlock(&trace->lock); +} + +void +__rte_trace_point_emit_field(size_t sz, const char *in, const char *datatype) +{ + char *field = RTE_PER_LCORE(ctf_field); + int count = RTE_PER_LCORE(ctf_count); + size_t size; + int rc; + + size = RTE_MAX(0, TRACE_CTF_FIELD_SIZE - 1 - count); + RTE_PER_LCORE(trace_point_sz) += sz; + rc = snprintf(RTE_PTR_ADD(field, count), size, "%s %s;", datatype, in); + if (rc <= 0 || (size_t)rc >= size) { + RTE_PER_LCORE(trace_point_sz) = 0; + trace_crit("CTF field is too long"); + return; + } + RTE_PER_LCORE(ctf_count) += rc; +} + +int +__rte_trace_point_register(rte_trace_point_t *handle, const char *name, + void (*register_fn)(void)) +{ + char *field = RTE_PER_LCORE(ctf_field); + struct trace_point *tp; + uint16_t sz; + + /* Sanity checks of arguments */ + if (name == NULL || register_fn == NULL || handle == NULL) { + trace_err("invalid arguments"); + rte_errno 
= EINVAL; + goto fail; + } + + /* Check the size of the trace point object */ + RTE_PER_LCORE(trace_point_sz) = 0; + RTE_PER_LCORE(ctf_count) = 0; + register_fn(); + if (RTE_PER_LCORE(trace_point_sz) == 0) { + trace_err("missing rte_trace_emit_header() in register fn"); + rte_errno = EBADF; + goto fail; + } + + /* Is size overflowed */ + if (RTE_PER_LCORE(trace_point_sz) > UINT16_MAX) { + trace_err("trace point size overflowed"); + rte_errno = ENOSPC; + goto fail; + } + + /* Are we running out of space to store trace points? */ + if (trace.nb_trace_points > UINT16_MAX) { + trace_err("trace point exceeds the max count"); + rte_errno = ENOSPC; + goto fail; + } + + /* Get the size of the trace point */ + sz = RTE_PER_LCORE(trace_point_sz); + tp = calloc(1, sizeof(struct trace_point)); + if (tp == NULL) { + trace_err("fail to allocate trace point memory"); + rte_errno = ENOMEM; + goto fail; + } + + /* Initialize the trace point */ + if (rte_strscpy(tp->name, name, TRACE_POINT_NAME_SIZE) < 0) { + trace_err("name is too long"); + rte_errno = E2BIG; + goto free; + } + + /* Copy the field data for future use */ + if (rte_strscpy(tp->ctf_field, field, TRACE_CTF_FIELD_SIZE) < 0) { + trace_err("CTF field size is too long"); + rte_errno = E2BIG; + goto free; + } + + /* Clear field memory for the next event */ + memset(field, 0, TRACE_CTF_FIELD_SIZE); + + /* Form the trace handle */ + *handle = sz; + *handle |= trace.nb_trace_points << __RTE_TRACE_FIELD_ID_SHIFT; + + trace.nb_trace_points++; + tp->handle = handle; + + /* Add the trace point at tail */ + STAILQ_INSERT_TAIL(&tp_list, tp, next); + __atomic_thread_fence(__ATOMIC_RELEASE); + + /* All Good !!! */ + return 0; +free: + free(tp); +fail: + if (trace.register_errno == 0) + trace.register_errno = rte_errno; + + return -rte_errno; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_ctf.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_ctf.c new file mode 100644 index 000000000..302e2bb74 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_ctf.c @@ -0,0 +1,488 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <inttypes.h> +#include <time.h> + +#include <rte_byteorder.h> +#include <rte_common.h> +#include <rte_time.h> +#include <rte_trace.h> +#include <rte_version.h> + +#include "eal_trace.h" + +__rte_format_printf(2, 0) +static int +metadata_printf(char **str, const char *fmt, ...) 
+{ + va_list ap; + int rc; + + *str = NULL; + va_start(ap, fmt); + rc = vasprintf(str, fmt, ap); + va_end(ap); + + return rc; +} + +static int +meta_copy(char **meta, int *offset, char *str, int rc) +{ + int count = *offset; + char *ptr = *meta; + + if (rc < 0) + return rc; + + ptr = realloc(ptr, count + rc); + if (ptr == NULL) + goto free_str; + + memcpy(RTE_PTR_ADD(ptr, count), str, rc); + count += rc; + free(str); + + *meta = ptr; + *offset = count; + + return rc; + +free_str: + if (str) + free(str); + return -ENOMEM; +} + +static int +meta_data_type_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "/* CTF 1.8 */\n" + "typealias integer {size = 8; base = x;}:= uint8_t;\n" + "typealias integer {size = 16; base = x;} := uint16_t;\n" + "typealias integer {size = 32; base = x;} := uint32_t;\n" + "typealias integer {size = 64; base = x;} := uint64_t;\n" + "typealias integer {size = 8; signed = true;} := int8_t;\n" + "typealias integer {size = 16; signed = true;} := int16_t;\n" + "typealias integer {size = 32; signed = true;} := int32_t;\n" + "typealias integer {size = 64; signed = true;} := int64_t;\n" +#ifdef RTE_ARCH_64 + "typealias integer {size = 64; base = x;} := uintptr_t;\n" +#else + "typealias integer {size = 32; base = x;} := uintptr_t;\n" +#endif +#ifdef RTE_ARCH_64 + "typealias integer {size = 64; base = x;} := long;\n" +#else + "typealias integer {size = 32; base = x;} := long;\n" +#endif + "typealias integer {size = 8; signed = false; encoding = ASCII; } := string_bounded_t;\n\n" + "typealias floating_point {\n" + " exp_dig = 8;\n" + " mant_dig = 24;\n" + "} := float;\n\n" + "typealias floating_point {\n" + " exp_dig = 11;\n" + " mant_dig = 53;\n" + "} := double;\n\n"); + + return meta_copy(meta, offset, str, rc); +} + +static int +is_be(void) +{ +#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN + return 1; +#else + return 0; +#endif +} + +static int +meta_header_emit(char **meta, int *offset) +{ + struct trace *trace = trace_obj_get(); + char uustr[RTE_UUID_STRLEN]; + char *str = NULL; + int rc; + + rte_uuid_unparse(trace->uuid, uustr, RTE_UUID_STRLEN); + rc = metadata_printf(&str, + "trace {\n" + " major = 1;\n" + " minor = 8;\n" + " uuid = \"%s\";\n" + " byte_order = %s;\n" + " packet.header := struct {\n" + " uint32_t magic;\n" + " uint8_t uuid[16];\n" + " };\n" + "};\n\n", uustr, is_be() ? 
"be" : "le"); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_env_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "env {\n" + " dpdk_version = \"%s\";\n" + " tracer_name = \"dpdk\";\n" + "};\n\n", rte_version()); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass1_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "clock {\n" + " name = \"dpdk\";\n" + " freq = "); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass2_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "%20"PRIu64";\n" + " offset_s =", 0); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass3_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "%20"PRIu64";\n" + " offset =", 0); + return meta_copy(meta, offset, str, rc); +} + +static int +meta_clock_pass4_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "%20"PRIu64";\n};\n\n" + "typealias integer {\n" + " size = 48; align = 1; signed = false;\n" + " map = clock.dpdk.value;\n" + "} := uint48_clock_dpdk_t;\n\n", 0); + + return meta_copy(meta, offset, str, rc); +} + +static int +meta_stream_emit(char **meta, int *offset) +{ + char *str = NULL; + int rc; + + rc = metadata_printf(&str, + "stream {\n" + " packet.context := struct {\n" + " uint32_t cpu_id;\n" + " string_bounded_t name[32];\n" + " };\n" + " event.header := struct {\n" + " uint48_clock_dpdk_t timestamp;\n" + " uint16_t id;\n" + " } align(64);\n" + "};\n\n"); + return meta_copy(meta, offset, str, rc); +} + +static void +string_fixed_replace(char *input, const char *search, const char *replace) +{ + char *found; + size_t len; + + found = strstr(input, search); + if (found == NULL) + return; + + if (strlen(found) != strlen(search)) + return; + + len = strlen(replace); + memcpy(found, replace, len); + found[len] = '\0'; +} + +static void +ctf_fixup_align(char *str) +{ + string_fixed_replace(str, "align", "_align"); +} + +static void +ctf_fixup_arrow_deref(char *str) +{ + const char *replace = "_"; + const char *search = "->"; + char *found; + size_t len; + + found = strstr(str, search); + if (found == NULL) + return; + + do { + memcpy(found, replace, strlen(replace)); + len = strlen(found + 2); + memcpy(found + 1, found + 2, len); + found[len + 1] = '\0'; + found = strstr(str, search); + } while (found != NULL); +} + +static void +ctf_fixup_dot_deref(char *str) +{ + const char *replace = "_"; + const char *search = "."; + char *found; + size_t len; + + found = strstr(str, search); + if (found == NULL) + return; + + len = strlen(replace); + do { + memcpy(found, replace, len); + found = strstr(str, search); + } while (found != NULL); +} + +static void +ctf_fixup_event(char *str) +{ + string_fixed_replace(str, "event", "_event"); +} + +static int +ctf_fixup_keyword(char *str) +{ + char dup_str[TRACE_CTF_FIELD_SIZE]; + char input[TRACE_CTF_FIELD_SIZE]; + const char *delim = ";"; + char *from; + int len; + + if (str == NULL) + return 0; + + len = strlen(str); + if (len >= TRACE_CTF_FIELD_SIZE) { + trace_err("ctf_field reached its maximum limit"); + return -EMSGSIZE; + } + + /* Create duplicate string */ + strcpy(dup_str, str); + + len = 0; + from = strtok(dup_str, delim); + while (from != NULL) { + strcpy(input, from); + ctf_fixup_align(input); + ctf_fixup_dot_deref(input); + ctf_fixup_arrow_deref(input); 
+ ctf_fixup_event(input); + + strcpy(&input[strlen(input)], delim); + if ((len + strlen(input)) >= TRACE_CTF_FIELD_SIZE) { + trace_err("ctf_field reached its maximum limit"); + return -EMSGSIZE; + } + + strcpy(str + len, input); + len += strlen(input); + from = strtok(NULL, delim); + } + + return 0; +} + +static int +meta_event_emit(char **meta, int *offset, struct trace_point *tp) +{ + char *str = NULL; + int rc; + + /* Fix up the CTF field string in case it is using reserved CTF keywords */ + rc = ctf_fixup_keyword(tp->ctf_field); + if (rc) + return rc; + + rc = metadata_printf(&str, + "event {\n" + " id = %d;\n" + " name = \"%s\";\n" + " fields := struct {\n" + " %s\n" + " };\n" + "};\n\n", trace_id_get(tp->handle), tp->name, tp->ctf_field); + return meta_copy(meta, offset, str, rc); +} + +int +trace_metadata_create(void) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace *trace = trace_obj_get(); + struct trace_point *tp; + int rc, offset = 0; + char *meta = NULL; + + rc = meta_data_type_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_header_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_env_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_clock_pass1_emit(&meta, &offset); + if (rc < 0) + goto fail; + trace->ctf_meta_offset_freq = offset; + + rc = meta_clock_pass2_emit(&meta, &offset); + if (rc < 0) + goto fail; + trace->ctf_meta_offset_freq_off_s = offset; + + rc = meta_clock_pass3_emit(&meta, &offset); + if (rc < 0) + goto fail; + trace->ctf_meta_offset_freq_off = offset; + + rc = meta_clock_pass4_emit(&meta, &offset); + if (rc < 0) + goto fail; + + rc = meta_stream_emit(&meta, &offset); + if (rc < 0) + goto fail; + + STAILQ_FOREACH(tp, tp_list, next) + if (meta_event_emit(&meta, &offset, tp) < 0) + goto fail; + + trace->ctf_meta = meta; + return 0; + +fail: + if (meta) + free(meta); + return -EBADF; +} + +void +trace_metadata_destroy(void) +{ + struct trace *trace = trace_obj_get(); + + if (trace->ctf_meta) { + free(trace->ctf_meta); + trace->ctf_meta = NULL; + } +} + +static void +meta_fix_freq(struct trace *trace, char *meta) +{ + char *str; + int rc; + + str = RTE_PTR_ADD(meta, trace->ctf_meta_offset_freq); + rc = sprintf(str, "%20"PRIu64"", rte_get_timer_hz()); + str[rc] = ';'; +} + +static void +meta_fix_freq_offset(struct trace *trace, char *meta) +{ + uint64_t uptime_ticks_floor, uptime_ticks, freq, uptime_sec; + uint64_t offset, offset_s; + char *str; + int rc; + + uptime_ticks = trace->uptime_ticks & + ((1ULL << __RTE_TRACE_EVENT_HEADER_ID_SHIFT) - 1); + freq = rte_get_tsc_hz(); + uptime_ticks_floor = RTE_ALIGN_MUL_FLOOR(uptime_ticks, freq); + + uptime_sec = uptime_ticks_floor / freq; + offset_s = trace->epoch_sec - uptime_sec; + + offset = uptime_ticks - uptime_ticks_floor; + offset += trace->epoch_nsec * (freq / NSEC_PER_SEC); + + str = RTE_PTR_ADD(meta, trace->ctf_meta_offset_freq_off_s); + rc = sprintf(str, "%20"PRIu64"", offset_s); + str[rc] = ';'; + str = RTE_PTR_ADD(meta, trace->ctf_meta_offset_freq_off); + rc = sprintf(str, "%20"PRIu64"", offset); + str[rc] = ';'; +} + +static void +meta_fixup(struct trace *trace, char *meta) +{ + meta_fix_freq(trace, meta); + meta_fix_freq_offset(trace, meta); +} + +int +rte_trace_metadata_dump(FILE *f) +{ + struct trace *trace = trace_obj_get(); + char *ctf_meta = trace->ctf_meta; + int rc; + + if (!rte_trace_is_enabled()) + return 0; + + if (ctf_meta == NULL) + return -EINVAL; + + if (!__atomic_load_n(&trace->ctf_fixup_done, __ATOMIC_SEQ_CST) && +
rte_get_timer_hz()) { + meta_fixup(trace, ctf_meta); + __atomic_store_n(&trace->ctf_fixup_done, 1, __ATOMIC_SEQ_CST); + } + + rc = fprintf(f, "%s", ctf_meta); + return rc < 0 ? rc : 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_points.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_points.c new file mode 100644 index 000000000..4a8ce9088 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_points.c @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <rte_trace_point_register.h> + +#include <rte_eal_trace.h> + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_void); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u64); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u32); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u16); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_u8); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i64); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i32); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i16); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_i8); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_int); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_long); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_float); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_double); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_ptr); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_str); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_generic_func); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_alarm_set); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_alarm_cancel); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_zmalloc); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_malloc); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_realloc); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_mem_free); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_memzone_reserve); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_memzone_lookup); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_memzone_free); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_thread_remote_launch); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_thread_lcore_ready); + +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_callback_register); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_callback_unregister); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_enable); +RTE_TRACE_POINT_DEFINE(rte_eal_trace_intr_disable); + +RTE_INIT(eal_trace_init) +{ + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_void, + lib.eal.generic.void); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u64, + lib.eal.generic.u64); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u32, + lib.eal.generic.u32); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u16, + lib.eal.generic.u16); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_u8, + lib.eal.generic.u8); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i64, + lib.eal.generic.i64); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i32, + lib.eal.generic.i32); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i16, + lib.eal.generic.i16); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_i8, + lib.eal.generic.i8); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_int, + lib.eal.generic.int); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_long, + lib.eal.generic.long); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_float, + lib.eal.generic.float); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_double, + lib.eal.generic.double); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_ptr, + lib.eal.generic.ptr); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_str, + lib.eal.generic.string); + 
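+ /*
+  * Each RTE_TRACE_POINT_DEFINE above is paired with exactly one
+  * RTE_TRACE_POINT_REGISTER here; the dotted second argument becomes
+  * the event name that rte_trace_pattern()/rte_trace_regexp() match
+  * against, e.g. an EAL --trace=lib.eal.generic.* argument.
+  */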
RTE_TRACE_POINT_REGISTER(rte_eal_trace_generic_func, + lib.eal.generic.func); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_alarm_set, + lib.eal.alarm.set); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_alarm_cancel, + lib.eal.alarm.cancel); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_zmalloc, + lib.eal.mem.zmalloc); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_malloc, + lib.eal.mem.malloc); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_realloc, + lib.eal.mem.realloc); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_mem_free, + lib.eal.mem.free); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_memzone_reserve, + lib.eal.memzone.reserve); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_memzone_lookup, + lib.eal.memzone.lookup); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_memzone_free, + lib.eal.memzone.free); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_thread_remote_launch, + lib.eal.thread.remote.launch); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_thread_lcore_ready, + lib.eal.thread.lcore.ready); + + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_callback_register, + lib.eal.intr.register); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_callback_unregister, + lib.eal.intr.unregister); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_enable, + lib.eal.intr.enable); + RTE_TRACE_POINT_REGISTER(rte_eal_trace_intr_disable, + lib.eal.intr.disable); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_utils.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_utils.c new file mode 100644 index 000000000..64f58fb66 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_trace_utils.c @@ -0,0 +1,448 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#include <fnmatch.h> +#include <pwd.h> +#include <sys/stat.h> +#include <time.h> + +#include <rte_common.h> +#include <rte_errno.h> +#include <rte_string_fns.h> + +#include "eal_filesystem.h" +#include "eal_trace.h" + +const char * +trace_mode_to_string(enum rte_trace_mode mode) +{ + switch (mode) { + case RTE_TRACE_MODE_OVERWRITE: return "overwrite"; + case RTE_TRACE_MODE_DISCARD: return "discard"; + default: return "unknown"; + } +} + +const char * +trace_area_to_string(enum trace_area_e area) +{ + switch (area) { + case TRACE_AREA_HEAP: return "heap"; + case TRACE_AREA_HUGEPAGE: return "hugepage"; + default: return "unknown"; + } +} + +static bool +trace_entry_compare(const char *name) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace_point *tp; + int count = 0; + + STAILQ_FOREACH(tp, tp_list, next) { + if (strncmp(tp->name, name, TRACE_POINT_NAME_SIZE) == 0) + count++; + if (count > 1) { + trace_err("found duplicate entry %s", name); + rte_errno = EEXIST; + return true; + } + } + return false; +} + +bool +trace_has_duplicate_entry(void) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace_point *tp; + + /* Is duplicate trace name registered */ + STAILQ_FOREACH(tp, tp_list, next) + if (trace_entry_compare(tp->name)) + return true; + + return false; +} + +void +trace_uuid_generate(void) +{ + struct trace_point_head *tp_list = trace_list_head_get(); + struct trace *trace = trace_obj_get(); + struct trace_point *tp; + uint64_t sz_total = 0; + + /* Go over the registered trace points to get total size of events */ + STAILQ_FOREACH(tp, tp_list, next) { + const uint16_t sz = *tp->handle & __RTE_TRACE_FIELD_SIZE_MASK; + sz_total += sz; + } + + rte_uuid_t uuid = RTE_UUID_INIT(sz_total, trace->nb_trace_points, + 0x4370, 0x8f50, 0x222ddd514176ULL); + 
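+ /*
+  * The first two UUID fields carry sz_total and nb_trace_points, so
+  * builds with a different set of trace points get a distinct trace
+  * identity; the remaining fields are fixed constants.
+  */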
rte_uuid_copy(trace->uuid, uuid); +} + +static int +trace_session_name_generate(char *trace_dir) +{ + struct tm *tm_result; + time_t tm; + int rc; + + tm = time(NULL); + if ((int)tm == -1) + goto fail; + + tm_result = localtime(&tm); + if (tm_result == NULL) + goto fail; + + rc = rte_strscpy(trace_dir, eal_get_hugefile_prefix(), + TRACE_PREFIX_LEN); + if (rc == -E2BIG) + rc = TRACE_PREFIX_LEN; + trace_dir[rc++] = '-'; + + rc = strftime(trace_dir + rc, TRACE_DIR_STR_LEN - rc, + "%Y-%m-%d-%p-%I-%M-%S", tm_result); + if (rc == 0) + goto fail; + + return rc; +fail: + rte_errno = errno; + return -rte_errno; +} + +static int +trace_dir_update(const char *str) +{ + struct trace *trace = trace_obj_get(); + int rc, remaining; + + remaining = sizeof(trace->dir) - trace->dir_offset; + rc = rte_strscpy(&trace->dir[0] + trace->dir_offset, str, remaining); + if (rc < 0) + goto fail; + + trace->dir_offset += rc; +fail: + return rc; +} + +int +eal_trace_args_save(const char *val) +{ + struct trace *trace = trace_obj_get(); + struct trace_arg *arg = malloc(sizeof(*arg)); + + if (arg == NULL) { + trace_err("failed to allocate memory for %s", val); + return -ENOMEM; + } + + arg->val = strdup(val); + if (arg->val == NULL) { + trace_err("failed to allocate memory for %s", val); + free(arg); + return -ENOMEM; + } + + STAILQ_INSERT_TAIL(&trace->args, arg, next); + return 0; +} + +void +eal_trace_args_free(void) +{ + struct trace *trace = trace_obj_get(); + struct trace_arg *arg; + + while (!STAILQ_EMPTY(&trace->args)) { + arg = STAILQ_FIRST(&trace->args); + STAILQ_REMOVE_HEAD(&trace->args, next); + free(arg->val); + free(arg); + } +} + +int +trace_args_apply(const char *arg) +{ + if (rte_trace_regexp(arg, true) < 0) { + trace_err("cannot enable trace for %s", arg); + return -1; + } + + return 0; +} + +int +eal_trace_bufsz_args_save(char const *val) +{ + struct trace *trace = trace_obj_get(); + uint64_t bufsz; + + bufsz = rte_str_to_size(val); + if (bufsz == 0) { + trace_err("buffer size cannot be zero"); + return -EINVAL; + } + + trace->buff_len = bufsz; + return 0; +} + +void +trace_bufsz_args_apply(void) +{ + struct trace *trace = trace_obj_get(); + + if (trace->buff_len == 0) + trace->buff_len = 1024 * 1024; /* 1MB */ +} + +int +eal_trace_mode_args_save(const char *val) +{ + struct trace *trace = trace_obj_get(); + size_t len = strlen(val); + unsigned long tmp; + char *pattern; + + if (len == 0) { + trace_err("value is not provided with option"); + return -EINVAL; + } + + pattern = (char *)calloc(1, len + 2); + if (pattern == NULL) { + trace_err("fail to allocate memory"); + return -ENOMEM; + } + + sprintf(pattern, "%s*", val); + + if (fnmatch(pattern, "overwrite", 0) == 0) + tmp = RTE_TRACE_MODE_OVERWRITE; + else if (fnmatch(pattern, "discard", 0) == 0) + tmp = RTE_TRACE_MODE_DISCARD; + else { + free(pattern); + return -EINVAL; + } + + trace->mode = tmp; + free(pattern); + return 0; +} + +int +eal_trace_dir_args_save(char const *val) +{ + struct trace *trace = trace_obj_get(); + char *dir_path; + int rc; + + if (strlen(val) >= sizeof(trace->dir) - 1) { + trace_err("input string is too big"); + return -ENAMETOOLONG; + } + + if (asprintf(&dir_path, "%s/", val) == -1) { + trace_err("failed to copy directory: %s", strerror(errno)); + return -ENOMEM; + } + + rc = trace_dir_update(dir_path); + + free(dir_path); + return rc; +} + +int +trace_epoch_time_save(void) +{ + struct trace *trace = trace_obj_get(); + struct timespec epoch = { 0, 0 }; + uint64_t avg, start, end; + + start = rte_get_tsc_cycles(); + if 
(clock_gettime(CLOCK_REALTIME, &epoch) < 0) { + trace_err("failed to get the epoch time"); + return -1; + } + end = rte_get_tsc_cycles(); + avg = (start + end) >> 1; + + trace->epoch_sec = (uint64_t) epoch.tv_sec; + trace->epoch_nsec = (uint64_t) epoch.tv_nsec; + trace->uptime_ticks = avg; + + return 0; +} + +static int +trace_dir_default_path_get(char *dir_path) +{ + struct trace *trace = trace_obj_get(); + uint32_t size = sizeof(trace->dir); + struct passwd *pwd; + char *home_dir; + + /* First check for shell environment variable */ + home_dir = getenv("HOME"); + if (home_dir == NULL) { + /* Fallback to password file entry */ + pwd = getpwuid(getuid()); + if (pwd == NULL) + return -EINVAL; + + home_dir = pwd->pw_dir; + } + + /* Append dpdk-traces to directory */ + if (snprintf(dir_path, size, "%s/dpdk-traces/", home_dir) < 0) + return -ENAMETOOLONG; + + return 0; +} + +int +trace_mkdir(void) +{ + struct trace *trace = trace_obj_get(); + char session[TRACE_DIR_STR_LEN]; + char *dir_path; + int rc; + + if (!trace->dir_offset) { + dir_path = calloc(1, sizeof(trace->dir)); + if (dir_path == NULL) { + trace_err("failed to allocate memory"); + return -ENOMEM; + } + + rc = trace_dir_default_path_get(dir_path); + if (rc < 0) { + trace_err("failed to get default path"); + free(dir_path); + return rc; + } + + rc = trace_dir_update(dir_path); + free(dir_path); + if (rc < 0) + return rc; + } + + /* Create the path if it does not exist; no "mkdir -p" available here */ + rc = mkdir(trace->dir, 0700); + if (rc < 0 && errno != EEXIST) { + trace_err("mkdir %s failed [%s]", trace->dir, strerror(errno)); + rte_errno = errno; + return -rte_errno; + } + + rc = trace_session_name_generate(session); + if (rc < 0) + return rc; + rc = trace_dir_update(session); + if (rc < 0) + return rc; + + rc = mkdir(trace->dir, 0700); + if (rc < 0) { + trace_err("mkdir %s failed [%s]", trace->dir, strerror(errno)); + rte_errno = errno; + return -rte_errno; + } + + RTE_LOG(INFO, EAL, "Trace dir: %s\n", trace->dir); + return 0; +} + +static int +trace_meta_save(struct trace *trace) +{ + char file_name[PATH_MAX]; + FILE *f; + int rc; + + rc = snprintf(file_name, PATH_MAX, "%s/metadata", trace->dir); + if (rc < 0) + return rc; + + f = fopen(file_name, "w"); + if (f == NULL) + return -errno; + + rc = rte_trace_metadata_dump(f); + + if (fclose(f)) + rc = -errno; + + return rc; +} + +static inline int +trace_file_sz(struct __rte_trace_header *hdr) +{ + return sizeof(struct __rte_trace_stream_header) + hdr->offset; +} + +static int +trace_mem_save(struct trace *trace, struct __rte_trace_header *hdr, + uint32_t cnt) +{ + char file_name[PATH_MAX]; + FILE *f; + int rc; + + rc = snprintf(file_name, PATH_MAX, "%s/channel0_%d", trace->dir, cnt); + if (rc < 0) + return rc; + + f = fopen(file_name, "w"); + if (f == NULL) + return -errno; + + rc = fwrite(&hdr->stream_header, trace_file_sz(hdr), 1, f); + rc = (rc == 1) ?
0 : -EACCES; + + if (fclose(f)) + rc = -errno; + + return rc; +} + +int +rte_trace_save(void) +{ + struct trace *trace = trace_obj_get(); + struct __rte_trace_header *header; + uint32_t count; + int rc = 0; + + if (trace->nb_trace_mem_list == 0) + return rc; + + rc = trace_meta_save(trace); + if (rc) + return rc; + + rte_spinlock_lock(&trace->lock); + for (count = 0; count < trace->nb_trace_mem_list; count++) { + header = trace->lcore_meta[count].mem; + rc = trace_mem_save(trace, header, count); + if (rc) + break; + } + rte_spinlock_unlock(&trace->lock); + return rc; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c b/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c new file mode 100644 index 000000000..0a80bfbb3 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_common_uuid.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C) 1996, 1997 Theodore Ts'o. + */ + +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include <stdlib.h> +#include <ctype.h> + +#include <rte_uuid.h> + +/* UUID packed form */ +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +static void uuid_pack(const struct uuid *uu, rte_uuid_t ptr) +{ + uint32_t tmp; + uint8_t *out = ptr; + + tmp = uu->time_low; + out[3] = (uint8_t) tmp; + tmp >>= 8; + out[2] = (uint8_t) tmp; + tmp >>= 8; + out[1] = (uint8_t) tmp; + tmp >>= 8; + out[0] = (uint8_t) tmp; + + tmp = uu->time_mid; + out[5] = (uint8_t) tmp; + tmp >>= 8; + out[4] = (uint8_t) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (uint8_t) tmp; + tmp >>= 8; + out[6] = (uint8_t) tmp; + + tmp = uu->clock_seq; + out[9] = (uint8_t) tmp; + tmp >>= 8; + out[8] = (uint8_t) tmp; + + memcpy(out+10, uu->node, 6); +} + +static void uuid_unpack(const rte_uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + +bool rte_uuid_is_null(const rte_uuid_t uu) +{ + const uint8_t *cp = uu; + int i; + + for (i = 0; i < 16; i++) + if (*cp++) + return false; + return true; +} + +/* + * rte_uuid_compare() - compare two UUIDs. + */ +int rte_uuid_compare(const rte_uuid_t uu1, const rte_uuid_t uu2) +{ + struct uuid uuid1, uuid2; + + uuid_unpack(uu1, &uuid1); + uuid_unpack(uu2, &uuid2); + +#define UUCMP(u1, u2) \ + do { if (u1 != u2) return (u1 < u2) ? 
-1 : 1; } while (0) + + UUCMP(uuid1.time_low, uuid2.time_low); + UUCMP(uuid1.time_mid, uuid2.time_mid); + UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); + UUCMP(uuid1.clock_seq, uuid2.clock_seq); +#undef UUCMP + + return memcmp(uuid1.node, uuid2.node, 6); +} + +int rte_uuid_parse(const char *in, rte_uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + + for (i = 0, cp = in; i <= 36; i++, cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i == 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + + for (i = 0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} + +void rte_uuid_unparse(const rte_uuid_t uu, char *out, size_t len) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + + snprintf(out, len, + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h b/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h new file mode 100644 index 000000000..5d21f07c2 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_filesystem.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +/** + * @file + * Stores functions and path defines for files and directories + * on the filesystem for Linux, that are used by the Linux EAL. + */ + +#ifndef EAL_FILESYSTEM_H +#define EAL_FILESYSTEM_H + +/** Path of rte config file. */ + +#include <stdint.h> +#include <limits.h> +#include <unistd.h> +#include <stdlib.h> + +#include <rte_string_fns.h> +#include "eal_internal_cfg.h" + +/* sets up platform-specific runtime data dir */ +int +eal_create_runtime_dir(void); + +int +eal_clean_runtime_dir(void); + +/** Function to return hugefile prefix that's currently set up */ +const char * +eal_get_hugefile_prefix(void); + +#define RUNTIME_CONFIG_FNAME "config" +static inline const char * +eal_runtime_config_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + RUNTIME_CONFIG_FNAME); + return buffer; +} + +/** Path of primary/secondary communication unix socket file. */ +#define MP_SOCKET_FNAME "mp_socket" +static inline const char * +eal_mp_socket_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + MP_SOCKET_FNAME); + return buffer; +} + +#define FBARRAY_NAME_FMT "%s/fbarray_%s" +static inline const char * +eal_get_fbarray_path(char *buffer, size_t buflen, const char *name) { + snprintf(buffer, buflen, FBARRAY_NAME_FMT, rte_eal_get_runtime_dir(), + name); + return buffer; +} + +/** Path of hugepage info file. 
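+ * Like the helpers above, the accessor below returns a pointer to a
+ * static buffer: the result must not be freed, and concurrent callers
+ * would race. Minimal usage sketch (error handling elided):
+ *   FILE *f = fopen(eal_hugepage_info_path(), "rb");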
*/ +#define HUGEPAGE_INFO_FNAME "hugepage_info" +static inline const char * +eal_hugepage_info_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + HUGEPAGE_INFO_FNAME); + return buffer; +} + +/** Path of hugepage data file. */ +#define HUGEPAGE_DATA_FNAME "hugepage_data" +static inline const char * +eal_hugepage_data_path(void) +{ + static char buffer[PATH_MAX]; /* static so auto-zeroed */ + + snprintf(buffer, sizeof(buffer), "%s/%s", rte_eal_get_runtime_dir(), + HUGEPAGE_DATA_FNAME); + return buffer; +} + +/** String format for hugepage map files. */ +#define HUGEFILE_FMT "%s/%smap_%d" +static inline const char * +eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id) +{ + snprintf(buffer, buflen, HUGEFILE_FMT, hugedir, + eal_get_hugefile_prefix(), f_id); + return buffer; +} + +/** define the default filename prefix for the %s values above */ +#define HUGEFILE_PREFIX_DEFAULT "rte" + +/** Function to read a single numeric value from a file on the filesystem. + * Used to read information from files on /sys */ +int eal_parse_sysfs_value(const char *filename, unsigned long *val); + +#endif /* EAL_FILESYSTEM_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h b/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h new file mode 100644 index 000000000..1b560d337 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_hugepages.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_HUGEPAGES_H +#define EAL_HUGEPAGES_H + +#include <stddef.h> +#include <stdint.h> +#include <limits.h> + +#define MAX_HUGEPAGE_PATH PATH_MAX + +/** + * Structure used to store information about hugepages that we mapped + * through the files in hugetlbfs. + */ +struct hugepage_file { + void *orig_va; /**< virtual addr of first mmap() */ + void *final_va; /**< virtual addr of 2nd mmap() */ + uint64_t physaddr; /**< physical addr */ + size_t size; /**< the page size */ + int socket_id; /**< NUMA socket ID */ + int file_id; /**< the '%d' in HUGEFILE_FMT */ + char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */ +}; + +/** + * Read the information on what hugepages are available for the EAL to use, + * clearing out any unused ones. + */ +int eal_hugepage_info_init(void); + +/** + * Read whatever information primary process has shared about hugepages into + * secondary process. 
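+ * (Presumably via the hugepage info file in the runtime directory;
+ * see eal_hugepage_info_path() in eal_filesystem.h above.)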
+ */ +int eal_hugepage_info_read(void); + +#endif /* EAL_HUGEPAGES_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h b/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h new file mode 100644 index 000000000..c650bc081 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_internal_cfg.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +/** + * @file + * Holds the structures for the eal internal configuration + */ + +#ifndef EAL_INTERNAL_CFG_H +#define EAL_INTERNAL_CFG_H + +#include <rte_eal.h> +#include <rte_pci_dev_feature_defs.h> + +#include "eal_thread.h" + +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) +#define MAX_HUGEPAGE_SIZES 4 /**< support up to 4 page sizes */ +#else +#define MAX_HUGEPAGE_SIZES 3 /**< support up to 3 page sizes */ +#endif + +/* + * internal configuration structure for the number, size and + * mount points of hugepages + */ +struct hugepage_info { + uint64_t hugepage_sz; /**< size of a huge page */ + char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */ + uint32_t num_pages[RTE_MAX_NUMA_NODES]; + /**< number of hugepages of that size on each socket */ + int lock_descriptor; /**< file descriptor for hugepage dir */ +}; + +/** + * internal configuration + */ +struct internal_config { + volatile size_t memory; /**< amount of asked memory */ + volatile unsigned force_nchannel; /**< force number of channels */ + volatile unsigned force_nrank; /**< force number of ranks */ + volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */ + unsigned hugepage_unlink; /**< true to unlink backing files */ + volatile unsigned no_pci; /**< true to disable PCI */ + volatile unsigned no_hpet; /**< true to disable HPET */ + volatile unsigned vmware_tsc_map; /**< true to use VMware TSC mapping + * instead of native TSC */ + volatile unsigned no_shconf; /**< true if there is no shared config */ + volatile unsigned in_memory; + /**< true if DPDK should operate entirely in-memory and not create any + * shared files or runtime data. + */ + volatile unsigned create_uio_dev; /**< true to create /dev/uioX devices */ + volatile enum rte_proc_type_t process_type; /**< multi-process proc type */ + /** true to try allocating memory on specific sockets */ + volatile unsigned force_sockets; + volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */ + volatile unsigned force_socket_limits; + volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */ + uintptr_t base_virtaddr; /**< base address to try and reserve memory from */ + volatile unsigned legacy_mem; + /**< true to enable legacy memory behavior (no dynamic allocation, + * IOVA-contiguous segments). + */ + volatile unsigned match_allocations; + /**< true to free hugepages exactly as allocated */ + volatile unsigned single_file_segments; + /**< true if storing all pages within single files (per-page-size, + * per-node) non-legacy mode only. 
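+ * That is, one backing file per page size and per NUMA node, rather
+ * than one file per page.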
+ */ + volatile int syslog_facility; /**< facility passed to openlog() */ + /** default interrupt mode for VFIO */ + volatile enum rte_intr_mode vfio_intr_mode; + char *hugefile_prefix; /**< the base filename of hugetlbfs files */ + char *hugepage_dir; /**< specific hugetlbfs directory to use */ + char *user_mbuf_pool_ops_name; + /**< user defined mbuf pool ops name */ + unsigned num_hugepage_sizes; /**< how many sizes on this system */ + struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES]; + enum rte_iova_mode iova_mode ; /**< Set IOVA mode on this system */ + rte_cpuset_t ctrl_cpuset; /**< cpuset for ctrl threads */ + volatile unsigned int init_complete; + /**< indicates whether EAL has completed initialization */ + unsigned int no_telemetry; /**< true to disable Telemetry */ +}; +extern struct internal_config internal_config; /**< Global EAL configuration. */ + +void eal_reset_internal_config(struct internal_config *internal_cfg); + +#endif /* EAL_INTERNAL_CFG_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h b/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h new file mode 100644 index 000000000..e953cd84e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_memalloc.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#ifndef EAL_MEMALLOC_H +#define EAL_MEMALLOC_H + +#include <stdbool.h> + +#include <rte_memory.h> + +/* + * Allocate segment of specified page size. + */ +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket); + +/* + * Allocate `n_segs` segments. + * + * Note: `ms` can be NULL. + * + * Note: it is possible to request best-effort allocation by setting `exact` to + * `false`, in which case allocator will return however many pages it managed to + * allocate successfully. + */ +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact); + +/* + * Deallocate segment + */ +int +eal_memalloc_free_seg(struct rte_memseg *ms); + +/* + * Deallocate `n_segs` segments. Returns 0 on successful deallocation of all + * segments, returns -1 on error. Any segments that could have been deallocated, + * will be deallocated even in case of error. + */ +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs); + +/* + * Check if memory pointed to by `start` and of `length` that resides in + * memseg list `msl` is IOVA-contiguous. 
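+ * Minimal usage sketch (names as declared below, error handling
+ * elided):
+ *   if (eal_memalloc_is_contig(msl, start, len))
+ *       ; // region can be treated as one IOVA-contiguous block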
+ */ +bool +eal_memalloc_is_contig(const struct rte_memseg_list *msl, void *start, + size_t len); + +/* synchronize local memory map to primary process */ +int +eal_memalloc_sync_with_primary(void); + +int +eal_memalloc_mem_event_callback_register(const char *name, + rte_mem_event_callback_t clb, void *arg); + +int +eal_memalloc_mem_event_callback_unregister(const char *name, void *arg); + +void +eal_memalloc_mem_event_notify(enum rte_mem_event event, const void *start, + size_t len); + +int +eal_memalloc_mem_alloc_validator_register(const char *name, + rte_mem_alloc_validator_t clb, int socket_id, size_t limit); + +int +eal_memalloc_mem_alloc_validator_unregister(const char *name, int socket_id); + +int +eal_memalloc_mem_alloc_validate(int socket_id, size_t new_len); + +/* returns fd or -errno */ +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd); + +/* returns 0 or -errno */ +int +eal_memalloc_set_seg_list_fd(int list_idx, int fd); + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset); + +int +eal_memalloc_init(void); + +#endif /* EAL_MEMALLOC_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_memcfg.h b/src/spdk/dpdk/lib/librte_eal/common/eal_memcfg.h new file mode 100644 index 000000000..583fcb595 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_memcfg.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef EAL_MEMCFG_H +#define EAL_MEMCFG_H + +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_pause.h> +#include <rte_spinlock.h> +#include <rte_rwlock.h> +#include <rte_tailq.h> + +#include "malloc_heap.h" + +/** + * Memory configuration shared across multiple processes. + */ +struct rte_mem_config { + volatile uint32_t magic; /**< Magic number - sanity check. */ + uint32_t version; + /**< Prevent secondary processes using different DPDK versions. */ + + /* memory topology */ + uint32_t nchannel; /**< Number of channels (0 if unknown). */ + uint32_t nrank; /**< Number of ranks (0 if unknown). */ + + /** + * current lock nest order + * - qlock->mlock (ring/hash/lpm) + * - mplock->qlock->mlock (mempool) + * Notice: + * *ALWAYS* obtain qlock first if having to obtain both qlock and mlock + */ + rte_rwlock_t mlock; /**< used by memzones for thread safety. */ + rte_rwlock_t qlock; /**< used by tailqs for thread safety. */ + rte_rwlock_t mplock; /**< used by mempool library for thread safety. */ + rte_spinlock_t tlock; /**< used by timer library for thread safety. */ + + rte_rwlock_t memory_hotplug_lock; + /**< Indicates whether memory hotplug request is in progress. */ + + /* memory segments and zones */ + struct rte_fbarray memzones; /**< Memzone descriptors. */ + + struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS]; + /**< List of dynamic arrays holding memsegs */ + + struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; + /**< Tailqs for objects */ + + struct malloc_heap malloc_heaps[RTE_MAX_HEAPS]; + /**< DPDK malloc heaps */ + + int next_socket_id; /**< Next socket ID for external malloc heap */ + + /* rte_mem_config has to be mapped at the exact same address in all + * processes, so we need to store it. + */ + uint64_t mem_cfg_addr; /**< Address of this structure in memory. 
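+ * Secondary processes read this field so they can map the shared
+ * configuration at the same virtual address.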
*/ + + /* Primary and secondary processes cannot run with different legacy or + * single file segments options, so to avoid having to specify these + * options to all processes, store them in shared config and update the + * internal config at init time. + */ + uint32_t legacy_mem; /**< stored legacy mem parameter. */ + uint32_t single_file_segments; + /**< stored single file segments parameter. */ + + uint64_t tsc_hz; + /**< TSC rate */ + + uint8_t dma_maskbits; /**< Keeps the more restricted dma mask. */ +}; + +/* update internal config from shared mem config */ +void +eal_mcfg_update_internal(void); + +/* update shared mem config from internal config */ +void +eal_mcfg_update_from_internal(void); + +/* wait until primary process initialization is complete */ +void +eal_mcfg_wait_complete(void); + +/* check if DPDK version of current process matches one stored in the config */ +int +eal_mcfg_check_version(void); + +/* set mem config as complete */ +void +eal_mcfg_complete(void); + +#endif /* EAL_MEMCFG_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_options.h b/src/spdk/dpdk/lib/librte_eal/common/eal_options.h new file mode 100644 index 000000000..18e6da9ab --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_options.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2014 6WIND S.A. + */ + +#ifndef EAL_OPTIONS_H +#define EAL_OPTIONS_H + +#include "getopt.h" + +struct rte_tel_data; + +enum { + /* long options mapped to a short option */ +#define OPT_HELP "help" + OPT_HELP_NUM = 'h', +#define OPT_PCI_BLACKLIST "pci-blacklist" + OPT_PCI_BLACKLIST_NUM = 'b', +#define OPT_PCI_WHITELIST "pci-whitelist" + OPT_PCI_WHITELIST_NUM = 'w', + + /* first long only option value must be >= 256, so that we won't + * conflict with short options */ + OPT_LONG_MIN_NUM = 256, +#define OPT_BASE_VIRTADDR "base-virtaddr" + OPT_BASE_VIRTADDR_NUM, +#define OPT_CREATE_UIO_DEV "create-uio-dev" + OPT_CREATE_UIO_DEV_NUM, +#define OPT_FILE_PREFIX "file-prefix" + OPT_FILE_PREFIX_NUM, +#define OPT_HUGE_DIR "huge-dir" + OPT_HUGE_DIR_NUM, +#define OPT_HUGE_UNLINK "huge-unlink" + OPT_HUGE_UNLINK_NUM, +#define OPT_LCORES "lcores" + OPT_LCORES_NUM, +#define OPT_LOG_LEVEL "log-level" + OPT_LOG_LEVEL_NUM, +#define OPT_TRACE "trace" + OPT_TRACE_NUM, +#define OPT_TRACE_DIR "trace-dir" + OPT_TRACE_DIR_NUM, +#define OPT_TRACE_BUF_SIZE "trace-bufsz" + OPT_TRACE_BUF_SIZE_NUM, +#define OPT_TRACE_MODE "trace-mode" + OPT_TRACE_MODE_NUM, +#define OPT_MASTER_LCORE "master-lcore" + OPT_MASTER_LCORE_NUM, +#define OPT_MBUF_POOL_OPS_NAME "mbuf-pool-ops-name" + OPT_MBUF_POOL_OPS_NAME_NUM, +#define OPT_PROC_TYPE "proc-type" + OPT_PROC_TYPE_NUM, +#define OPT_NO_HPET "no-hpet" + OPT_NO_HPET_NUM, +#define OPT_NO_HUGE "no-huge" + OPT_NO_HUGE_NUM, +#define OPT_NO_PCI "no-pci" + OPT_NO_PCI_NUM, +#define OPT_NO_SHCONF "no-shconf" + OPT_NO_SHCONF_NUM, +#define OPT_IN_MEMORY "in-memory" + OPT_IN_MEMORY_NUM, +#define OPT_SOCKET_MEM "socket-mem" + OPT_SOCKET_MEM_NUM, +#define OPT_SOCKET_LIMIT "socket-limit" + OPT_SOCKET_LIMIT_NUM, +#define OPT_SYSLOG "syslog" + OPT_SYSLOG_NUM, +#define OPT_VDEV "vdev" + OPT_VDEV_NUM, +#define OPT_VFIO_INTR "vfio-intr" + OPT_VFIO_INTR_NUM, +#define OPT_VMWARE_TSC_MAP "vmware-tsc-map" + OPT_VMWARE_TSC_MAP_NUM, +#define OPT_LEGACY_MEM "legacy-mem" + OPT_LEGACY_MEM_NUM, +#define OPT_SINGLE_FILE_SEGMENTS "single-file-segments" + OPT_SINGLE_FILE_SEGMENTS_NUM, +#define OPT_IOVA_MODE "iova-mode" + OPT_IOVA_MODE_NUM, +#define OPT_MATCH_ALLOCATIONS "match-allocations" + 
OPT_MATCH_ALLOCATIONS_NUM, +#define OPT_TELEMETRY "telemetry" + OPT_TELEMETRY_NUM, +#define OPT_NO_TELEMETRY "no-telemetry" + OPT_NO_TELEMETRY_NUM, + OPT_LONG_MAX_NUM +}; + +extern const char eal_short_options[]; +extern const struct option eal_long_options[]; + +int eal_parse_common_option(int opt, const char *argv, + struct internal_config *conf); +int eal_option_device_parse(void); +int eal_adjust_config(struct internal_config *internal_cfg); +int eal_cleanup_config(struct internal_config *internal_cfg); +int eal_check_common_options(struct internal_config *internal_cfg); +void eal_common_usage(void); +enum rte_proc_type_t eal_proc_type_detect(void); +int eal_plugins_init(void); +int eal_save_args(int argc, char **argv); +int handle_eal_info_request(const char *cmd, const char *params __rte_unused, + struct rte_tel_data *d); + +#endif /* EAL_OPTIONS_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_private.h b/src/spdk/dpdk/lib/librte_eal/common/eal_private.h new file mode 100644 index 000000000..869ce183a --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_private.h @@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#ifndef _EAL_PRIVATE_H_ +#define _EAL_PRIVATE_H_ + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> + +#include <rte_dev.h> +#include <rte_lcore.h> + +/** + * Structure storing internal configuration (per-lcore) + */ +struct lcore_config { + pthread_t thread_id; /**< pthread identifier */ + int pipe_master2slave[2]; /**< communication pipe with master */ + int pipe_slave2master[2]; /**< communication pipe with master */ + + lcore_function_t * volatile f; /**< function to call */ + void * volatile arg; /**< argument of function */ + volatile int ret; /**< return value of function */ + + volatile enum rte_lcore_state_t state; /**< lcore state */ + unsigned int socket_id; /**< physical socket id for this lcore */ + unsigned int core_id; /**< core number on socket for this lcore */ + int core_index; /**< relative index, starting from 0 */ + uint8_t core_role; /**< role of core eg: OFF, RTE, SERVICE */ + + rte_cpuset_t cpuset; /**< cpu set which the lcore affinity to */ +}; + +extern struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/** + * The global RTE configuration structure. + */ +struct rte_config { + uint32_t master_lcore; /**< Id of the master lcore */ + uint32_t lcore_count; /**< Number of available logical cores. */ + uint32_t numa_node_count; /**< Number of detected NUMA nodes. */ + uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected NUMA nodes. */ + uint32_t service_lcore_count;/**< Number of available service cores. */ + enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */ + + /** Primary or secondary configuration */ + enum rte_proc_type_t process_type; + + /** PA or VA mapping mode */ + enum rte_iova_mode iova_mode; + + /** + * Pointer to memory configuration, which may be shared across multiple + * DPDK instances + */ + struct rte_mem_config *mem_config; +} __rte_packed; + +/** + * Get the global configuration structure. + * + * @return + * A pointer to the global configuration structure. + */ +struct rte_config *rte_eal_get_configuration(void); + +/** + * Initialize the memzone subsystem (private to eal). + * + * @return + * - 0 on success + * - Negative on error + */ +int rte_eal_memzone_init(void); + +/** + * Common log initialization function (private to eal). 
Determines + * where log data is written when no call to rte_openlog_stream is + * in effect. + * + * @param default_log + * The default log stream to be used. + * @return + * - 0 on success + * - Negative on error + */ +void eal_log_set_default(FILE *default_log); + +/** + * Fill configuration with number of physical and logical processors + * + * This function is private to EAL. + * + * Parse /proc/cpuinfo to get the number of physical and logical + * processors on the machine. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_cpu_init(void); + +/** + * Create memseg lists + * + * This function is private to EAL. + * + * Preallocate virtual memory. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memseg_init(void); + +/** + * Map memory + * + * This function is private to EAL. + * + * Fill configuration structure with these infos, and return 0 on success. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_memory_init(void); + +/** + * Configure timers + * + * This function is private to EAL. + * + * Mmap memory areas used by HPET (high precision event timer) that will + * provide our time reference, and configure the TSC frequency also for it + * to be used as a reference. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_timer_init(void); + +/** + * Init the default log stream + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_log_init(const char *id, int facility); + +/** + * Save the log regexp for later + */ +int rte_log_save_regexp(const char *type, int priority); +int rte_log_save_pattern(const char *pattern, int priority); + +/** + * Init tail queues for non-EAL library structures. This is to allow + * the rings, mempools, etc. lists to be shared among multiple processes + * + * This function is private to EAL + * + * @return + * 0 on success, negative on error + */ +int rte_eal_tailqs_init(void); + +/** + * Init interrupt handling. + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_intr_init(void); + +/** + * Init alarm mechanism. This is to allow a callback be called after + * specific time. + * + * This function is private to EAL. + * + * @return + * 0 on success, negative on error + */ +int rte_eal_alarm_init(void); + +/** + * Function is to check if the kernel module(like, vfio, vfio_iommu_type1, + * etc.) loaded. + * + * @param module_name + * The module's name which need to be checked + * + * @return + * -1 means some error happens(NULL pointer or open failure) + * 0 means the module not loaded + * 1 means the module loaded + */ +int rte_eal_check_module(const char *module_name); + +/** + * Get virtual area of specified size from the OS. + * + * This function is private to the EAL. + * + * @param requested_addr + * Address where to request address space. + * @param size + * Size of requested area. + * @param page_sz + * Page size on which to align requested virtual area. + * @param flags + * EAL_VIRTUAL_AREA_* flags. + * @param mmap_flags + * Extra flags passed directly to mmap(). + * + * @return + * Virtual area address if successful. + * NULL if unsuccessful. + */ + +#define EAL_VIRTUAL_AREA_ADDR_IS_HINT (1 << 0) +/**< don't fail if cannot get exact requested address. */ +#define EAL_VIRTUAL_AREA_ALLOW_SHRINK (1 << 1) +/**< try getting smaller sized (decrement by page size) virtual areas if cannot + * get area of requested size. 
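+ * An illustrative call using this flag (a sketch, not part of the
+ * original header; sizes are hypothetical):
+ *
+ *   size_t sz = 4 * page_sz;
+ *   void *va = eal_get_virtual_area(NULL, &sz, page_sz,
+ *           EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
+ *   // on success, sz holds the size actually reserved, which may
+ *   // be smaller than originally requested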
+ */ +#define EAL_VIRTUAL_AREA_UNMAP (1 << 2) +/**< immediately unmap reserved virtual area. */ +void * +eal_get_virtual_area(void *requested_addr, size_t *size, + size_t page_sz, int flags, int mmap_flags); + +/** + * Get cpu core_id. + * + * This function is private to the EAL. + */ +unsigned eal_cpu_core_id(unsigned lcore_id); + +/** + * Check if cpu is present. + * + * This function is private to the EAL. + */ +int eal_cpu_detected(unsigned lcore_id); + +/** + * Set TSC frequency from precise value or estimation + * + * This function is private to the EAL. + */ +void set_tsc_freq(void); + +/** + * Get precise TSC frequency from system + * + * This function is private to the EAL. + */ +uint64_t get_tsc_freq(void); + +/** + * Get TSC frequency if the architecture supports. + * + * This function is private to the EAL. + * + * @return + * The number of TSC cycles in one second. + * Returns zero if the architecture support is not available. + */ +uint64_t get_tsc_freq_arch(void); + +/** + * Prepare physical memory mapping + * i.e. hugepages on Linux and + * contigmem on BSD. + * + * This function is private to the EAL. + */ +int rte_eal_hugepage_init(void); + +/** + * Creates memory mapping in secondary process + * i.e. hugepages on Linux and + * contigmem on BSD. + * + * This function is private to the EAL. + */ +int rte_eal_hugepage_attach(void); + +/** + * Find a bus capable of identifying a device. + * + * @param str + * A device identifier (PCI address, virtual PMD name, ...). + * + * @return + * A valid bus handle if found. + * NULL if no bus is able to parse this device. + */ +struct rte_bus *rte_bus_find_by_device_name(const char *str); + +/** + * Create the unix channel for primary/secondary communication. + * + * @return + * 0 on success; + * (<0) on failure. + */ +int rte_mp_channel_init(void); + +/** + * Primary/secondary communication cleanup. + */ +void rte_mp_channel_cleanup(void); + +/** + * @internal + * Parse a device string and store its information in an + * rte_devargs structure. + * + * A device description is split by layers of abstraction of the device: + * bus, class and driver. Each layer will offer a set of properties that + * can be applied either to configure or recognize a device. + * + * This function will parse those properties and prepare the rte_devargs + * to be given to each layers for processing. + * + * Note: if the "data" field of the devargs points to devstr, + * then no dynamic allocation is performed and the rte_devargs + * can be safely discarded. + * + * Otherwise ``data`` will hold a workable copy of devstr, that will be + * used by layers descriptors within rte_devargs. In this case, + * any rte_devargs should be cleaned-up before being freed. + * + * @param da + * rte_devargs structure to fill. + * + * @param devstr + * Device string. + * + * @return + * 0 on success. + * Negative errno values on error (rte_errno is set). + */ +int +rte_devargs_layers_parse(struct rte_devargs *devargs, + const char *devstr); + +/* + * probe a device at local process. + * + * @param devargs + * Device arguments including bus, class and driver properties. + * @param new_dev + * new device be probed as output. + * @return + * 0 on success, negative on error. + */ +int local_dev_probe(const char *devargs, struct rte_device **new_dev); + +/** + * Hotplug remove a given device from a specific bus at local process. + * + * @param dev + * Data structure of the device to remove. + * @return + * 0 on success, negative on error. 
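+ *
+ * Hedged usage sketch pairing probe and remove (the device string
+ * below is made up for illustration):
+ *
+ *   struct rte_device *dev;
+ *   if (local_dev_probe("net_null0", &dev) == 0)
+ *           local_dev_remove(dev);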
+ */ +int local_dev_remove(struct rte_device *dev); + +/** + * Iterate over all buses to find the corresponding bus to handle the sigbus + * error. + * @param failure_addr + * Pointer of the fault address of the sigbus error. + * + * @return + * 0 success to handle the sigbus. + * -1 failed to handle the sigbus + * 1 no bus can handler the sigbus + */ +int rte_bus_sigbus_handler(const void *failure_addr); + +/** + * @internal + * Register the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_register(void); + +/** + * @internal + * Unregister the sigbus handler. + * + * @return + * - On success, zero. + * - On failure, a negative value. + */ +int +dev_sigbus_handler_unregister(void); + +/** + * Get OS-specific EAL mapping base address. + */ +uint64_t +eal_get_baseaddr(void); + +void * +eal_malloc_no_trace(const char *type, size_t size, unsigned int align); + +void eal_free_no_trace(void *addr); + +#endif /* _EAL_PRIVATE_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h b/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h new file mode 100644 index 000000000..b40ed249e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_thread.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_THREAD_H +#define EAL_THREAD_H + +#include <rte_lcore.h> + +/** + * basic loop of thread, called for each thread by eal_init(). + * + * @param arg + * opaque pointer + */ +__rte_noreturn void *eal_thread_loop(void *arg); + +/** + * Init per-lcore info for master thread + * + * @param lcore_id + * identifier of master lcore + */ +void eal_thread_init_master(unsigned lcore_id); + +/** + * Get the NUMA socket id from cpu id. + * This function is private to EAL. + * + * @param cpu_id + * The logical process id. + * @return + * socket_id or SOCKET_ID_ANY + */ +unsigned eal_cpu_socket_id(unsigned cpu_id); + +/** + * Default buffer size to use with eal_thread_dump_affinity() + */ +#define RTE_CPU_AFFINITY_STR_LEN 256 + +/** + * Dump the current pthread cpuset. + * This function is private to EAL. + * + * Note: + * If the dump size is greater than the size of given buffer, + * the string will be truncated and with '\0' at the end. + * + * @param str + * The string buffer the cpuset will dump to. + * @param size + * The string buffer size. + * @return + * 0 for success, -1 if truncation happens. + */ +int +eal_thread_dump_affinity(char *str, unsigned size); + +#endif /* EAL_THREAD_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/eal_trace.h b/src/spdk/dpdk/lib/librte_eal/common/eal_trace.h new file mode 100644 index 000000000..8f6061615 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/eal_trace.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2020 Marvell International Ltd. + */ + +#ifndef __EAL_TRACE_H +#define __EAL_TRACE_H + +#include <rte_cycles.h> +#include <rte_log.h> +#include <rte_malloc.h> +#include <rte_spinlock.h> +#include <rte_trace.h> +#include <rte_trace_point.h> +#include <rte_uuid.h> + +#include "eal_private.h" +#include "eal_thread.h" + +#define trace_err(fmt, args...) \ + RTE_LOG(ERR, EAL, "%s():%u " fmt "\n", __func__, __LINE__, ## args) + +#define trace_crit(fmt, args...) 
\ + RTE_LOG(CRIT, EAL, "%s():%u " fmt "\n", __func__, __LINE__, ## args) + +#define TRACE_PREFIX_LEN 12 +#define TRACE_DIR_STR_LEN (sizeof("YYYY-mm-dd-AM-HH-MM-SS") + TRACE_PREFIX_LEN) +#define TRACE_CTF_FIELD_SIZE 384 +#define TRACE_POINT_NAME_SIZE 64 +#define TRACE_CTF_MAGIC 0xC1FC1FC1 +#define TRACE_MAX_ARGS 32 + +struct trace_point { + STAILQ_ENTRY(trace_point) next; + rte_trace_point_t *handle; + char name[TRACE_POINT_NAME_SIZE]; + char ctf_field[TRACE_CTF_FIELD_SIZE]; +}; + +enum trace_area_e { + TRACE_AREA_HEAP, + TRACE_AREA_HUGEPAGE, +}; + +struct thread_mem_meta { + void *mem; + enum trace_area_e area; +}; + +struct trace_arg { + STAILQ_ENTRY(trace_arg) next; + char *val; +}; + +struct trace { + char dir[PATH_MAX]; + int dir_offset; + int register_errno; + bool status; + enum rte_trace_mode mode; + rte_uuid_t uuid; + uint32_t buff_len; + STAILQ_HEAD(, trace_arg) args; + uint32_t nb_trace_points; + uint32_t nb_trace_mem_list; + struct thread_mem_meta *lcore_meta; + uint64_t epoch_sec; + uint64_t epoch_nsec; + uint64_t uptime_ticks; + char *ctf_meta; + uint32_t ctf_meta_offset_freq; + uint32_t ctf_meta_offset_freq_off_s; + uint32_t ctf_meta_offset_freq_off; + uint16_t ctf_fixup_done; + rte_spinlock_t lock; +}; + +/* Helper functions */ +static inline uint16_t +trace_id_get(rte_trace_point_t *trace) +{ + return (*trace & __RTE_TRACE_FIELD_ID_MASK) >> + __RTE_TRACE_FIELD_ID_SHIFT; +} + +static inline size_t +trace_mem_sz(uint32_t len) +{ + return len + sizeof(struct __rte_trace_header); +} + +/* Trace object functions */ +struct trace *trace_obj_get(void); + +/* Trace point list functions */ +STAILQ_HEAD(trace_point_head, trace_point); +struct trace_point_head *trace_list_head_get(void); + +/* Util functions */ +const char *trace_mode_to_string(enum rte_trace_mode mode); +const char *trace_area_to_string(enum trace_area_e area); +int trace_args_apply(const char *arg); +void trace_bufsz_args_apply(void); +bool trace_has_duplicate_entry(void); +void trace_uuid_generate(void); +int trace_metadata_create(void); +void trace_metadata_destroy(void); +int trace_mkdir(void); +int trace_epoch_time_save(void); +void trace_mem_per_thread_free(void); + +/* EAL interface */ +int eal_trace_init(void); +void eal_trace_fini(void); +int eal_trace_args_save(const char *val); +void eal_trace_args_free(void); +int eal_trace_dir_args_save(const char *val); +int eal_trace_mode_args_save(const char *val); +int eal_trace_bufsz_args_save(const char *val); + +#endif /* __EAL_TRACE_H */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.c b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.c new file mode 100644 index 000000000..ee791903b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.c @@ -0,0 +1,465 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ +#include <string.h> + +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_alarm.h> +#include <rte_string_fns.h> +#include <rte_devargs.h> + +#include "hotplug_mp.h" +#include "eal_private.h" + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +struct mp_reply_bundle { + struct rte_mp_msg msg; + void *peer; +}; + +static int cmp_dev_name(const struct rte_device *dev, const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +/** + * Secondary to primary request. + * start from function eal_dev_hotplug_request_to_primary. + * + * device attach on secondary: + * a) secondary send sync request to the primary. 
+ * b) primary receive the request and attach the new device if + * failed goto i). + * c) primary forward attach sync request to all secondary. + * d) secondary receive the request and attach the device and send a reply. + * e) primary check the reply if all success goes to j). + * f) primary send attach rollback sync request to all secondary. + * g) secondary receive the request and detach the device and send a reply. + * h) primary receive the reply and detach device as rollback action. + * i) send attach fail to secondary as a reply of step a), goto k). + * j) send attach success to secondary as a reply of step a). + * k) secondary receive reply and return. + * + * device detach on secondary: + * a) secondary send sync request to the primary. + * b) primary send detach sync request to all secondary. + * c) secondary detach the device and send a reply. + * d) primary check the reply if all success goes to g). + * e) primary send detach rollback sync request to all secondary. + * f) secondary receive the request and attach back device. goto h). + * g) primary detach the device if success goto i), else goto e). + * h) primary send detach fail to secondary as a reply of step a), goto j). + * i) primary send detach success to secondary as a reply of step a). + * j) secondary receive reply and return. + */ + +static int +send_response_to_secondary(const struct eal_dev_mp_req *req, + int result, + const void *peer) +{ + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + int ret; + + memset(&mp_resp, 0, sizeof(mp_resp)); + mp_resp.len_param = sizeof(*resp); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + memcpy(resp, req, sizeof(*req)); + resp->result = result; + + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + return ret; +} + +static void +__handle_secondary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + const struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req tmp_req; + struct rte_devargs da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + tmp_req = *req; + + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + ret = local_dev_probe(req->devargs, &dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug add device on primary\n"); + if (ret != -EEXIST) + goto finish; + } + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + if (tmp_req.result != 0) { + ret = tmp_req.result; + RTE_LOG(ERR, EAL, "Failed to hotplug add device on secondary\n"); + if (ret != -EEXIST) + goto rollback; + } + } else if (req->t == EAL_DEV_REQ_TYPE_DETACH) { + ret = rte_devargs_parse(&da, req->devargs); + if (ret != 0) + goto finish; + free(da.args); /* we don't need those */ + da.args = NULL; + + ret = eal_dev_hotplug_request_to_secondary(&tmp_req); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to send hotplug request to secondary\n"); + ret = -ENOMSG; + goto rollback; + } + + bus = rte_bus_find_by_name(da.bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da.bus->name); + ret = -ENOENT; + goto finish; + } + + dev = bus->find_device(NULL, cmp_dev_name, da.name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da.name); + ret = -ENOENT; + goto finish; + } 
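+ /* Secondaries were asked to detach first (steps b/c in the protocol
+  * comment above); only when they all succeeded, or reported -ENOENT,
+  * does the primary remove the device locally. Any other failure
+  * triggers the rollback that asks them to re-attach.
+  */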
+ + if (tmp_req.result != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on secondary\n"); + ret = tmp_req.result; + if (ret != -ENOENT) + goto rollback; + } + + ret = local_dev_remove(dev); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Failed to hotplug remove device on primary\n"); + if (ret != -ENOENT) + goto rollback; + } + } else { + RTE_LOG(ERR, EAL, "unsupported secondary to primary request\n"); + ret = -ENOTSUP; + } + goto finish; + +rollback: + if (req->t == EAL_DEV_REQ_TYPE_ATTACH) { + tmp_req.t = EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + local_dev_remove(dev); + } else { + tmp_req.t = EAL_DEV_REQ_TYPE_DETACH_ROLLBACK; + eal_dev_hotplug_request_to_secondary(&tmp_req); + } + +finish: + ret = send_response_to_secondary(&tmp_req, ret, bundle->peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send response to secondary\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_secondary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct mp_reply_bundle *bundle; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + int ret = 0; + + bundle = malloc(sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + return send_response_to_secondary(req, -ENOMEM, peer); + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = strdup(peer); + if (bundle->peer == NULL) { + free(bundle); + RTE_LOG(ERR, EAL, "not enough memory\n"); + return send_response_to_secondary(req, -ENOMEM, peer); + } + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. + */ + ret = rte_eal_alarm_set(1, __handle_secondary_request, bundle); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to add mp task\n"); + free(bundle->peer); + free(bundle); + return send_response_to_secondary(req, ret, peer); + } + return 0; +} + +static void __handle_primary_request(void *param) +{ + struct mp_reply_bundle *bundle = param; + struct rte_mp_msg *msg = &bundle->msg; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct rte_mp_msg mp_resp; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct rte_devargs *da; + struct rte_device *dev; + struct rte_bus *bus; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + + switch (req->t) { + case EAL_DEV_REQ_TYPE_ATTACH: + case EAL_DEV_REQ_TYPE_DETACH_ROLLBACK: + ret = local_dev_probe(req->devargs, &dev); + break; + case EAL_DEV_REQ_TYPE_DETACH: + case EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK: + da = calloc(1, sizeof(*da)); + if (da == NULL) { + ret = -ENOMEM; + break; + } + + ret = rte_devargs_parse(da, req->devargs); + if (ret != 0) + goto quit; + + bus = rte_bus_find_by_name(da->bus->name); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", da->bus->name); + ret = -ENOENT; + goto quit; + } + + dev = bus->find_device(NULL, cmp_dev_name, da->name); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find plugged device (%s)\n", da->name); + ret = -ENOENT; + goto quit; + } + + if (!rte_dev_is_probed(dev)) { + if (req->t == EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK) { + /** + * Don't fail the rollback just because there's + * nothing to do. 
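+ * (e.g. the earlier attach may have failed in this secondary,
+ * leaving nothing to detach here)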
+ */ + ret = 0; + } else + ret = -ENODEV; + + goto quit; + } + + ret = local_dev_remove(dev); +quit: + free(da->args); + free(da); + break; + default: + ret = -EINVAL; + } + + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + resp->result = ret; + if (rte_mp_reply(&mp_resp, bundle->peer) < 0) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + + free(bundle->peer); + free(bundle); +} + +static int +handle_primary_request(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg mp_resp; + const struct eal_dev_mp_req *req = + (const struct eal_dev_mp_req *)msg->param; + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_resp.param; + struct mp_reply_bundle *bundle; + int ret = 0; + + memset(&mp_resp, 0, sizeof(mp_resp)); + strlcpy(mp_resp.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_resp.name)); + mp_resp.len_param = sizeof(*req); + memcpy(resp, req, sizeof(*resp)); + + bundle = calloc(1, sizeof(*bundle)); + if (bundle == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + bundle->msg = *msg; + /** + * We need to send reply on interrupt thread, but peer can't be + * parsed directly, so this is a temporal hack, need to be fixed + * when it is ready. + */ + bundle->peer = (void *)strdup(peer); + if (bundle->peer == NULL) { + RTE_LOG(ERR, EAL, "not enough memory\n"); + free(bundle); + resp->result = -ENOMEM; + ret = rte_mp_reply(&mp_resp, peer); + if (ret) + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + + /** + * We are at IPC callback thread, sync IPC is not allowed due to + * dead lock, so we delegate the task to interrupt thread. 
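+ * rte_eal_alarm_set(1, ...) below runs the handler from the
+ * interrupt thread after a one-microsecond delay, which is what
+ * performs the hand-off.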
+ */ + ret = rte_eal_alarm_set(1, __handle_primary_request, bundle); + if (ret != 0) { + free(bundle->peer); + free(bundle); + resp->result = ret; + ret = rte_mp_reply(&mp_resp, peer); + if (ret != 0) { + RTE_LOG(ERR, EAL, "failed to send reply to primary request\n"); + return ret; + } + } + return 0; +} + +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + struct eal_dev_mp_req *resp; + int ret; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret || mp_reply.nb_received != 1) { + RTE_LOG(ERR, EAL, "Cannot send request to primary\n"); + if (!ret) + return -1; + return ret; + } + + resp = (struct eal_dev_mp_req *)mp_reply.msgs[0].param; + req->result = resp->result; + + free(mp_reply.msgs); + return ret; +} + +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req) +{ + struct rte_mp_msg mp_req; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = MP_TIMEOUT_S, .tv_nsec = 0}; + int ret; + int i; + + memset(&mp_req, 0, sizeof(mp_req)); + memcpy(mp_req.param, req, sizeof(*req)); + mp_req.len_param = sizeof(*req); + strlcpy(mp_req.name, EAL_DEV_MP_ACTION_REQUEST, sizeof(mp_req.name)); + + ret = rte_mp_request_sync(&mp_req, &mp_reply, &ts); + if (ret != 0) { + /* if IPC is not supported, behave as if the call succeeded */ + if (rte_errno != ENOTSUP) + RTE_LOG(ERR, EAL, "rte_mp_request_sync failed\n"); + else + ret = 0; + return ret; + } + + if (mp_reply.nb_sent != mp_reply.nb_received) { + RTE_LOG(ERR, EAL, "not all secondary reply\n"); + free(mp_reply.msgs); + return -1; + } + + req->result = 0; + for (i = 0; i < mp_reply.nb_received; i++) { + struct eal_dev_mp_req *resp = + (struct eal_dev_mp_req *)mp_reply.msgs[i].param; + if (resp->result != 0) { + if (req->t == EAL_DEV_REQ_TYPE_ATTACH && + resp->result == -EEXIST) + continue; + if (req->t == EAL_DEV_REQ_TYPE_DETACH && + resp->result == -ENOENT) + continue; + req->result = resp->result; + } + } + + free(mp_reply.msgs); + return 0; +} + +int eal_mp_dev_hotplug_init(void) +{ + int ret; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_secondary_request); + /* primary is allowed to not support IPC */ + if (ret != 0 && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } else { + ret = rte_mp_action_register(EAL_DEV_MP_ACTION_REQUEST, + handle_primary_request); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n", + EAL_DEV_MP_ACTION_REQUEST); + return ret; + } + } + + return 0; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.h b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.h new file mode 100644 index 000000000..8fcf9b52e --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/hotplug_mp.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef _HOTPLUG_MP_H_ +#define _HOTPLUG_MP_H_ + +#include "rte_dev.h" +#include "rte_bus.h" + +#define EAL_DEV_MP_ACTION_REQUEST "eal_dev_mp_request" +#define EAL_DEV_MP_ACTION_RESPONSE "eal_dev_mp_response" + +#define EAL_DEV_MP_DEV_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN +#define EAL_DEV_MP_BUS_NAME_MAX_LEN 
32 +#define EAL_DEV_MP_DEV_ARGS_MAX_LEN 128 + +enum eal_dev_req_type { + EAL_DEV_REQ_TYPE_ATTACH, + EAL_DEV_REQ_TYPE_DETACH, + EAL_DEV_REQ_TYPE_ATTACH_ROLLBACK, + EAL_DEV_REQ_TYPE_DETACH_ROLLBACK, +}; + +struct eal_dev_mp_req { + enum eal_dev_req_type t; + char devargs[EAL_DEV_MP_DEV_ARGS_MAX_LEN]; + int result; +}; + +/** + * Register all mp action callbacks for hotplug. + * + * @return + * 0 on success, negative on error. + */ +int +eal_mp_dev_hotplug_init(void); + +/** + * This is a synchronous wrapper for secondary process send + * request to primary process, this is invoked when an attach + * or detach request is issued from primary process. + */ +int eal_dev_hotplug_request_to_primary(struct eal_dev_mp_req *req); + +/** + * this is a synchronous wrapper for primary process send + * request to secondary process, this is invoked when an attach + * or detach request issued from secondary process. + */ +int eal_dev_hotplug_request_to_secondary(struct eal_dev_mp_req *req); + + +#endif /* _HOTPLUG_MP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c new file mode 100644 index 000000000..51cdfc5d5 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.c @@ -0,0 +1,682 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <inttypes.h> +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_debug.h> +#include <rte_common.h> +#include <rte_spinlock.h> + +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "malloc_elem.h" +#include "malloc_heap.h" + +/* + * If debugging is enabled, freed memory is set to poison value + * to catch buggy programs. Otherwise, freed memory is set to zero + * to avoid having to zero in zmalloc + */ +#ifdef RTE_MALLOC_DEBUG +#define MALLOC_POISON 0x6b +#else +#define MALLOC_POISON 0 +#endif + +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align) +{ + void *cur_page, *contig_seg_start, *page_end, *cur_seg_end; + void *data_start, *data_end; + rte_iova_t expected_iova; + struct rte_memseg *ms; + size_t page_sz, cur, max; + + page_sz = (size_t)elem->msl->page_sz; + data_start = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_end = RTE_PTR_ADD(elem, elem->size - MALLOC_ELEM_TRAILER_LEN); + /* segment must start after header and with specified alignment */ + contig_seg_start = RTE_PTR_ALIGN_CEIL(data_start, align); + + /* return if aligned address is already out of malloc element */ + if (contig_seg_start > data_end) + return 0; + + /* if we're in IOVA as VA mode, or if we're in legacy mode with + * hugepages, all elements are IOVA-contiguous. however, we can only + * make these assumptions about internal memory - externally allocated + * segments have to be checked. 
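+ * For such internal memory the fast path below simply returns
+ * data_end - contig_seg_start without inspecting individual pages.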
+ */ + if (!elem->msl->external && + (rte_eal_iova_mode() == RTE_IOVA_VA || + (internal_config.legacy_mem && + rte_eal_has_hugepages()))) + return RTE_PTR_DIFF(data_end, contig_seg_start); + + cur_page = RTE_PTR_ALIGN_FLOOR(contig_seg_start, page_sz); + ms = rte_mem_virt2memseg(cur_page, elem->msl); + + /* do first iteration outside the loop */ + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start) - + MALLOC_ELEM_TRAILER_LEN; + max = cur; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + + cur_page = RTE_PTR_ADD(cur_page, page_sz); + + while (cur_page < data_end) { + page_end = RTE_PTR_ADD(cur_page, page_sz); + cur_seg_end = RTE_MIN(page_end, data_end); + + /* reset start of contiguous segment if unexpected iova */ + if (ms->iova != expected_iova) { + /* next contiguous segment must start at specified + * alignment. + */ + contig_seg_start = RTE_PTR_ALIGN(cur_page, align); + /* new segment start may be on a different page, so find + * the page and skip to next iteration to make sure + * we're not blowing past data end. + */ + ms = rte_mem_virt2memseg(contig_seg_start, elem->msl); + cur_page = ms->addr; + /* don't trigger another recalculation */ + expected_iova = ms->iova; + continue; + } + /* cur_seg_end ends on a page boundary or on data end. if we're + * looking at data end, then malloc trailer is already included + * in the calculations. if we're looking at page end, then we + * know there's more data past this page and thus there's space + * for malloc element trailer, so don't count it here. + */ + cur = RTE_PTR_DIFF(cur_seg_end, contig_seg_start); + /* update max if cur value is bigger */ + if (cur > max) + max = cur; + + /* move to next page */ + cur_page = page_end; + expected_iova = ms->iova + page_sz; + /* memsegs are contiguous in memory */ + ms++; + } + + return max; +} + +/* + * Initialize a general malloc_elem header structure + */ +void +malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap, + struct rte_memseg_list *msl, size_t size, + struct malloc_elem *orig_elem, size_t orig_size) +{ + elem->heap = heap; + elem->msl = msl; + elem->prev = NULL; + elem->next = NULL; + memset(&elem->free_list, 0, sizeof(elem->free_list)); + elem->state = ELEM_FREE; + elem->size = size; + elem->pad = 0; + elem->orig_elem = orig_elem; + elem->orig_size = orig_size; + set_header(elem); + set_trailer(elem); +} + +void +malloc_elem_insert(struct malloc_elem *elem) +{ + struct malloc_elem *prev_elem, *next_elem; + struct malloc_heap *heap = elem->heap; + + /* first and last elements must be both NULL or both non-NULL */ + if ((heap->first == NULL) != (heap->last == NULL)) { + RTE_LOG(ERR, EAL, "Heap is probably corrupt\n"); + return; + } + + if (heap->first == NULL && heap->last == NULL) { + /* if empty heap */ + heap->first = elem; + heap->last = elem; + prev_elem = NULL; + next_elem = NULL; + } else if (elem < heap->first) { + /* if lower than start */ + prev_elem = NULL; + next_elem = heap->first; + heap->first = elem; + } else if (elem > heap->last) { + /* if higher than end */ + prev_elem = heap->last; + next_elem = NULL; + heap->last = elem; + } else { + /* the new memory is somewhere between start and end */ + uint64_t dist_from_start, dist_from_end; + + dist_from_end = RTE_PTR_DIFF(heap->last, elem); + dist_from_start = RTE_PTR_DIFF(elem, heap->first); + + /* check which is closer, and find closest list entries */ + if (dist_from_start < 
dist_from_end) { + prev_elem = heap->first; + while (prev_elem->next < elem) + prev_elem = prev_elem->next; + next_elem = prev_elem->next; + } else { + next_elem = heap->last; + while (next_elem->prev > elem) + next_elem = next_elem->prev; + prev_elem = next_elem->prev; + } + } + + /* insert new element */ + elem->prev = prev_elem; + elem->next = next_elem; + if (prev_elem) + prev_elem->next = elem; + if (next_elem) + next_elem->prev = elem; +} + +/* + * Attempt to find enough physically contiguous memory in this block to store + * our data. Assume that element has at least enough space to fit in the data, + * so we just check the page addresses. + */ +static bool +elem_check_phys_contig(const struct rte_memseg_list *msl, + void *start, size_t size) +{ + return eal_memalloc_is_contig(msl, start, size); +} + +/* + * calculate the starting point of where data of the requested size + * and alignment would fit in the current element. If the data doesn't + * fit, return NULL. + */ +static void * +elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + size_t elem_size = elem->size; + + /* + * we're allocating from the end, so adjust the size of element by + * alignment size. + */ + while (elem_size >= size) { + const size_t bmask = ~(bound - 1); + uintptr_t end_pt = (uintptr_t)elem + + elem_size - MALLOC_ELEM_TRAILER_LEN; + uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + uintptr_t new_elem_start; + + /* check boundary */ + if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) { + end_pt = RTE_ALIGN_FLOOR(end_pt, bound); + new_data_start = RTE_ALIGN_FLOOR((end_pt - size), + align); + end_pt = new_data_start + size; + + if (((end_pt - 1) & bmask) != (new_data_start & bmask)) + return NULL; + } + + new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN; + + /* if the new start point is before the exist start, + * it won't fit + */ + if (new_elem_start < (uintptr_t)elem) + return NULL; + + if (contig) { + size_t new_data_size = end_pt - new_data_start; + + /* + * if physical contiguousness was requested and we + * couldn't fit all data into one physically contiguous + * block, try again with lower addresses. + */ + if (!elem_check_phys_contig(elem->msl, + (void *)new_data_start, + new_data_size)) { + elem_size -= align; + continue; + } + } + return (void *)new_elem_start; + } + return NULL; +} + +/* + * use elem_start_pt to determine if we get meet the size and + * alignment request from the current element + */ +int +malloc_elem_can_hold(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + return elem_start_pt(elem, size, align, bound, contig) != NULL; +} + +/* + * split an existing element into two smaller elements at the given + * split_pt parameter. + */ +static void +split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt) +{ + struct malloc_elem *next_elem = elem->next; + const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem; + const size_t new_elem_size = elem->size - old_elem_size; + + malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size, + elem->orig_elem, elem->orig_size); + split_pt->prev = elem; + split_pt->next = next_elem; + if (next_elem) + next_elem->prev = split_pt; + else + elem->heap->last = split_pt; + elem->next = split_pt; + elem->size = old_elem_size; + set_trailer(elem); + if (elem->pad) { + /* Update inner padding inner element size. 
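+ * (the element is preceded by a pad header, so step past the pad
+ * and keep the inner element's size consistent with the new outer
+ * size)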
*/ + elem = RTE_PTR_ADD(elem, elem->pad); + elem->size = old_elem_size - elem->pad; + } +} + +/* + * our malloc heap is a doubly linked list, so doubly remove our element. + */ +static void __rte_unused +remove_elem(struct malloc_elem *elem) +{ + struct malloc_elem *next, *prev; + next = elem->next; + prev = elem->prev; + + if (next) + next->prev = prev; + else + elem->heap->last = prev; + if (prev) + prev->next = next; + else + elem->heap->first = next; + + elem->prev = NULL; + elem->next = NULL; +} + +static int +next_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem->next == RTE_PTR_ADD(elem, elem->size) && + elem->next->msl == elem->msl && + (!internal_config.match_allocations || + elem->orig_elem == elem->next->orig_elem); +} + +static int +prev_elem_is_adjacent(struct malloc_elem *elem) +{ + return elem == RTE_PTR_ADD(elem->prev, elem->prev->size) && + elem->prev->msl == elem->msl && + (!internal_config.match_allocations || + elem->orig_elem == elem->prev->orig_elem); +} + +/* + * Given an element size, compute its freelist index. + * We free an element into the freelist containing similarly-sized elements. + * We try to allocate elements starting with the freelist containing + * similarly-sized elements, and if necessary, we search freelists + * containing larger elements. + * + * Example element size ranges for a heap with five free lists: + * heap->free_head[0] - (0 , 2^8] + * heap->free_head[1] - (2^8 , 2^10] + * heap->free_head[2] - (2^10 ,2^12] + * heap->free_head[3] - (2^12, 2^14] + * heap->free_head[4] - (2^14, MAX_SIZE] + */ +size_t +malloc_elem_free_list_index(size_t size) +{ +#define MALLOC_MINSIZE_LOG2 8 +#define MALLOC_LOG2_INCREMENT 2 + + size_t log2; + size_t index; + + if (size <= (1UL << MALLOC_MINSIZE_LOG2)) + return 0; + + /* Find next power of 2 >= size. */ + log2 = sizeof(size) * 8 - __builtin_clzl(size-1); + + /* Compute freelist index, based on log2(size). */ + index = (log2 - MALLOC_MINSIZE_LOG2 + MALLOC_LOG2_INCREMENT - 1) / + MALLOC_LOG2_INCREMENT; + + return index <= RTE_HEAP_NUM_FREELISTS-1? + index: RTE_HEAP_NUM_FREELISTS-1; +} + +/* + * Add the specified element to its heap's free list. + */ +void +malloc_elem_free_list_insert(struct malloc_elem *elem) +{ + size_t idx; + + idx = malloc_elem_free_list_index(elem->size - MALLOC_ELEM_HEADER_LEN); + elem->state = ELEM_FREE; + LIST_INSERT_HEAD(&elem->heap->free_head[idx], elem, free_list); +} + +/* + * Remove the specified element from its heap's free list. + */ +void +malloc_elem_free_list_remove(struct malloc_elem *elem) +{ + LIST_REMOVE(elem, free_list); +} + +/* + * reserve a block of data in an existing malloc_elem. If the malloc_elem + * is much larger than the data block requested, we split the element in two. + * This function is only called from malloc_heap_alloc so parameter checking + * is not done here, as it's done there previously. 
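+ *
+ * Illustrative example (sizes are made up): carving 64 B out of a
+ * 1 MiB free element allocates from the element's end, so the large
+ * remainder in front is split off and stays free; had that remainder
+ * been smaller than MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE, it would
+ * instead have been kept as padding of the allocated element.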
+ */ +struct malloc_elem * +malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align, + size_t bound, bool contig) +{ + struct malloc_elem *new_elem = elem_start_pt(elem, size, align, bound, + contig); + const size_t old_elem_size = (uintptr_t)new_elem - (uintptr_t)elem; + const size_t trailer_size = elem->size - old_elem_size - size - + MALLOC_ELEM_OVERHEAD; + + malloc_elem_free_list_remove(elem); + + if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split it, too much free space after elem */ + struct malloc_elem *new_free_elem = + RTE_PTR_ADD(new_elem, size + MALLOC_ELEM_OVERHEAD); + + split_elem(elem, new_free_elem); + malloc_elem_free_list_insert(new_free_elem); + + if (elem == elem->heap->last) + elem->heap->last = new_free_elem; + } + + if (old_elem_size < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* don't split it, pad the element instead */ + elem->state = ELEM_BUSY; + elem->pad = old_elem_size; + + /* put a dummy header in padding, to point to real element header */ + if (elem->pad > 0) { /* pad will be at least 64-bytes, as everything + * is cache-line aligned */ + new_elem->pad = elem->pad; + new_elem->state = ELEM_PAD; + new_elem->size = elem->size - elem->pad; + set_header(new_elem); + } + + return new_elem; + } + + /* we are going to split the element in two. The original element + * remains free, and the new element is the one allocated. + * Re-insert original element, in case its new size makes it + * belong on a different list. + */ + split_elem(elem, new_elem); + new_elem->state = ELEM_BUSY; + malloc_elem_free_list_insert(elem); + + return new_elem; +} + +/* + * join two struct malloc_elem together. elem1 and elem2 must + * be contiguous in memory. + */ +static inline void +join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2) +{ + struct malloc_elem *next = elem2->next; + elem1->size += elem2->size; + if (next) + next->prev = elem1; + else + elem1->heap->last = elem1; + elem1->next = next; + if (elem1->pad) { + struct malloc_elem *inner = RTE_PTR_ADD(elem1, elem1->pad); + inner->size = elem1->size - elem1->pad; + } +} + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem) +{ + /* + * check if next element exists, is adjacent and is free, if so join + * with it, need to remove from free list. + */ + if (elem->next != NULL && elem->next->state == ELEM_FREE && + next_elem_is_adjacent(elem)) { + void *erase; + size_t erase_len; + + /* we will want to erase the trailer and header */ + erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->next->pad; + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + /* erase header, trailer and pad */ + memset(erase, MALLOC_POISON, erase_len); + } + + /* + * check if prev element exists, is adjacent and is free, if so join + * with it, need to remove from free list. 
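+ * This mirrors the next-element case above, except that here the
+ * previous element's header survives and becomes the element that
+ * is returned.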
+ */ + if (elem->prev != NULL && elem->prev->state == ELEM_FREE && + prev_elem_is_adjacent(elem)) { + struct malloc_elem *new_elem; + void *erase; + size_t erase_len; + + /* we will want to erase trailer and header */ + erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN); + erase_len = MALLOC_ELEM_OVERHEAD + elem->pad; + + /* remove from free list, join to this one */ + malloc_elem_free_list_remove(elem->prev); + + new_elem = elem->prev; + join_elem(new_elem, elem); + + /* erase header, trailer and pad */ + memset(erase, MALLOC_POISON, erase_len); + + elem = new_elem; + } + + return elem; +} + +/* + * free a malloc_elem block by adding it to the free list. If the + * blocks either immediately before or immediately after newly freed block + * are also free, the blocks are merged together. + */ +struct malloc_elem * +malloc_elem_free(struct malloc_elem *elem) +{ + void *ptr; + size_t data_len; + + ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN); + data_len = elem->size - MALLOC_ELEM_OVERHEAD; + + elem = malloc_elem_join_adjacent_free(elem); + + malloc_elem_free_list_insert(elem); + + elem->pad = 0; + + /* decrease heap's count of allocated elements */ + elem->heap->alloc_count--; + + /* poison memory */ + memset(ptr, MALLOC_POISON, data_len); + + return elem; +} + +/* assume all checks were already done */ +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len) +{ + struct malloc_elem *hide_start, *hide_end, *prev, *next; + size_t len_before, len_after; + + hide_start = start; + hide_end = RTE_PTR_ADD(start, len); + + prev = elem->prev; + next = elem->next; + + /* we cannot do anything with non-adjacent elements */ + if (next && next_elem_is_adjacent(elem)) { + len_after = RTE_PTR_DIFF(next, hide_end); + if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split after */ + split_elem(elem, hide_end); + + malloc_elem_free_list_insert(hide_end); + } else if (len_after > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + /* we cannot do anything with non-adjacent elements */ + if (prev && prev_elem_is_adjacent(elem)) { + len_before = RTE_PTR_DIFF(hide_start, elem); + if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* split before */ + split_elem(elem, hide_start); + + prev = elem; + elem = hide_start; + + malloc_elem_free_list_insert(prev); + } else if (len_before > 0) { + RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n"); + return; + } + } + + remove_elem(elem); +} + +/* + * attempt to resize a malloc_elem by expanding into any free space + * immediately after it in memory. + */ +int +malloc_elem_resize(struct malloc_elem *elem, size_t size) +{ + const size_t new_size = size + elem->pad + MALLOC_ELEM_OVERHEAD; + + /* if we request a smaller size, then always return ok */ + if (elem->size >= new_size) + return 0; + + /* check if there is a next element, it's free and adjacent */ + if (!elem->next || elem->next->state != ELEM_FREE || + !next_elem_is_adjacent(elem)) + return -1; + if (elem->size + elem->next->size < new_size) + return -1; + + /* we now know the element fits, so remove from free list, + * join the two + */ + malloc_elem_free_list_remove(elem->next); + join_elem(elem, elem->next); + + if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) { + /* now we have a big block together. 
Lets cut it down a bit, by splitting */ + struct malloc_elem *split_pt = RTE_PTR_ADD(elem, new_size); + split_pt = RTE_PTR_ALIGN_CEIL(split_pt, RTE_CACHE_LINE_SIZE); + split_elem(elem, split_pt); + malloc_elem_free_list_insert(split_pt); + } + return 0; +} + +static inline const char * +elem_state_to_str(enum elem_state state) +{ + switch (state) { + case ELEM_PAD: + return "PAD"; + case ELEM_BUSY: + return "BUSY"; + case ELEM_FREE: + return "FREE"; + } + return "ERROR"; +} + +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f) +{ + fprintf(f, "Malloc element at %p (%s)\n", elem, + elem_state_to_str(elem->state)); + fprintf(f, " len: 0x%zx pad: 0x%" PRIx32 "\n", elem->size, elem->pad); + fprintf(f, " prev: %p next: %p\n", elem->prev, elem->next); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h new file mode 100644 index 000000000..a1e5f7f02 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_elem.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef MALLOC_ELEM_H_ +#define MALLOC_ELEM_H_ + +#include <stdbool.h> + +#define MIN_DATA_SIZE (RTE_CACHE_LINE_SIZE) + +/* dummy definition of struct so we can use pointers to it in malloc_elem struct */ +struct malloc_heap; + +enum elem_state { + ELEM_FREE = 0, + ELEM_BUSY, + ELEM_PAD /* element is a padding-only header */ +}; + +struct malloc_elem { + struct malloc_heap *heap; + struct malloc_elem *volatile prev; + /**< points to prev elem in memseg */ + struct malloc_elem *volatile next; + /**< points to next elem in memseg */ + LIST_ENTRY(malloc_elem) free_list; + /**< list of free elements in heap */ + struct rte_memseg_list *msl; + volatile enum elem_state state; + uint32_t pad; + size_t size; + struct malloc_elem *orig_elem; + size_t orig_size; +#ifdef RTE_MALLOC_DEBUG + uint64_t header_cookie; /* Cookie marking start of data */ + /* trailer cookie at start + size */ +#endif +} __rte_cache_aligned; + +#ifndef RTE_MALLOC_DEBUG +static const unsigned MALLOC_ELEM_TRAILER_LEN = 0; + +/* dummy function - just check if pointer is non-null */ +static inline int +malloc_elem_cookies_ok(const struct malloc_elem *elem){ return elem != NULL; } + +/* dummy function - no header if malloc_debug is not enabled */ +static inline void +set_header(struct malloc_elem *elem __rte_unused){ } + +/* dummy function - no trailer if malloc_debug is not enabled */ +static inline void +set_trailer(struct malloc_elem *elem __rte_unused){ } + + +#else +static const unsigned MALLOC_ELEM_TRAILER_LEN = RTE_CACHE_LINE_SIZE; + +#define MALLOC_HEADER_COOKIE 0xbadbadbadadd2e55ULL /**< Header cookie. 
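+ * Written into the element by set_header() when RTE_MALLOC_DEBUG
+ * is enabled.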
*/ +#define MALLOC_TRAILER_COOKIE 0xadd2e55badbadbadULL /**< Trailer cookie.*/ + +/* define macros to make referencing the header and trailer cookies easier */ +#define MALLOC_ELEM_TRAILER(elem) (*((uint64_t*)RTE_PTR_ADD(elem, \ + elem->size - MALLOC_ELEM_TRAILER_LEN))) +#define MALLOC_ELEM_HEADER(elem) (elem->header_cookie) + +static inline void +set_header(struct malloc_elem *elem) +{ + if (elem != NULL) + MALLOC_ELEM_HEADER(elem) = MALLOC_HEADER_COOKIE; +} + +static inline void +set_trailer(struct malloc_elem *elem) +{ + if (elem != NULL) + MALLOC_ELEM_TRAILER(elem) = MALLOC_TRAILER_COOKIE; +} + +/* check that the header and trailer cookies are set correctly */ +static inline int +malloc_elem_cookies_ok(const struct malloc_elem *elem) +{ + return elem != NULL && + MALLOC_ELEM_HEADER(elem) == MALLOC_HEADER_COOKIE && + MALLOC_ELEM_TRAILER(elem) == MALLOC_TRAILER_COOKIE; +} + +#endif + +static const unsigned MALLOC_ELEM_HEADER_LEN = sizeof(struct malloc_elem); +#define MALLOC_ELEM_OVERHEAD (MALLOC_ELEM_HEADER_LEN + MALLOC_ELEM_TRAILER_LEN) + +/* + * Given a pointer to the start of a memory block returned by malloc, get + * the actual malloc_elem header for that block. + */ +static inline struct malloc_elem * +malloc_elem_from_data(const void *data) +{ + if (data == NULL) + return NULL; + + struct malloc_elem *elem = RTE_PTR_SUB(data, MALLOC_ELEM_HEADER_LEN); + if (!malloc_elem_cookies_ok(elem)) + return NULL; + return elem->state != ELEM_PAD ? elem: RTE_PTR_SUB(elem, elem->pad); +} + +/* + * initialise a malloc_elem header + */ +void +malloc_elem_init(struct malloc_elem *elem, + struct malloc_heap *heap, + struct rte_memseg_list *msl, + size_t size, + struct malloc_elem *orig_elem, + size_t orig_size); + +void +malloc_elem_insert(struct malloc_elem *elem); + +/* + * return true if the current malloc_elem can hold a block of data + * of the requested size and with the requested alignment + */ +int +malloc_elem_can_hold(struct malloc_elem *elem, size_t size, + unsigned int align, size_t bound, bool contig); + +/* + * reserve a block of data in an existing malloc_elem. If the malloc_elem + * is much larger than the data block requested, we split the element in two. + */ +struct malloc_elem * +malloc_elem_alloc(struct malloc_elem *elem, size_t size, + unsigned int align, size_t bound, bool contig); + +/* + * free a malloc_elem block by adding it to the free list. If the + * blocks either immediately before or immediately after newly freed block + * are also free, the blocks are merged together. + */ +struct malloc_elem * +malloc_elem_free(struct malloc_elem *elem); + +struct malloc_elem * +malloc_elem_join_adjacent_free(struct malloc_elem *elem); + +/* + * attempt to resize a malloc_elem by expanding into any free space + * immediately after it in memory. + */ +int +malloc_elem_resize(struct malloc_elem *elem, size_t size); + +void +malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len); + +void +malloc_elem_free_list_remove(struct malloc_elem *elem); + +/* + * dump contents of malloc elem to a file. + */ +void +malloc_elem_dump(const struct malloc_elem *elem, FILE *f); + +/* + * Given an element size, compute its freelist index. + */ +size_t +malloc_elem_free_list_index(size_t size); + +/* + * Add element to its heap's free list. + */ +void +malloc_elem_free_list_insert(struct malloc_elem *elem); + +/* + * Find biggest IOVA-contiguous zone within an element with specified alignment. 
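+ * Returns the zone length in bytes, or 0 when the aligned start
+ * already lies past the element's usable data.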
+ */ +size_t +malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align); + +#endif /* MALLOC_ELEM_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c new file mode 100644 index 000000000..bd5065698 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.c @@ -0,0 +1,1367 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <errno.h> +#include <sys/queue.h> + +#include <rte_memory.h> +#include <rte_errno.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_string_fns.h> +#include <rte_spinlock.h> +#include <rte_memcpy.h> +#include <rte_memzone.h> +#include <rte_atomic.h> +#include <rte_fbarray.h> + +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_private.h" +#include "malloc_elem.h" +#include "malloc_heap.h" +#include "malloc_mp.h" + +/* start external socket ID's at a very high number */ +#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */ +#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES)) + +static unsigned +check_hugepage_sz(unsigned flags, uint64_t hugepage_sz) +{ + unsigned check_flag = 0; + + if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY)) + return 1; + + switch (hugepage_sz) { + case RTE_PGSIZE_256K: + check_flag = RTE_MEMZONE_256KB; + break; + case RTE_PGSIZE_2M: + check_flag = RTE_MEMZONE_2MB; + break; + case RTE_PGSIZE_16M: + check_flag = RTE_MEMZONE_16MB; + break; + case RTE_PGSIZE_256M: + check_flag = RTE_MEMZONE_256MB; + break; + case RTE_PGSIZE_512M: + check_flag = RTE_MEMZONE_512MB; + break; + case RTE_PGSIZE_1G: + check_flag = RTE_MEMZONE_1GB; + break; + case RTE_PGSIZE_4G: + check_flag = RTE_MEMZONE_4GB; + break; + case RTE_PGSIZE_16G: + check_flag = RTE_MEMZONE_16GB; + } + + return check_flag & flags; +} + +int +malloc_socket_to_heap_id(unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (heap->socket_id == socket_id) + return i; + } + return -1; +} + +/* + * Expand the heap with a memory area. 
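+ * The area becomes one free element that is inserted into the heap's
+ * element list, merged with adjacent free neighbours and then placed
+ * on the appropriate free list.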
+ */ +static struct malloc_elem * +malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl, + void *start, size_t len) +{ + struct malloc_elem *elem = start; + + malloc_elem_init(elem, heap, msl, len, elem, len); + + malloc_elem_insert(elem); + + elem = malloc_elem_join_adjacent_free(elem); + + malloc_elem_free_list_insert(elem); + + return elem; +} + +static int +malloc_add_seg(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct malloc_heap *heap; + int msl_idx, heap_idx; + + if (msl->external) + return 0; + + heap_idx = malloc_socket_to_heap_id(msl->socket_id); + if (heap_idx < 0) { + RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n"); + return -1; + } + heap = &mcfg->malloc_heaps[heap_idx]; + + /* msl is const, so find it */ + msl_idx = msl - mcfg->memsegs; + + if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) + return -1; + + found_msl = &mcfg->memsegs[msl_idx]; + + malloc_heap_add_memory(heap, found_msl, ms->addr, len); + + heap->total_size += len; + + RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20, + msl->socket_id); + return 0; +} + +/* + * Iterates through the freelist for a heap to find a free element + * which can store data of the required size and with the requested alignment. + * If size is 0, find the biggest available elem. + * Returns null on failure, or pointer to element on success. + */ +static struct malloc_elem * +find_suitable_element(struct malloc_heap *heap, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + size_t idx; + struct malloc_elem *elem, *alt_elem = NULL; + + for (idx = malloc_elem_free_list_index(size); + idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + if (malloc_elem_can_hold(elem, size, align, bound, + contig)) { + if (check_hugepage_sz(flags, + elem->msl->page_sz)) + return elem; + if (alt_elem == NULL) + alt_elem = elem; + } + } + } + + if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY)) + return alt_elem; + + return NULL; +} + +/* + * Iterates through the freelist for a heap to find a free element with the + * biggest size and requested alignment. Will also set size to whatever element + * size that was found. + * Returns null on failure, or pointer to element on success. 
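+ *
+ * Sketch of the intended use (hypothetical arguments):
+ *
+ *   size_t sz;
+ *   struct malloc_elem *e = find_biggest_element(heap, &sz, 0,
+ *           RTE_CACHE_LINE_SIZE, false);
+ *   // e is the largest free element, usable for sz bytes,
+ *   // or NULL when every free list is empty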
+ */ +static struct malloc_elem * +find_biggest_element(struct malloc_heap *heap, size_t *size, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem, *max_elem = NULL; + size_t idx, max_size = 0; + + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) { + size_t cur_size; + if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 && + !check_hugepage_sz(flags, + elem->msl->page_sz)) + continue; + if (contig) { + cur_size = + malloc_elem_find_max_iova_contig(elem, + align); + } else { + void *data_start = RTE_PTR_ADD(elem, + MALLOC_ELEM_HEADER_LEN); + void *data_end = RTE_PTR_ADD(elem, elem->size - + MALLOC_ELEM_TRAILER_LEN); + void *aligned = RTE_PTR_ALIGN_CEIL(data_start, + align); + /* check if aligned data start is beyond end */ + if (aligned >= data_end) + continue; + cur_size = RTE_PTR_DIFF(data_end, aligned); + } + if (cur_size > max_size) { + max_size = cur_size; + max_elem = elem; + } + } + } + + *size = max_size; + return max_elem; +} + +/* + * Main function to allocate a block of memory from the heap. + * It locks the free list, scans it, and adds a new memseg if the + * scan fails. Once the new memseg is added, it re-scans and should return + * the new element after releasing the lock. + */ +static void * +heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct malloc_elem *elem; + + size = RTE_CACHE_LINE_ROUNDUP(size); + align = RTE_CACHE_LINE_ROUNDUP(align); + + /* roundup might cause an overflow */ + if (size == 0) + return NULL; + elem = find_suitable_element(heap, size, flags, align, bound, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, bound, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? NULL : (void *)(&elem[1]); +} + +static void * +heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused, + unsigned int flags, size_t align, bool contig) +{ + struct malloc_elem *elem; + size_t size; + + align = RTE_CACHE_LINE_ROUNDUP(align); + + elem = find_biggest_element(heap, &size, flags, align, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, 0, contig); + + /* increase heap's count of allocated elements */ + heap->alloc_count++; + } + + return elem == NULL ? 
NULL : (void *)(&elem[1]);
+}
+
+/* this function is exposed in malloc_mp.h */
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_segs,
+		struct malloc_elem *elem, void *map_addr, size_t map_len)
+{
+	if (elem != NULL) {
+		malloc_elem_free_list_remove(elem);
+		malloc_elem_hide_region(elem, map_addr, map_len);
+	}
+
+	eal_memalloc_free_seg_bulk(ms, n_segs);
+}
+
+/* this function is exposed in malloc_mp.h */
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+		int socket, unsigned int flags, size_t align, size_t bound,
+		bool contig, struct rte_memseg **ms, int n_segs)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *msl;
+	struct malloc_elem *elem = NULL;
+	size_t alloc_sz;
+	int allocd_pages;
+	void *ret, *map_addr;
+
+	alloc_sz = (size_t)pg_sz * n_segs;
+
+	/* first, check if we're allowed to allocate this memory */
+	if (eal_memalloc_mem_alloc_validate(socket,
+			heap->total_size + alloc_sz) < 0) {
+		RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n");
+		return NULL;
+	}
+
+	allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
+			socket, true);
+
+	/* make sure we've allocated our pages... */
+	if (allocd_pages < 0)
+		return NULL;
+
+	map_addr = ms[0]->addr;
+	msl = rte_mem_virt2memseg_list(map_addr);
+
+	/* check if we wanted contiguous memory but didn't get it */
+	if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
+		RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
+				__func__);
+		goto fail;
+	}
+
+	/*
+	 * Once we have all the memseg lists configured, if there is a DMA mask
+	 * set, check that the IOVA addresses are not out of range. Otherwise
+	 * a device using that DMA mask could have problems with the mapped
+	 * memory.
+	 *
+	 * There are two situations when this can happen:
+	 * 1) memory initialization
+	 * 2) dynamic memory allocation
+	 *
+	 * For 1), an error when checking the DMA mask means the app cannot be
+	 * executed. For 2), it means the new memory cannot be added.
+	 */
+	if (mcfg->dma_maskbits &&
+	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+		/*
+		 * Currently this can only happen if IOMMU is enabled
+		 * and the address width supported by the IOMMU hw is
+		 * not enough for using the memory mapped IOVAs.
+		 *
+		 * If IOVA is VA, advise trying '--iova-mode pa', which
+		 * could resolve some situations where IOVA VA is not
+		 * really needed.
+		 */
+		RTE_LOG(ERR, EAL,
+			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask\n",
+			__func__);
+
+		/*
+		 * If IOVA is VA and it is possible to run with IOVA PA
+		 * because the user is root, give advice for solving the
+		 * problem.
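+		 *
+		 * An illustrative invocation (the application binary name
+		 * here is hypothetical):
+		 *
+		 *   ./dpdk-app -l 0-1 --iova-mode pa ...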
+ */ + if ((rte_eal_iova_mode() == RTE_IOVA_VA) && + rte_eal_using_phys_addrs()) + RTE_LOG(ERR, EAL, + "%s(): Please try initializing EAL with --iova-mode=pa parameter\n", + __func__); + goto fail; + } + + /* add newly minted memsegs to malloc heap */ + elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz); + + /* try once more, as now we have allocated new memory */ + ret = find_suitable_element(heap, elt_size, flags, align, bound, + contig); + + if (ret == NULL) + goto fail; + + return elem; + +fail: + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + return NULL; +} + +static int +try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_elem *elem; + struct rte_memseg **ms; + void *map_addr; + size_t alloc_sz; + int n_segs; + bool callback_triggered = false; + + alloc_sz = RTE_ALIGN_CEIL(align + elt_size + + MALLOC_ELEM_TRAILER_LEN, pg_sz); + n_segs = alloc_sz / pg_sz; + + /* we can't know in advance how many pages we'll need, so we malloc */ + ms = malloc(sizeof(*ms) * n_segs); + if (ms == NULL) + return -1; + memset(ms, 0, sizeof(*ms) * n_segs); + + elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align, + bound, contig, ms, n_segs); + + if (elem == NULL) + goto free_ms; + + map_addr = ms[0]->addr; + + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz); + + /* notify other processes that this has happened */ + if (request_sync()) { + /* we couldn't ensure all processes have mapped memory, + * so free it back and notify everyone that it's been + * freed back. + * + * technically, we could've avoided adding memory addresses to + * the map, but that would've led to inconsistent behavior + * between primary and secondary processes, as those get + * callbacks during sync. therefore, force primary process to + * do alloc-and-rollback syncs as well. 
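+		 *
+		 * resulting flow (sketch): map the pages, notify subscribers
+		 * with RTE_MEM_EVENT_ALLOC, then request_sync(); on failure,
+		 * notify RTE_MEM_EVENT_FREE, roll the pages back and sync
+		 * once more.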
+ */ + callback_triggered = true; + goto free_elem; + } + heap->total_size += alloc_sz; + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n", + socket, alloc_sz >> 20ULL); + + free(ms); + + return 0; + +free_elem: + if (callback_triggered) + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + map_addr, alloc_sz); + + rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz); + + request_sync(); +free_ms: + free(ms); + + return -1; +} + +static int +try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz, + size_t elt_size, int socket, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct malloc_mp_req req; + int req_result; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_ALLOC; + req.alloc_req.align = align; + req.alloc_req.bound = bound; + req.alloc_req.contig = contig; + req.alloc_req.flags = flags; + req.alloc_req.elt_size = elt_size; + req.alloc_req.page_sz = pg_sz; + req.alloc_req.socket = socket; + req.alloc_req.heap = heap; /* it's in shared memory */ + + req_result = request_to_primary(&req); + + if (req_result != 0) + return -1; + + if (req.result != REQ_RESULT_SUCCESS) + return -1; + + return 0; +} + +static int +try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size, + int socket, unsigned int flags, size_t align, size_t bound, + bool contig) +{ + int ret; + + rte_mcfg_mem_write_lock(); + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } else { + ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket, + flags, align, bound, contig); + } + + rte_mcfg_mem_write_unlock(); + return ret; +} + +static int +compare_pagesz(const void *a, const void *b) +{ + const struct rte_memseg_list * const*mpa = a; + const struct rte_memseg_list * const*mpb = b; + const struct rte_memseg_list *msla = *mpa; + const struct rte_memseg_list *mslb = *mpb; + uint64_t pg_sz_a = msla->page_sz; + uint64_t pg_sz_b = mslb->page_sz; + + if (pg_sz_a < pg_sz_b) + return -1; + if (pg_sz_a > pg_sz_b) + return 1; + return 0; +} + +static int +alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS]; + struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS]; + uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS]; + uint64_t prev_pg_sz; + int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz; + bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + void *ret; + + memset(requested_msls, 0, sizeof(requested_msls)); + memset(other_msls, 0, sizeof(other_msls)); + memset(requested_pg_sz, 0, sizeof(requested_pg_sz)); + memset(other_pg_sz, 0, sizeof(other_pg_sz)); + + /* + * go through memseg list and take note of all the page sizes available, + * and if any of them were specifically requested by the user. 
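+	 *
+	 * e.g. with 2M and 1G memseg lists present and RTE_MEMZONE_2MB
+	 * requested, the 2M lists land in requested_msls[] while the 1G
+	 * lists go to other_msls[], which are only tried when
+	 * RTE_MEMZONE_SIZE_HINT_ONLY is also set.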
+ */ + n_requested_msls = 0; + n_other_msls = 0; + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->socket_id != socket) + continue; + + if (msl->base_va == NULL) + continue; + + /* if pages of specific size were requested */ + if (size_flags != 0 && check_hugepage_sz(size_flags, + msl->page_sz)) + requested_msls[n_requested_msls++] = msl; + else if (size_flags == 0 || size_hint) + other_msls[n_other_msls++] = msl; + } + + /* sort the lists, smallest first */ + qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]), + compare_pagesz); + qsort(other_msls, n_other_msls, sizeof(other_msls[0]), + compare_pagesz); + + /* now, extract page sizes we are supposed to try */ + prev_pg_sz = 0; + n_requested_pg_sz = 0; + for (i = 0; i < n_requested_msls; i++) { + uint64_t pg_sz = requested_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + requested_pg_sz[n_requested_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + prev_pg_sz = 0; + n_other_pg_sz = 0; + for (i = 0; i < n_other_msls; i++) { + uint64_t pg_sz = other_msls[i]->page_sz; + + if (prev_pg_sz != pg_sz) { + other_pg_sz[n_other_pg_sz++] = pg_sz; + prev_pg_sz = pg_sz; + } + } + + /* finally, try allocating memory of specified page sizes, starting from + * the smallest sizes + */ + for (i = 0; i < n_requested_pg_sz; i++) { + uint64_t pg_sz = requested_pg_sz[i]; + + /* + * do not pass the size hint here, as user expects other page + * sizes first, before resorting to best effort allocation. + */ + if (!try_expand_heap(heap, pg_sz, size, socket, size_flags, + align, bound, contig)) + return 0; + } + if (n_other_pg_sz == 0) + return -1; + + /* now, check if we can reserve anything with size hint */ + ret = find_suitable_element(heap, size, flags, align, bound, contig); + if (ret != NULL) + return 0; + + /* + * we still couldn't reserve memory, so try expanding heap with other + * page sizes, if there are any + */ + for (i = 0; i < n_other_pg_sz; i++) { + uint64_t pg_sz = other_pg_sz[i]; + + if (!try_expand_heap(heap, pg_sz, size, socket, flags, + align, bound, contig)) + return 0; + } + return -1; +} + +/* this will try lower page sizes first */ +static void * +malloc_heap_alloc_on_heap_id(const char *type, size_t size, + unsigned int heap_id, unsigned int flags, size_t align, + size_t bound, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY; + int socket_id; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 1 : align; + + /* for legacy mode, try once and with all flags */ + if (internal_config.legacy_mem) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + goto alloc_unlock; + } + + /* + * we do not pass the size hint here, because even if allocation fails, + * we may still be able to allocate memory from appropriate page sizes, + * we just need to request more memory first. + */ + + socket_id = rte_socket_id_by_idx(heap_id); + /* + * if socket ID is negative, we cannot find a socket ID for this heap - + * which means it's an external heap. those can have unexpected page + * sizes, so if the user asked to allocate from there - assume user + * knows what they're doing, and allow allocating from there with any + * page size flags. 
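+	 *
+	 * (concretely, RTE_MEMZONE_SIZE_HINT_ONLY is OR'd into size_flags
+	 * just below, demoting any specific page-size request to a hint)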
+ */ + if (socket_id < 0) + size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY; + + ret = heap_alloc(heap, type, size, size_flags, align, bound, contig); + if (ret != NULL) + goto alloc_unlock; + + /* if socket ID is invalid, this is an external heap */ + if (socket_id < 0) + goto alloc_unlock; + + if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align, + bound, contig)) { + ret = heap_alloc(heap, type, size, flags, align, bound, contig); + + /* this should have succeeded */ + if (ret == NULL) + RTE_LOG(ERR, EAL, "Error allocating from heap\n"); + } +alloc_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket_arg, + unsigned int flags, size_t align, size_t bound, bool contig) +{ + int socket, heap_id, i; + void *ret; + + /* return NULL if size is 0 or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) + return NULL; + + ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align, + bound, contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps. we are only iterating through native DPDK sockets, + * so external heaps won't be included. + */ + for (i = 0; i < (int) rte_socket_count(); i++) { + if (i == heap_id) + continue; + ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align, + bound, contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +static void * +heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id, + unsigned int flags, size_t align, bool contig) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + void *ret; + + rte_spinlock_lock(&(heap->lock)); + + align = align == 0 ? 
1 : align; + + ret = heap_alloc_biggest(heap, type, flags, align, contig); + + rte_spinlock_unlock(&(heap->lock)); + + return ret; +} + +void * +malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags, + size_t align, bool contig) +{ + int socket, i, cur_socket, heap_id; + void *ret; + + /* return NULL if align is not power-of-2 */ + if ((align && !rte_is_power_of_2(align))) + return NULL; + + if (!rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + if (socket_arg == SOCKET_ID_ANY) + socket = malloc_get_numa_socket(); + else + socket = socket_arg; + + /* turn socket ID into heap ID */ + heap_id = malloc_socket_to_heap_id(socket); + /* if heap id is negative, socket ID was invalid */ + if (heap_id < 0) + return NULL; + + ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align, + contig); + if (ret != NULL || socket_arg != SOCKET_ID_ANY) + return ret; + + /* try other heaps */ + for (i = 0; i < (int) rte_socket_count(); i++) { + cur_socket = rte_socket_id_by_idx(i); + if (cur_socket == socket) + continue; + ret = heap_alloc_biggest_on_heap_id(type, i, flags, align, + contig); + if (ret != NULL) + return ret; + } + return NULL; +} + +/* this function is exposed in malloc_mp.h */ +int +malloc_heap_free_pages(void *aligned_start, size_t aligned_len) +{ + int n_segs, seg_idx, max_seg_idx; + struct rte_memseg_list *msl; + size_t page_sz; + + msl = rte_mem_virt2memseg_list(aligned_start); + if (msl == NULL) + return -1; + + page_sz = (size_t)msl->page_sz; + n_segs = aligned_len / page_sz; + seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz; + max_seg_idx = seg_idx + n_segs; + + for (; seg_idx < max_seg_idx; seg_idx++) { + struct rte_memseg *ms; + + ms = rte_fbarray_get(&msl->memseg_arr, seg_idx); + eal_memalloc_free_seg(ms); + } + return 0; +} + +int +malloc_heap_free(struct malloc_elem *elem) +{ + struct malloc_heap *heap; + void *start, *aligned_start, *end, *aligned_end; + size_t len, aligned_len, page_sz; + struct rte_memseg_list *msl; + unsigned int i, n_segs, before_space, after_space; + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + /* elem may be merged with previous element, so keep heap address */ + heap = elem->heap; + msl = elem->msl; + page_sz = (size_t)msl->page_sz; + + rte_spinlock_lock(&(heap->lock)); + + /* mark element as free */ + elem->state = ELEM_FREE; + + elem = malloc_elem_free(elem); + + /* anything after this is a bonus */ + ret = 0; + + /* ...of which we can't avail if we are in legacy mode, or if this is an + * externally allocated segment. + */ + if (internal_config.legacy_mem || (msl->external > 0)) + goto free_unlock; + + /* check if we can free any memory back to the system */ + if (elem->size < page_sz) + goto free_unlock; + + /* if user requested to match allocations, the sizes must match - if not, + * we will defer freeing these hugepages until the entire original allocation + * can be freed + */ + if (internal_config.match_allocations && elem->size != elem->orig_size) + goto free_unlock; + + /* probably, but let's make sure, as we may not be using up full page */ + start = elem; + len = elem->size; + aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz); + end = RTE_PTR_ADD(elem, len); + aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz); + + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + + /* can't free anything */ + if (aligned_len < page_sz) + goto free_unlock; + + /* we can free something. 
however, some of these pages may be marked as + * unfreeable, so also check that as well + */ + n_segs = aligned_len / page_sz; + for (i = 0; i < n_segs; i++) { + const struct rte_memseg *tmp = + rte_mem_virt2memseg(aligned_start, msl); + + if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + /* this is an unfreeable segment, so move start */ + aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len); + } + } + + /* recalculate length and number of segments */ + aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); + n_segs = aligned_len / page_sz; + + /* check if we can still free some pages */ + if (n_segs == 0) + goto free_unlock; + + /* We're not done yet. We also have to check if by freeing space we will + * be leaving free elements that are too small to store new elements. + * Check if we have enough space in the beginning and at the end, or if + * start/end are exactly page aligned. + */ + before_space = RTE_PTR_DIFF(aligned_start, elem); + after_space = RTE_PTR_DIFF(end, aligned_end); + if (before_space != 0 && + before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space before start, but we may be able to + * move the start forward by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move start */ + aligned_start = RTE_PTR_ADD(aligned_start, page_sz); + aligned_len -= page_sz; + n_segs--; + } + if (after_space != 0 && after_space < + MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { + /* There is not enough space after end, but we may be able to + * move the end backwards by one page. + */ + if (n_segs == 1) + goto free_unlock; + + /* move end */ + aligned_end = RTE_PTR_SUB(aligned_end, page_sz); + aligned_len -= page_sz; + n_segs--; + } + + /* now we can finally free us some pages */ + + rte_mcfg_mem_write_lock(); + + /* + * we allow secondary processes to clear the heap of this allocated + * memory because it is safe to do so, as even if notifications about + * unmapped pages don't make it to other processes, heap is shared + * across all processes, and will become empty of this memory anyway, + * and nothing can allocate it back unless primary process will be able + * to deliver allocation message to every single running process. + */ + + malloc_elem_free_list_remove(elem); + + malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len); + + heap->total_size -= aligned_len; + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* notify user about changes in memory map */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + aligned_start, aligned_len); + + /* don't care if any of this fails */ + malloc_heap_free_pages(aligned_start, aligned_len); + + request_sync(); + } else { + struct malloc_mp_req req; + + memset(&req, 0, sizeof(req)); + + req.t = REQ_TYPE_FREE; + req.free_req.addr = aligned_start; + req.free_req.len = aligned_len; + + /* + * we request primary to deallocate pages, but we don't do it + * in this thread. instead, we notify primary that we would like + * to deallocate pages, and this process will receive another + * request (in parallel) that will do it for us on another + * thread. + * + * we also don't really care if this succeeds - the data is + * already removed from the heap, so it is, for all intents and + * purposes, hidden from the rest of DPDK even if some other + * process (including this one) may have these pages mapped. + * + * notifications about deallocated memory happen during sync. 
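+		 *
+		 * (request_to_primary() does still block for up to
+		 * MP_TIMEOUT_S waiting for the MP_ACTION_RESPONSE, but its
+		 * return value is deliberately ignored here)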
+ */ + request_to_primary(&req); + } + + RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n", + msl->socket_id, aligned_len >> 20ULL); + + rte_mcfg_mem_write_unlock(); +free_unlock: + rte_spinlock_unlock(&(heap->lock)); + return ret; +} + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size) +{ + int ret; + + if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY) + return -1; + + rte_spinlock_lock(&(elem->heap->lock)); + + ret = malloc_elem_resize(elem, size); + + rte_spinlock_unlock(&(elem->heap->lock)); + + return ret; +} + +/* + * Function to retrieve data for a given heap + */ +int +malloc_heap_get_stats(struct malloc_heap *heap, + struct rte_malloc_socket_stats *socket_stats) +{ + size_t idx; + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + /* Initialise variables for heap */ + socket_stats->free_count = 0; + socket_stats->heap_freesz_bytes = 0; + socket_stats->greatest_free_size = 0; + + /* Iterate through free list */ + for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) { + for (elem = LIST_FIRST(&heap->free_head[idx]); + !!elem; elem = LIST_NEXT(elem, free_list)) + { + socket_stats->free_count++; + socket_stats->heap_freesz_bytes += elem->size; + if (elem->size > socket_stats->greatest_free_size) + socket_stats->greatest_free_size = elem->size; + } + } + /* Get stats on overall heap and allocated memory on this heap */ + socket_stats->heap_totalsz_bytes = heap->total_size; + socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes - + socket_stats->heap_freesz_bytes); + socket_stats->alloc_count = heap->alloc_count; + + rte_spinlock_unlock(&heap->lock); + return 0; +} + +/* + * Function to retrieve data for a given heap + */ +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f) +{ + struct malloc_elem *elem; + + rte_spinlock_lock(&heap->lock); + + fprintf(f, "Heap size: 0x%zx\n", heap->total_size); + fprintf(f, "Heap alloc count: %u\n", heap->alloc_count); + + elem = heap->first; + while (elem) { + malloc_elem_dump(elem, f); + elem = elem->next; + } + + rte_spinlock_unlock(&heap->lock); +} + +static int +destroy_elem(struct malloc_elem *elem, size_t len) +{ + struct malloc_heap *heap = elem->heap; + + /* notify all subscribers that a memory area is going to be removed */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len); + + /* this element can be removed */ + malloc_elem_free_list_remove(elem); + malloc_elem_hide_region(elem, elem, len); + + heap->total_size -= len; + + memset(elem, 0, sizeof(*elem)); + + return 0; +} + +struct rte_memseg_list * +malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz, const char *seg_name, + unsigned int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + char fbarray_name[RTE_FBARRAY_NAME_LEN]; + struct rte_memseg_list *msl = NULL; + struct rte_fbarray *arr; + size_t seg_len = n_pages * page_sz; + unsigned int i; + + /* first, find a free memseg list */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *tmp = &mcfg->memsegs[i]; + if (tmp->base_va == NULL) { + msl = tmp; + break; + } + } + if (msl == NULL) { + RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n"); + rte_errno = ENOSPC; + return NULL; + } + + snprintf(fbarray_name, sizeof(fbarray_name), "%s_%p", + seg_name, va_addr); + + /* create the backing fbarray */ + if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages, + sizeof(struct rte_memseg)) < 0) { + RTE_LOG(ERR, EAL, "Couldn't create 
fbarray backing the memseg list\n"); + return NULL; + } + arr = &msl->memseg_arr; + + /* fbarray created, fill it up */ + for (i = 0; i < n_pages; i++) { + struct rte_memseg *ms; + + rte_fbarray_set_used(arr, i); + ms = rte_fbarray_get(arr, i); + ms->addr = RTE_PTR_ADD(va_addr, i * page_sz); + ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i]; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->socket_id = socket_id; + } + + /* set up the memseg list */ + msl->base_va = va_addr; + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->len = seg_len; + msl->version = 0; + msl->external = 1; + + return msl; +} + +struct extseg_walk_arg { + void *va_addr; + size_t len; + struct rte_memseg_list *msl; +}; + +static int +extseg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct extseg_walk_arg *wa = arg; + + if (msl->base_va == wa->va_addr && msl->len == wa->len) { + unsigned int found_idx; + + /* msl is const */ + found_idx = msl - mcfg->memsegs; + wa->msl = &mcfg->memsegs[found_idx]; + return 1; + } + return 0; +} + +struct rte_memseg_list * +malloc_heap_find_external_seg(void *va_addr, size_t len) +{ + struct extseg_walk_arg wa; + int res; + + wa.va_addr = va_addr; + wa.len = len; + + res = rte_memseg_list_walk_thread_unsafe(extseg_walk, &wa); + + if (res != 1) { + /* 0 means nothing was found, -1 shouldn't happen */ + if (res == 0) + rte_errno = ENOENT; + return NULL; + } + return wa.msl; +} + +int +malloc_heap_destroy_external_seg(struct rte_memseg_list *msl) +{ + /* destroy the fbarray backing this memory */ + if (rte_fbarray_destroy(&msl->memseg_arr) < 0) + return -1; + + /* reset the memseg list */ + memset(msl, 0, sizeof(*msl)); + + return 0; +} + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, + struct rte_memseg_list *msl) +{ + /* erase contents of new memory */ + memset(msl->base_va, 0, msl->len); + + /* now, add newly minted memory to the malloc heap */ + malloc_heap_add_memory(heap, msl, msl->base_va, msl->len); + + heap->total_size += msl->len; + + /* all done! */ + RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n", + heap->name, msl->base_va); + + /* notify all subscribers that a new memory area has been added */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + msl->base_va, msl->len); + + return 0; +} + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len) +{ + struct malloc_elem *elem = heap->first; + + /* find element with specified va address */ + while (elem != NULL && elem != va_addr) { + elem = elem->next; + /* stop if we've blown past our VA */ + if (elem > (struct malloc_elem *)va_addr) { + rte_errno = ENOENT; + return -1; + } + } + /* check if element was found */ + if (elem == NULL || elem->msl->len != len) { + rte_errno = ENOENT; + return -1; + } + /* if element's size is not equal to segment len, segment is busy */ + if (elem->state == ELEM_BUSY || elem->size != len) { + rte_errno = EBUSY; + return -1; + } + return destroy_elem(elem, len); +} + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + uint32_t next_socket_id = mcfg->next_socket_id; + + /* prevent overflow. did you really create 2 billion heaps??? 
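+	 * (external heaps get synthetic socket IDs counting up from
+	 * EXTERNAL_HEAP_MIN_SOCKET_ID - see rte_eal_malloc_heap_init() - so
+	 * the ID space is finite)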
*/ + if (next_socket_id > INT32_MAX) { + RTE_LOG(ERR, EAL, "Cannot assign new socket ID's\n"); + rte_errno = ENOSPC; + return -1; + } + + /* initialize empty heap */ + heap->alloc_count = 0; + heap->first = NULL; + heap->last = NULL; + LIST_INIT(heap->free_head); + rte_spinlock_init(&heap->lock); + heap->total_size = 0; + heap->socket_id = next_socket_id; + + /* we hold a global mem hotplug writelock, so it's safe to increment */ + mcfg->next_socket_id++; + + /* set up name */ + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + return 0; +} + +int +malloc_heap_destroy(struct malloc_heap *heap) +{ + if (heap->alloc_count != 0) { + RTE_LOG(ERR, EAL, "Heap is still in use\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->first != NULL || heap->last != NULL) { + RTE_LOG(ERR, EAL, "Heap still contains memory segments\n"); + rte_errno = EBUSY; + return -1; + } + if (heap->total_size != 0) + RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n"); + + /* after this, the lock will be dropped */ + memset(heap, 0, sizeof(*heap)); + + return 0; +} + +int +rte_eal_malloc_heap_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + if (internal_config.match_allocations) { + RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n"); + } + + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + /* assign min socket ID to external heaps */ + mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID; + + /* assign names to default DPDK heaps */ + for (i = 0; i < rte_socket_count(); i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + char heap_name[RTE_HEAP_NAME_MAX_LEN]; + int socket_id = rte_socket_id_by_idx(i); + + snprintf(heap_name, sizeof(heap_name), + "socket_%i", socket_id); + strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN); + heap->socket_id = socket_id; + } + } + + + if (register_mp_requests()) { + RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n"); + rte_mcfg_mem_read_unlock(); + return -1; + } + + /* unlock mem hotplug here. it's safe for primary as no requests can + * even come before primary itself is fully initialized, and secondaries + * do not need to initialize the heap. + */ + rte_mcfg_mem_read_unlock(); + + /* secondary process does not need to initialize anything */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + /* add all IOVA-contiguous areas to the heap */ + return rte_memseg_contig_walk(malloc_add_seg, NULL); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h new file mode 100644 index 000000000..772736b53 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_heap.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef MALLOC_HEAP_H_ +#define MALLOC_HEAP_H_ + +#include <stdbool.h> +#include <sys/queue.h> + +#include <rte_malloc.h> +#include <rte_spinlock.h> + +/* Number of free lists per heap, grouped by size. 
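+ * (free elements are binned by size - see malloc_elem_free_list_index() in
+ * malloc_elem.c - and lookups start from the bin matching the requested
+ * size and walk upward)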
*/ +#define RTE_HEAP_NUM_FREELISTS 13 +#define RTE_HEAP_NAME_MAX_LEN 32 + +/* dummy definition, for pointers */ +struct malloc_elem; + +/** + * Structure to hold malloc heap + */ +struct malloc_heap { + rte_spinlock_t lock; + LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS]; + struct malloc_elem *volatile first; + struct malloc_elem *volatile last; + + unsigned int alloc_count; + unsigned int socket_id; + size_t total_size; + char name[RTE_HEAP_NAME_MAX_LEN]; +} __rte_cache_aligned; + +#ifdef __cplusplus +extern "C" { +#endif + +static inline unsigned +malloc_get_numa_socket(void) +{ + unsigned socket_id = rte_socket_id(); + + if (socket_id == (unsigned)SOCKET_ID_ANY) + return 0; + + return socket_id; +} + +void * +malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags, + size_t align, size_t bound, bool contig); + +void * +malloc_heap_alloc_biggest(const char *type, int socket, unsigned int flags, + size_t align, bool contig); + +int +malloc_heap_create(struct malloc_heap *heap, const char *heap_name); + +int +malloc_heap_destroy(struct malloc_heap *heap); + +struct rte_memseg_list * +malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[], + unsigned int n_pages, size_t page_sz, const char *seg_name, + unsigned int socket_id); + +struct rte_memseg_list * +malloc_heap_find_external_seg(void *va_addr, size_t len); + +int +malloc_heap_destroy_external_seg(struct rte_memseg_list *msl); + +int +malloc_heap_add_external_memory(struct malloc_heap *heap, + struct rte_memseg_list *msl); + +int +malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr, + size_t len); + +int +malloc_heap_free(struct malloc_elem *elem); + +int +malloc_heap_resize(struct malloc_elem *elem, size_t size); + +int +malloc_heap_get_stats(struct malloc_heap *heap, + struct rte_malloc_socket_stats *socket_stats); + +void +malloc_heap_dump(struct malloc_heap *heap, FILE *f); + +int +malloc_socket_to_heap_id(unsigned int socket_id); + +int +rte_eal_malloc_heap_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* MALLOC_HEAP_H_ */ diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c new file mode 100644 index 000000000..1f212f834 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.c @@ -0,0 +1,751 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include <string.h> +#include <sys/time.h> + +#include <rte_alarm.h> +#include <rte_errno.h> +#include <rte_string_fns.h> + +#include "eal_memalloc.h" +#include "eal_memcfg.h" + +#include "malloc_elem.h" +#include "malloc_mp.h" + +#define MP_ACTION_SYNC "mp_malloc_sync" +/**< request sent by primary process to notify of changes in memory map */ +#define MP_ACTION_ROLLBACK "mp_malloc_rollback" +/**< request sent by primary process to notify of changes in memory map. this is + * essentially a regular sync request, but we cannot send sync requests while + * another one is in progress, and we might have to - therefore, we do this as + * a separate callback. 
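+ * on the secondary side the rollback action is served by the same
+ * handle_sync() handler - see register_mp_requests() - only the registered
+ * action name differs.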
+ */ +#define MP_ACTION_REQUEST "mp_malloc_request" +/**< request sent by secondary process to ask for allocation/deallocation */ +#define MP_ACTION_RESPONSE "mp_malloc_response" +/**< response sent to secondary process to indicate result of request */ + +/* forward declarations */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); +static int +handle_rollback_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply); + +#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */ + +/* when we're allocating, we need to store some state to ensure that we can + * roll back later + */ +struct primary_alloc_req_state { + struct malloc_heap *heap; + struct rte_memseg **ms; + int ms_len; + struct malloc_elem *elem; + void *map_addr; + size_t map_len; +}; + +enum req_state { + REQ_STATE_INACTIVE = 0, + REQ_STATE_ACTIVE, + REQ_STATE_COMPLETE +}; + +struct mp_request { + TAILQ_ENTRY(mp_request) next; + struct malloc_mp_req user_req; /**< contents of request */ + pthread_cond_t cond; /**< variable we use to time out on this request */ + enum req_state state; /**< indicate status of this request */ + struct primary_alloc_req_state alloc_state; +}; + +/* + * We could've used just a single request, but it may be possible for + * secondaries to timeout earlier than the primary, and send a new request while + * primary is still expecting replies to the old one. Therefore, each new + * request will get assigned a new ID, which is how we will distinguish between + * expected and unexpected messages. + */ +TAILQ_HEAD(mp_request_list, mp_request); +static struct { + struct mp_request_list list; + pthread_mutex_t lock; +} mp_request_list = { + .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list), + .lock = PTHREAD_MUTEX_INITIALIZER +}; + +/** + * General workflow is the following: + * + * Allocation: + * S: send request to primary + * P: attempt to allocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * if success, sendmsg success + * if failure, roll back allocation and send a rollback request + * S: if received msg of success, quit + * if received rollback request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * Aside from timeouts, there are three points where we can quit: + * - if allocation failed straight away + * - if allocation and sync request succeeded + * - if allocation succeeded, sync request failed, allocation rolled back and + * rollback request received (irrespective of whether it succeeded or failed) + * + * Deallocation: + * S: send request to primary + * P: attempt to deallocate memory + * if failed, sendmsg failure + * if success, send sync request + * S: if received msg of failure, quit + * if received sync request, synchronize memory map and reply with result + * P: if received sync request result + * sendmsg sync request result + * S: if received msg, quit + * + * There is no "rollback" from deallocation, as it's safe to have some memory + * mapped in some processes - it's absent from the heap, so it won't get used. 
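+ *
+ * In terms of the actions defined above (sketch): secondary-to-primary
+ * requests travel as MP_ACTION_REQUEST carrying REQ_TYPE_ALLOC or
+ * REQ_TYPE_FREE, primary-to-secondary sync and rollback rounds as
+ * MP_ACTION_SYNC and MP_ACTION_ROLLBACK (both carrying REQ_TYPE_SYNC),
+ * and final results come back via MP_ACTION_RESPONSE.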
+ */ + +static struct mp_request * +find_request_by_id(uint64_t id) +{ + struct mp_request *req; + TAILQ_FOREACH(req, &mp_request_list.list, next) { + if (req->user_req.id == id) + break; + } + return req; +} + +/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */ +static uint64_t +get_unique_id(void) +{ + uint64_t id; + do { + id = rte_rand(); + } while (find_request_by_id(id) != NULL); + return id; +} + +/* secondary will respond to sync requests thusly */ +static int +handle_sync(const struct rte_mp_msg *msg, const void *peer) +{ + struct rte_mp_msg reply; + const struct malloc_mp_req *req = + (const struct malloc_mp_req *)msg->param; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.param; + int ret; + + if (req->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected request from primary\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + reply.num_fds = 0; + strlcpy(reply.name, msg->name, sizeof(reply.name)); + reply.len_param = sizeof(*resp); + + ret = eal_memalloc_sync_with_primary(); + + resp->t = REQ_TYPE_SYNC; + resp->id = req->id; + resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL; + + rte_mp_reply(&reply, peer); + + return 0; +} + +static int +handle_alloc_request(const struct malloc_mp_req *m, + struct mp_request *req) +{ + const struct malloc_req_alloc *ar = &m->alloc_req; + struct malloc_heap *heap; + struct malloc_elem *elem; + struct rte_memseg **ms; + size_t alloc_sz; + int n_segs; + void *map_addr; + + alloc_sz = RTE_ALIGN_CEIL(ar->align + ar->elt_size + + MALLOC_ELEM_TRAILER_LEN, ar->page_sz); + n_segs = alloc_sz / ar->page_sz; + + heap = ar->heap; + + /* we can't know in advance how many pages we'll need, so we malloc */ + ms = malloc(sizeof(*ms) * n_segs); + if (ms == NULL) { + RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n"); + goto fail; + } + memset(ms, 0, sizeof(*ms) * n_segs); + + elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket, + ar->flags, ar->align, ar->bound, ar->contig, ms, + n_segs); + + if (elem == NULL) + goto fail; + + map_addr = ms[0]->addr; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz); + + /* we have succeeded in allocating memory, but we still need to sync + * with other processes. however, since DPDK IPC is single-threaded, we + * send an asynchronous request and exit this callback. 
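+	 *
+	 * (the async reply is handled by handle_sync_response(), which either
+	 * sends the final MP_ACTION_RESPONSE or starts a rollback round)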
+ */ + + req->alloc_state.ms = ms; + req->alloc_state.ms_len = n_segs; + req->alloc_state.map_addr = map_addr; + req->alloc_state.map_len = alloc_sz; + req->alloc_state.elem = elem; + req->alloc_state.heap = heap; + + return 0; +fail: + free(ms); + return -1; +} + +/* first stage of primary handling requests from secondary */ +static int +handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused) +{ + const struct malloc_mp_req *m = + (const struct malloc_mp_req *)msg->param; + struct mp_request *entry; + int ret; + + /* lock access to request */ + pthread_mutex_lock(&mp_request_list.lock); + + /* make sure it's not a dupe */ + entry = find_request_by_id(m->id); + if (entry != NULL) { + RTE_LOG(ERR, EAL, "Duplicate request id\n"); + goto fail; + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n"); + goto fail; + } + + /* erase all data */ + memset(entry, 0, sizeof(*entry)); + + if (m->t == REQ_TYPE_ALLOC) { + ret = handle_alloc_request(m, entry); + } else if (m->t == REQ_TYPE_FREE) { + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + m->free_req.addr, m->free_req.len); + + ret = malloc_heap_free_pages(m->free_req.addr, + m->free_req.len); + } else { + RTE_LOG(ERR, EAL, "Unexpected request from secondary\n"); + goto fail; + } + + if (ret != 0) { + struct rte_mp_msg resp_msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)resp_msg.param; + + /* send failure message straight away */ + resp_msg.num_fds = 0; + resp_msg.len_param = sizeof(*resp); + strlcpy(resp_msg.name, MP_ACTION_RESPONSE, + sizeof(resp_msg.name)); + + resp->t = m->t; + resp->result = REQ_RESULT_FAIL; + resp->id = m->id; + + if (rte_mp_sendmsg(&resp_msg)) { + RTE_LOG(ERR, EAL, "Couldn't send response\n"); + goto fail; + } + /* we did not modify the request */ + free(entry); + } else { + struct rte_mp_msg sr_msg; + struct malloc_mp_req *sr = + (struct malloc_mp_req *)sr_msg.param; + struct timespec ts; + + memset(&sr_msg, 0, sizeof(sr_msg)); + + /* we can do something, so send sync request asynchronously */ + sr_msg.num_fds = 0; + sr_msg.len_param = sizeof(*sr); + strlcpy(sr_msg.name, MP_ACTION_SYNC, sizeof(sr_msg.name)); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* sync requests carry no data */ + sr->t = REQ_TYPE_SYNC; + sr->id = m->id; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_async(&sr_msg, &ts, + handle_sync_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Couldn't send sync request\n"); + if (m->t == REQ_TYPE_ALLOC) + free(entry->alloc_state.ms); + goto fail; + } + + /* mark request as in progress */ + memcpy(&entry->user_req, m, sizeof(*m)); + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + } + pthread_mutex_unlock(&mp_request_list.lock); + return 0; +fail: + pthread_mutex_unlock(&mp_request_list.lock); + free(entry); + return -1; +} + +/* callback for asynchronous sync requests for primary. this will either do a + * sendmsg with results, or trigger rollback request. 
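+ * outcome summary: FREE requests always get their result sent back;
+ * successful ALLOCs additionally bump heap->total_size; failed ALLOCs are
+ * rolled back locally and an MP_ACTION_ROLLBACK round is issued first.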
+ */ +static int +handle_sync_response(const struct rte_mp_msg *request, + const struct rte_mp_reply *reply) +{ + enum malloc_req_result result; + struct mp_request *entry; + const struct malloc_mp_req *mpreq = + (const struct malloc_mp_req *)request->param; + int i; + + /* lock the request */ + pthread_mutex_lock(&mp_request_list.lock); + + entry = find_request_by_id(mpreq->id); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto fail; + } + + result = REQ_RESULT_SUCCESS; + + if (reply->nb_received != reply->nb_sent) + result = REQ_RESULT_FAIL; + + for (i = 0; i < reply->nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply->msgs[i].param; + + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response to sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->id != entry->user_req.id) { + RTE_LOG(ERR, EAL, "Response to wrong sync request\n"); + result = REQ_RESULT_FAIL; + break; + } + if (resp->result == REQ_RESULT_FAIL) { + result = REQ_RESULT_FAIL; + break; + } + } + + if (entry->user_req.t == REQ_TYPE_FREE) { + struct rte_mp_msg msg; + struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + /* this is a free request, just sendmsg result */ + resp->t = REQ_TYPE_FREE; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_SUCCESS) { + struct malloc_heap *heap = entry->alloc_state.heap; + struct rte_mp_msg msg; + struct malloc_mp_req *resp = + (struct malloc_mp_req *)msg.param; + + memset(&msg, 0, sizeof(msg)); + + heap->total_size += entry->alloc_state.map_len; + + /* result is success, so just notify secondary about this */ + resp->t = REQ_TYPE_ALLOC; + resp->result = result; + resp->id = entry->user_req.id; + msg.num_fds = 0; + msg.len_param = sizeof(*resp); + strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name)); + + if (rte_mp_sendmsg(&msg)) + RTE_LOG(ERR, EAL, "Could not send message to secondary process\n"); + + TAILQ_REMOVE(&mp_request_list.list, entry, next); + free(entry->alloc_state.ms); + free(entry); + } else if (entry->user_req.t == REQ_TYPE_ALLOC && + result == REQ_RESULT_FAIL) { + struct rte_mp_msg rb_msg; + struct malloc_mp_req *rb = + (struct malloc_mp_req *)rb_msg.param; + struct timespec ts; + struct primary_alloc_req_state *state = + &entry->alloc_state; + int ret; + + memset(&rb_msg, 0, sizeof(rb_msg)); + + /* we've failed to sync, so do a rollback */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + state->map_addr, state->map_len); + + rollback_expand_heap(state->ms, state->ms_len, state->elem, + state->map_addr, state->map_len); + + /* send rollback request */ + rb_msg.num_fds = 0; + rb_msg.len_param = sizeof(*rb); + strlcpy(rb_msg.name, MP_ACTION_ROLLBACK, sizeof(rb_msg.name)); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* sync requests carry no data */ + rb->t = REQ_TYPE_SYNC; + rb->id = entry->user_req.id; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_async(&rb_msg, &ts, + handle_rollback_response); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n"); 
+
+			/* we couldn't send rollback request, but that's OK -
+			 * secondary will time out, and memory has been removed
+			 * from heap anyway.
+			 */
+			TAILQ_REMOVE(&mp_request_list.list, entry, next);
+			free(state->ms);
+			free(entry);
+			goto fail;
+		}
+	} else {
+		RTE_LOG(ERR, EAL, "Unexpected response to sync request of unknown type\n");
+		goto fail;
+	}
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return 0;
+fail:
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return -1;
+}
+
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+		const struct rte_mp_reply *reply __rte_unused)
+{
+	struct rte_mp_msg msg;
+	struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+	const struct malloc_mp_req *mpreq =
+			(const struct malloc_mp_req *)request->param;
+	struct mp_request *entry;
+
+	/* lock the request */
+	pthread_mutex_lock(&mp_request_list.lock);
+
+	memset(&msg, 0, sizeof(msg));
+
+	entry = find_request_by_id(mpreq->id);
+	if (entry == NULL) {
+		RTE_LOG(ERR, EAL, "Wrong request ID\n");
+		goto fail;
+	}
+
+	if (entry->user_req.t != REQ_TYPE_ALLOC) {
+		RTE_LOG(ERR, EAL, "Unexpected active request\n");
+		goto fail;
+	}
+
+	/* we don't care if rollback succeeded, request still failed */
+	resp->t = REQ_TYPE_ALLOC;
+	resp->result = REQ_RESULT_FAIL;
+	resp->id = mpreq->id;
+	msg.num_fds = 0;
+	msg.len_param = sizeof(*resp);
+	strlcpy(msg.name, MP_ACTION_RESPONSE, sizeof(msg.name));
+
+	if (rte_mp_sendmsg(&msg))
+		RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+	/* clean up */
+	TAILQ_REMOVE(&mp_request_list.list, entry, next);
+	free(entry->alloc_state.ms);
+	free(entry);
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return 0;
+fail:
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+	const struct malloc_mp_req *m =
+			(const struct malloc_mp_req *)msg->param;
+	struct mp_request *entry;
+
+	pthread_mutex_lock(&mp_request_list.lock);
+
+	entry = find_request_by_id(m->id);
+	if (entry != NULL) {
+		/* update request status */
+		entry->user_req.result = m->result;
+
+		entry->state = REQ_STATE_COMPLETE;
+
+		/* trigger thread wakeup */
+		pthread_cond_signal(&entry->cond);
+	}
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+
+	return 0;
+}
+
+/* synchronously request memory map sync; this is called whenever the primary
+ * process initiates an allocation or free that changes the memory map.
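+ * (if IPC is unsupported - rte_errno == ENOTSUP - the sync is treated as
+ * successful, so single-process deployments keep working)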
+ */ +int +request_sync(void) +{ + struct rte_mp_msg msg; + struct rte_mp_reply reply; + struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param; + struct timespec ts; + int i, ret = -1; + + memset(&msg, 0, sizeof(msg)); + memset(&reply, 0, sizeof(reply)); + + /* no need to create tailq entries as this is entirely synchronous */ + + msg.num_fds = 0; + msg.len_param = sizeof(*req); + strlcpy(msg.name, MP_ACTION_SYNC, sizeof(msg.name)); + + /* sync request carries no data */ + req->t = REQ_TYPE_SYNC; + req->id = get_unique_id(); + + ts.tv_nsec = 0; + ts.tv_sec = MP_TIMEOUT_S; + + /* there may be stray timeout still waiting */ + do { + ret = rte_mp_request_sync(&msg, &reply, &ts); + } while (ret != 0 && rte_errno == EEXIST); + if (ret != 0) { + /* if IPC is unsupported, behave as if the call succeeded */ + if (rte_errno != ENOTSUP) + RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n"); + else + ret = 0; + goto out; + } + + if (reply.nb_received != reply.nb_sent) { + RTE_LOG(ERR, EAL, "Not all secondaries have responded\n"); + goto out; + } + + for (i = 0; i < reply.nb_received; i++) { + struct malloc_mp_req *resp = + (struct malloc_mp_req *)reply.msgs[i].param; + if (resp->t != REQ_TYPE_SYNC) { + RTE_LOG(ERR, EAL, "Unexpected response from secondary\n"); + goto out; + } + if (resp->id != req->id) { + RTE_LOG(ERR, EAL, "Wrong request ID\n"); + goto out; + } + if (resp->result != REQ_RESULT_SUCCESS) { + RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n"); + goto out; + } + } + + ret = 0; +out: + free(reply.msgs); + return ret; +} + +/* this is a synchronous wrapper around a bunch of asynchronous requests to + * primary process. this will initiate a request and wait until responses come. + */ +int +request_to_primary(struct malloc_mp_req *user_req) +{ + struct rte_mp_msg msg; + struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param; + struct mp_request *entry; + struct timespec ts; + struct timeval now; + int ret; + + memset(&msg, 0, sizeof(msg)); + memset(&ts, 0, sizeof(ts)); + + pthread_mutex_lock(&mp_request_list.lock); + + entry = malloc(sizeof(*entry)); + if (entry == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n"); + goto fail; + } + + memset(entry, 0, sizeof(*entry)); + + if (gettimeofday(&now, NULL) < 0) { + RTE_LOG(ERR, EAL, "Cannot get current time\n"); + goto fail; + } + + ts.tv_nsec = (now.tv_usec * 1000) % 1000000000; + ts.tv_sec = now.tv_sec + MP_TIMEOUT_S + + (now.tv_usec * 1000) / 1000000000; + + /* initialize the request */ + pthread_cond_init(&entry->cond, NULL); + + msg.num_fds = 0; + msg.len_param = sizeof(*msg_req); + strlcpy(msg.name, MP_ACTION_REQUEST, sizeof(msg.name)); + + /* (attempt to) get a unique id */ + user_req->id = get_unique_id(); + + /* copy contents of user request into the message */ + memcpy(msg_req, user_req, sizeof(*msg_req)); + + if (rte_mp_sendmsg(&msg)) { + RTE_LOG(ERR, EAL, "Cannot send message to primary\n"); + goto fail; + } + + /* copy contents of user request into active request */ + memcpy(&entry->user_req, user_req, sizeof(*user_req)); + + /* mark request as in progress */ + entry->state = REQ_STATE_ACTIVE; + + TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next); + + /* finally, wait on timeout */ + do { + ret = pthread_cond_timedwait(&entry->cond, + &mp_request_list.lock, &ts); + } while (ret != 0 && ret != ETIMEDOUT); + + if (entry->state != REQ_STATE_COMPLETE) { + RTE_LOG(ERR, EAL, "Request timed out\n"); + ret = -1; + } else { + ret = 0; + user_req->result = 
entry->user_req.result;
+	}
+	TAILQ_REMOVE(&mp_request_list.list, entry, next);
+	free(entry);
+
+	pthread_mutex_unlock(&mp_request_list.lock);
+	return ret;
+fail:
+	pthread_mutex_unlock(&mp_request_list.lock);
+	free(entry);
+	return -1;
+}
+
+int
+register_mp_requests(void)
+{
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		/* it's OK for primary to not support IPC */
+		if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request) &&
+				rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+				MP_ACTION_REQUEST);
+			return -1;
+		}
+	} else {
+		if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) {
+			RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+				MP_ACTION_SYNC);
+			return -1;
+		}
+		if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) {
+			RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+				MP_ACTION_ROLLBACK);
+			return -1;
+		}
+		if (rte_mp_action_register(MP_ACTION_RESPONSE,
+				handle_response)) {
+			RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+				MP_ACTION_RESPONSE);
+			return -1;
+		}
+	}
+	return 0;
+}
diff --git a/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h
new file mode 100644
index 000000000..2b86b76f6
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/malloc_mp.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef MALLOC_MP_H
+#define MALLOC_MP_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_random.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+/* forward declarations */
+struct malloc_heap;
+struct rte_memseg;
+
+/* multiprocess synchronization structures for malloc */
+enum malloc_req_type {
+	REQ_TYPE_ALLOC,	/**< ask primary to allocate */
+	REQ_TYPE_FREE,	/**< ask primary to free */
+	REQ_TYPE_SYNC	/**< ask secondary to synchronize its memory map */
+};
+
+enum malloc_req_result {
+	REQ_RESULT_SUCCESS,
+	REQ_RESULT_FAIL
+};
+
+struct malloc_req_alloc {
+	struct malloc_heap *heap;
+	uint64_t page_sz;
+	size_t elt_size;
+	int socket;
+	unsigned int flags;
+	size_t align;
+	size_t bound;
+	bool contig;
+};
+
+struct malloc_req_free {
+	RTE_STD_C11
+	union {
+		void *addr;
+		uint64_t addr_64;
+	};
+	uint64_t len;
+};
+
+struct malloc_mp_req {
+	enum malloc_req_type t;
+	RTE_STD_C11
+	union {
+		struct malloc_req_alloc alloc_req;
+		struct malloc_req_free free_req;
+	};
+	uint64_t id; /**< not to be populated by caller */
+	enum malloc_req_result result;
+};
+
+int
+register_mp_requests(void);
+
+int
+request_to_primary(struct malloc_mp_req *req);
+
+/* synchronous memory map sync request */
+int
+request_sync(void);
+
+/* functions from malloc_heap exposed here */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len);
+
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+		int socket, unsigned int flags, size_t align, size_t bound,
+		bool contig, struct rte_memseg **ms, int n_segs);
+
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_segs,
+		struct malloc_elem *elem, void *map_addr, size_t map_len);
+
+#endif /* MALLOC_MP_H */
diff --git a/src/spdk/dpdk/lib/librte_eal/common/meson.build b/src/spdk/dpdk/lib/librte_eal/common/meson.build
new file mode 100644
index 000000000..55aaeb18e
--- /dev/null
+++ b/src/spdk/dpdk/lib/librte_eal/common/meson.build
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+includes += include_directories('.')
+
+if 
is_windows + sources += files( + 'eal_common_bus.c', + 'eal_common_class.c', + 'eal_common_devargs.c', + 'eal_common_errno.c', + 'eal_common_launch.c', + 'eal_common_lcore.c', + 'eal_common_log.c', + 'eal_common_options.c', + 'eal_common_thread.c', + ) + subdir_done() +endif + +sources += files( + 'eal_common_bus.c', + 'eal_common_cpuflags.c', + 'eal_common_class.c', + 'eal_common_devargs.c', + 'eal_common_dev.c', + 'eal_common_errno.c', + 'eal_common_fbarray.c', + 'eal_common_hexdump.c', + 'eal_common_hypervisor.c', + 'eal_common_launch.c', + 'eal_common_lcore.c', + 'eal_common_log.c', + 'eal_common_mcfg.c', + 'eal_common_memalloc.c', + 'eal_common_memory.c', + 'eal_common_memzone.c', + 'eal_common_options.c', + 'eal_common_proc.c', + 'eal_common_string_fns.c', + 'eal_common_tailqs.c', + 'eal_common_thread.c', + 'eal_common_timer.c', + 'eal_common_trace.c', + 'eal_common_trace_ctf.c', + 'eal_common_trace_points.c', + 'eal_common_trace_utils.c', + 'eal_common_uuid.c', + 'hotplug_mp.c', + 'malloc_elem.c', + 'malloc_heap.c', + 'malloc_mp.c', + 'rte_keepalive.c', + 'rte_malloc.c', + 'rte_random.c', + 'rte_reciprocal.c', + 'rte_service.c', +) diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c b/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c new file mode 100644 index 000000000..e0494b201 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_keepalive.c @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2015-2016 Intel Corporation + */ + +#include <inttypes.h> + +#include <rte_common.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_log.h> +#include <rte_keepalive.h> +#include <rte_malloc.h> + +struct rte_keepalive { + /** Core Liveness. */ + struct { + /* + * Each element must be cache aligned to prevent false sharing. + */ + enum rte_keepalive_state core_state __rte_cache_aligned; + } live_data[RTE_KEEPALIVE_MAXCORES]; + + /** Last-seen-alive timestamps */ + uint64_t last_alive[RTE_KEEPALIVE_MAXCORES]; + + /** + * Cores to check. + * Indexed by core id, non-zero if the core should be checked. + */ + uint8_t active_cores[RTE_KEEPALIVE_MAXCORES]; + + /** Dead core handler. */ + rte_keepalive_failure_callback_t callback; + + /** + * Dead core handler app data. + * Pointer is passed to dead core handler. + */ + void *callback_data; + uint64_t tsc_initial; + uint64_t tsc_mhz; + + /** Core state relay handler. */ + rte_keepalive_relay_callback_t relay_callback; + + /** + * Core state relay handler app data. + * Pointer is passed to live core handler. + */ + void *relay_callback_data; +}; + +static void +print_trace(const char *msg, struct rte_keepalive *keepcfg, int idx_core) +{ + RTE_LOG(INFO, EAL, "%sLast seen %" PRId64 "ms ago.\n", + msg, + ((rte_rdtsc() - keepcfg->last_alive[idx_core])*1000) + / rte_get_tsc_hz() + ); +} + +void +rte_keepalive_dispatch_pings(__rte_unused void *ptr_timer, + void *ptr_data) +{ + struct rte_keepalive *keepcfg = ptr_data; + int idx_core; + + for (idx_core = 0; idx_core < RTE_KEEPALIVE_MAXCORES; idx_core++) { + if (keepcfg->active_cores[idx_core] == 0) + continue; + + switch (keepcfg->live_data[idx_core].core_state) { + case RTE_KA_STATE_UNUSED: + break; + case RTE_KA_STATE_ALIVE: /* Alive */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_MISSING; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_MISSING: /* MIA */ + print_trace("Core MIA. 
", keepcfg, idx_core); + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_DEAD; + break; + case RTE_KA_STATE_DEAD: /* Dead */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_GONE; + print_trace("Core died. ", keepcfg, idx_core); + if (keepcfg->callback) + keepcfg->callback( + keepcfg->callback_data, + idx_core + ); + break; + case RTE_KA_STATE_GONE: /* Buried */ + break; + case RTE_KA_STATE_DOZING: /* Core going idle */ + keepcfg->live_data[idx_core].core_state = + RTE_KA_STATE_SLEEP; + keepcfg->last_alive[idx_core] = rte_rdtsc(); + break; + case RTE_KA_STATE_SLEEP: /* Idled core */ + break; + } + if (keepcfg->relay_callback) + keepcfg->relay_callback( + keepcfg->relay_callback_data, + idx_core, + keepcfg->live_data[idx_core].core_state, + keepcfg->last_alive[idx_core] + ); + } +} + +struct rte_keepalive * +rte_keepalive_create(rte_keepalive_failure_callback_t callback, + void *data) +{ + struct rte_keepalive *keepcfg; + + keepcfg = rte_zmalloc("RTE_EAL_KEEPALIVE", + sizeof(struct rte_keepalive), + RTE_CACHE_LINE_SIZE); + if (keepcfg != NULL) { + keepcfg->callback = callback; + keepcfg->callback_data = data; + keepcfg->tsc_initial = rte_rdtsc(); + keepcfg->tsc_mhz = rte_get_tsc_hz() / 1000; + } + return keepcfg; +} + +void rte_keepalive_register_relay_callback(struct rte_keepalive *keepcfg, + rte_keepalive_relay_callback_t callback, + void *data) +{ + keepcfg->relay_callback = callback; + keepcfg->relay_callback_data = data; +} + +void +rte_keepalive_register_core(struct rte_keepalive *keepcfg, const int id_core) +{ + if (id_core < RTE_KEEPALIVE_MAXCORES) { + keepcfg->active_cores[id_core] = RTE_KA_STATE_ALIVE; + keepcfg->last_alive[id_core] = rte_rdtsc(); + } +} + +void +rte_keepalive_mark_alive(struct rte_keepalive *keepcfg) +{ + keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_ALIVE; +} + +void +rte_keepalive_mark_sleep(struct rte_keepalive *keepcfg) +{ + keepcfg->live_data[rte_lcore_id()].core_state = RTE_KA_STATE_DOZING; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c b/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c new file mode 100644 index 000000000..f1b73168b --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_malloc.c @@ -0,0 +1,668 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <sys/queue.h> + +#include <rte_errno.h> +#include <rte_memcpy.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_eal_memconfig.h> +#include <rte_branch_prediction.h> +#include <rte_debug.h> +#include <rte_launch.h> +#include <rte_per_lcore.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_spinlock.h> +#include <rte_eal_trace.h> + +#include <rte_malloc.h> +#include "malloc_elem.h" +#include "malloc_heap.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_private.h" + + +/* Free the memory space back to heap */ +static void +mem_free(void *addr, const bool trace_ena) +{ + if (trace_ena) + rte_eal_trace_mem_free(addr); + + if (addr == NULL) return; + if (malloc_heap_free(malloc_elem_from_data(addr)) < 0) + RTE_LOG(ERR, EAL, "Error: Invalid memory\n"); +} + +void +rte_free(void *addr) +{ + return mem_free(addr, true); +} + +void +eal_free_no_trace(void *addr) +{ + return mem_free(addr, false); +} + +static void * +malloc_socket(const char *type, size_t size, unsigned int align, + int socket_arg, const bool trace_ena) +{ + void *ptr; + + /* return NULL if size is 0 
or alignment is not power-of-2 */ + if (size == 0 || (align && !rte_is_power_of_2(align))) + return NULL; + + /* if there are no hugepages and if we are not allocating from an + * external heap, use memory from any socket available. checking for + * socket being external may return -1 in case of invalid socket, but + * that's OK - if there are no hugepages, it doesn't matter. + */ + if (rte_malloc_heap_socket_is_external(socket_arg) != 1 && + !rte_eal_has_hugepages()) + socket_arg = SOCKET_ID_ANY; + + ptr = malloc_heap_alloc(type, size, socket_arg, 0, + align == 0 ? 1 : align, 0, false); + + if (trace_ena) + rte_eal_trace_mem_malloc(type, size, align, socket_arg, ptr); + return ptr; +} + +/* + * Allocate memory on specified heap. + */ +void * +rte_malloc_socket(const char *type, size_t size, unsigned int align, + int socket_arg) +{ + return malloc_socket(type, size, align, socket_arg, true); +} + +void * +eal_malloc_no_trace(const char *type, size_t size, unsigned int align) +{ + return malloc_socket(type, size, align, SOCKET_ID_ANY, false); +} + +/* + * Allocate memory on default heap. + */ +void * +rte_malloc(const char *type, size_t size, unsigned align) +{ + return rte_malloc_socket(type, size, align, SOCKET_ID_ANY); +} + +/* + * Allocate zero'd memory on specified heap. + */ +void * +rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket) +{ + void *ptr = rte_malloc_socket(type, size, align, socket); + +#ifdef RTE_MALLOC_DEBUG + /* + * If DEBUG is enabled, then freed memory is marked with poison + * value and set to zero on allocation. + * If DEBUG is not enabled then memory is already zeroed. + */ + if (ptr != NULL) + memset(ptr, 0, size); +#endif + + rte_eal_trace_mem_zmalloc(type, size, align, socket, ptr); + return ptr; +} + +/* + * Allocate zero'd memory on default heap. + */ +void * +rte_zmalloc(const char *type, size_t size, unsigned align) +{ + return rte_zmalloc_socket(type, size, align, SOCKET_ID_ANY); +} + +/* + * Allocate zero'd memory on specified heap. + */ +void * +rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket) +{ + return rte_zmalloc_socket(type, num * size, align, socket); +} + +/* + * Allocate zero'd memory on default heap. + */ +void * +rte_calloc(const char *type, size_t num, size_t size, unsigned align) +{ + return rte_zmalloc(type, num * size, align); +} + +/* + * Resize allocated memory on specified heap. + */ +void * +rte_realloc_socket(void *ptr, size_t size, unsigned int align, int socket) +{ + if (ptr == NULL) + return rte_malloc_socket(NULL, size, align, socket); + + struct malloc_elem *elem = malloc_elem_from_data(ptr); + if (elem == NULL) { + RTE_LOG(ERR, EAL, "Error: memory corruption detected\n"); + return NULL; + } + + size = RTE_CACHE_LINE_ROUNDUP(size), align = RTE_CACHE_LINE_ROUNDUP(align); + + /* check requested socket id and alignment matches first, and if ok, + * see if we can resize block + */ + if ((socket == SOCKET_ID_ANY || + (unsigned int)socket == elem->heap->socket_id) && + RTE_PTR_ALIGN(ptr, align) == ptr && + malloc_heap_resize(elem, size) == 0) { + rte_eal_trace_mem_realloc(size, align, socket, ptr); + return ptr; + } + + /* either requested socket id doesn't match, alignment is off + * or we have no room to expand, + * so move the data. 
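+ * A minimal usage sketch (sizes illustrative; assumes EAL is
+ * initialized, and that rte_realloc may return a new pointer):
+ *   char *p = rte_malloc(NULL, 64, 0);
+ *   p = rte_realloc(p, 256, 0);   contents survive a possible move
+ *   rte_free(p);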
+ */ + void *new_ptr = rte_malloc_socket(NULL, size, align, socket); + if (new_ptr == NULL) + return NULL; + /* elem: |pad|data_elem|data|trailer| */ + const size_t old_size = elem->size - elem->pad - MALLOC_ELEM_OVERHEAD; + rte_memcpy(new_ptr, ptr, old_size < size ? old_size : size); + rte_free(ptr); + + rte_eal_trace_mem_realloc(size, align, socket, new_ptr); + return new_ptr; +} + +/* + * Resize allocated memory. + */ +void * +rte_realloc(void *ptr, size_t size, unsigned int align) +{ + return rte_realloc_socket(ptr, size, align, SOCKET_ID_ANY); +} + +int +rte_malloc_validate(const void *ptr, size_t *size) +{ + const struct malloc_elem *elem = malloc_elem_from_data(ptr); + if (!malloc_elem_cookies_ok(elem)) + return -1; + if (size != NULL) + *size = elem->size - elem->pad - MALLOC_ELEM_OVERHEAD; + return 0; +} + +/* + * Function to retrieve data for heap on given socket + */ +int +rte_malloc_get_socket_stats(int socket, + struct rte_malloc_socket_stats *socket_stats) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int heap_idx; + + heap_idx = malloc_socket_to_heap_id(socket); + if (heap_idx < 0) + return -1; + + return malloc_heap_get_stats(&mcfg->malloc_heaps[heap_idx], + socket_stats); +} + +/* + * Function to dump contents of all heaps + */ +void +rte_malloc_dump_heaps(FILE *f) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + fprintf(f, "Heap id: %u\n", idx); + malloc_heap_dump(&mcfg->malloc_heaps[idx], f); + } +} + +int +rte_malloc_heap_get_socket(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + unsigned int idx; + int ret; + + if (name == NULL || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_read_lock(); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if (!strncmp(name, tmp->name, RTE_HEAP_NAME_MAX_LEN)) { + heap = tmp; + break; + } + } + + if (heap != NULL) { + ret = heap->socket_id; + } else { + rte_errno = ENOENT; + ret = -1; + } + rte_mcfg_mem_read_unlock(); + + return ret; +} + +int +rte_malloc_heap_socket_is_external(int socket_id) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int idx; + int ret = -1; + + if (socket_id == SOCKET_ID_ANY) + return 0; + + rte_mcfg_mem_read_lock(); + for (idx = 0; idx < RTE_MAX_HEAPS; idx++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[idx]; + + if ((int)tmp->socket_id == socket_id) { + /* external memory always has large socket ID's */ + ret = tmp->socket_id >= RTE_MAX_NUMA_NODES; + break; + } + } + rte_mcfg_mem_read_unlock(); + + return ret; +} + +/* + * Print stats on memory type. 
If type is NULL, info on all types is printed + */ +void +rte_malloc_dump_stats(FILE *f, __rte_unused const char *type) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int heap_id; + struct rte_malloc_socket_stats sock_stats; + + /* Iterate through all initialised heaps */ + for (heap_id = 0; heap_id < RTE_MAX_HEAPS; heap_id++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id]; + + malloc_heap_get_stats(heap, &sock_stats); + + fprintf(f, "Heap id:%u\n", heap_id); + fprintf(f, "\tHeap name:%s\n", heap->name); + fprintf(f, "\tHeap_size:%zu,\n", sock_stats.heap_totalsz_bytes); + fprintf(f, "\tFree_size:%zu,\n", sock_stats.heap_freesz_bytes); + fprintf(f, "\tAlloc_size:%zu,\n", sock_stats.heap_allocsz_bytes); + fprintf(f, "\tGreatest_free_size:%zu,\n", + sock_stats.greatest_free_size); + fprintf(f, "\tAlloc_count:%u,\n",sock_stats.alloc_count); + fprintf(f, "\tFree_count:%u,\n", sock_stats.free_count); + } + return; +} + +/* + * TODO: Set limit to memory that can be allocated to memory type + */ +int +rte_malloc_set_limit(__rte_unused const char *type, + __rte_unused size_t max) +{ + return 0; +} + +/* + * Return the IO address of a virtual address obtained through rte_malloc + */ +rte_iova_t +rte_malloc_virt2iova(const void *addr) +{ + const struct rte_memseg *ms; + struct malloc_elem *elem = malloc_elem_from_data(addr); + + if (elem == NULL) + return RTE_BAD_IOVA; + + if (!elem->msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t) addr; + + ms = rte_mem_virt2memseg(addr, elem->msl); + if (ms == NULL) + return RTE_BAD_IOVA; + + if (ms->iova == RTE_BAD_IOVA) + return RTE_BAD_IOVA; + + return ms->iova + RTE_PTR_DIFF(addr, ms->addr); +} + +static struct malloc_heap * +find_named_heap(const char *name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int i; + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *heap = &mcfg->malloc_heaps[i]; + + if (!strncmp(name, heap->name, RTE_HEAP_NAME_MAX_LEN)) + return heap; + } + return NULL; +} + +int +rte_malloc_heap_memory_add(const char *heap_name, void *va_addr, size_t len, + rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz) +{ + struct malloc_heap *heap = NULL; + struct rte_memseg_list *msl; + unsigned int n; + int ret; + + if (heap_name == NULL || va_addr == NULL || + page_sz == 0 || !rte_is_power_of_2(page_sz) || + RTE_ALIGN(len, page_sz) != len || + !rte_is_aligned(va_addr, page_sz) || + ((len / page_sz) != n_pages && iova_addrs != NULL) || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot add memory to internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + n = len / page_sz; + + msl = malloc_heap_create_external_seg(va_addr, iova_addrs, n, page_sz, + heap_name, heap->socket_id); + if (msl == NULL) { + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_add_external_memory(heap, msl); + msl->heap = 1; /* mark it as heap segment */ + rte_spinlock_unlock(&heap->lock); + +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} + +int +rte_malloc_heap_memory_remove(const char *heap_name, void *va_addr, size_t len) +{ + struct malloc_heap 
*heap = NULL; + struct rte_memseg_list *msl; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + /* cannot remove memory from internal heaps */ + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + ret = -1; + goto unlock; + } + + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_remove_external_memory(heap, va_addr, len); + rte_spinlock_unlock(&heap->lock); + if (ret != 0) + goto unlock; + + ret = malloc_heap_destroy_external_seg(msl); + +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} + +static int +sync_memory(const char *heap_name, void *va_addr, size_t len, bool attach) +{ + struct malloc_heap *heap = NULL; + struct rte_memseg_list *msl; + int ret; + + if (heap_name == NULL || va_addr == NULL || len == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_read_lock(); + + /* find our heap */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to sync to internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + + /* find corresponding memseg list to sync to */ + msl = malloc_heap_find_external_seg(va_addr, len); + if (msl == NULL) { + ret = -1; + goto unlock; + } + + if (attach) { + ret = rte_fbarray_attach(&msl->memseg_arr); + if (ret == 0) { + /* notify all subscribers that a new memory area was + * added. + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + va_addr, len); + } else { + ret = -1; + goto unlock; + } + } else { + /* notify all subscribers that a memory area is about to + * be removed. + */ + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + msl->base_va, msl->len); + ret = rte_fbarray_detach(&msl->memseg_arr); + if (ret < 0) { + ret = -1; + goto unlock; + } + } +unlock: + rte_mcfg_mem_read_unlock(); + return ret; +} + +int +rte_malloc_heap_memory_attach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, true); +} + +int +rte_malloc_heap_memory_detach(const char *heap_name, void *va_addr, size_t len) +{ + return sync_memory(heap_name, va_addr, len, false); +} + +int +rte_malloc_heap_create(const char *heap_name) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct malloc_heap *heap = NULL; + int i, ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + /* check if there is space in the heap list, or if heap with this name + * already exists. 
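+ * The typical external-heap sequence built on this function, as a
+ * sketch (va, len and pgsz are caller-supplied examples; the memory
+ * must be page-aligned and a multiple of pgsz; a NULL iova table is
+ * accepted, per the parameter checks in rte_malloc_heap_memory_add):
+ *   rte_malloc_heap_create("user_heap");
+ *   rte_malloc_heap_memory_add("user_heap", va, len, NULL,
+ *           len / pgsz, pgsz);
+ *   int sock = rte_malloc_heap_get_socket("user_heap");
+ *   void *obj = rte_malloc_socket(NULL, 4096, 0, sock);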
+ */ + rte_mcfg_mem_write_lock(); + + for (i = 0; i < RTE_MAX_HEAPS; i++) { + struct malloc_heap *tmp = &mcfg->malloc_heaps[i]; + /* existing heap */ + if (strncmp(heap_name, tmp->name, + RTE_HEAP_NAME_MAX_LEN) == 0) { + RTE_LOG(ERR, EAL, "Heap %s already exists\n", + heap_name); + rte_errno = EEXIST; + ret = -1; + goto unlock; + } + /* empty heap */ + if (strnlen(tmp->name, RTE_HEAP_NAME_MAX_LEN) == 0) { + heap = tmp; + break; + } + } + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Cannot create new heap: no space\n"); + rte_errno = ENOSPC; + ret = -1; + goto unlock; + } + + /* we're sure that we can create a new heap, so do it */ + ret = malloc_heap_create(heap, heap_name); +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} + +int +rte_malloc_heap_destroy(const char *heap_name) +{ + struct malloc_heap *heap = NULL; + int ret; + + if (heap_name == NULL || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == 0 || + strnlen(heap_name, RTE_HEAP_NAME_MAX_LEN) == + RTE_HEAP_NAME_MAX_LEN) { + rte_errno = EINVAL; + return -1; + } + rte_mcfg_mem_write_lock(); + + /* start from non-socket heaps */ + heap = find_named_heap(heap_name); + if (heap == NULL) { + RTE_LOG(ERR, EAL, "Heap %s not found\n", heap_name); + rte_errno = ENOENT; + ret = -1; + goto unlock; + } + /* we shouldn't be able to destroy internal heaps */ + if (heap->socket_id < RTE_MAX_NUMA_NODES) { + rte_errno = EPERM; + ret = -1; + goto unlock; + } + /* sanity checks done, now we can destroy the heap */ + rte_spinlock_lock(&heap->lock); + ret = malloc_heap_destroy(heap); + + /* if we failed, lock is still active */ + if (ret < 0) + rte_spinlock_unlock(&heap->lock); +unlock: + rte_mcfg_mem_write_unlock(); + + return ret; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_random.c b/src/spdk/dpdk/lib/librte_eal/common/rte_random.c new file mode 100644 index 000000000..b7a089ac4 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_random.c @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Ericsson AB + */ + +#ifdef RTE_MACHINE_CPUFLAG_RDSEED +#include <x86intrin.h> +#endif +#include <stdlib.h> +#include <unistd.h> + +#include <rte_branch_prediction.h> +#include <rte_cycles.h> +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_memory.h> +#include <rte_random.h> + +struct rte_rand_state { + uint64_t z1; + uint64_t z2; + uint64_t z3; + uint64_t z4; + uint64_t z5; +} __rte_cache_aligned; + +static struct rte_rand_state rand_states[RTE_MAX_LCORE]; + +static uint32_t +__rte_rand_lcg32(uint32_t *seed) +{ + *seed = 1103515245U * *seed + 12345U; + + return *seed; +} + +static uint64_t +__rte_rand_lcg64(uint32_t *seed) +{ + uint64_t low; + uint64_t high; + + /* A 64-bit LCG would have been much cleaner, but good + * multiplier/increments for such seem hard to come by. 
+ */ + + low = __rte_rand_lcg32(seed); + high = __rte_rand_lcg32(seed); + + return low | (high << 32); +} + +static uint64_t +__rte_rand_lfsr258_gen_seed(uint32_t *seed, uint64_t min_value) +{ + uint64_t res; + + res = __rte_rand_lcg64(seed); + + if (res < min_value) + res += min_value; + + return res; +} + +static void +__rte_srand_lfsr258(uint64_t seed, struct rte_rand_state *state) +{ + uint32_t lcg_seed; + + lcg_seed = (uint32_t)(seed ^ (seed >> 32)); + + state->z1 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 2UL); + state->z2 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 512UL); + state->z3 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 4096UL); + state->z4 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 131072UL); + state->z5 = __rte_rand_lfsr258_gen_seed(&lcg_seed, 8388608UL); +} + +void +rte_srand(uint64_t seed) +{ + unsigned int lcore_id; + + /* add lcore_id to seed to avoid having the same sequence */ + for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) + __rte_srand_lfsr258(seed + lcore_id, &rand_states[lcore_id]); +} + +static __rte_always_inline uint64_t +__rte_rand_lfsr258_comp(uint64_t z, uint64_t a, uint64_t b, uint64_t c, + uint64_t d) +{ + return ((z & c) << d) ^ (((z << a) ^ z) >> b); +} + +/* Based on L’Ecuyer, P.: Tables of maximally equidistributed combined + * LFSR generators. + */ + +static __rte_always_inline uint64_t +__rte_rand_lfsr258(struct rte_rand_state *state) +{ + state->z1 = __rte_rand_lfsr258_comp(state->z1, 1UL, 53UL, + 18446744073709551614UL, 10UL); + state->z2 = __rte_rand_lfsr258_comp(state->z2, 24UL, 50UL, + 18446744073709551104UL, 5UL); + state->z3 = __rte_rand_lfsr258_comp(state->z3, 3UL, 23UL, + 18446744073709547520UL, 29UL); + state->z4 = __rte_rand_lfsr258_comp(state->z4, 5UL, 24UL, + 18446744073709420544UL, 23UL); + state->z5 = __rte_rand_lfsr258_comp(state->z5, 3UL, 33UL, + 18446744073701163008UL, 8UL); + + return state->z1 ^ state->z2 ^ state->z3 ^ state->z4 ^ state->z5; +} + +static __rte_always_inline +struct rte_rand_state *__rte_rand_get_state(void) +{ + unsigned int lcore_id; + + lcore_id = rte_lcore_id(); + + if (unlikely(lcore_id == LCORE_ID_ANY)) + lcore_id = rte_get_master_lcore(); + + return &rand_states[lcore_id]; +} + +uint64_t +rte_rand(void) +{ + struct rte_rand_state *state; + + state = __rte_rand_get_state(); + + return __rte_rand_lfsr258(state); +} + +uint64_t +rte_rand_max(uint64_t upper_bound) +{ + struct rte_rand_state *state; + uint8_t ones; + uint8_t leading_zeros; + uint64_t mask = ~((uint64_t)0); + uint64_t res; + + if (unlikely(upper_bound < 2)) + return 0; + + state = __rte_rand_get_state(); + + ones = __builtin_popcountll(upper_bound); + + /* Handle power-of-2 upper_bound as a special case, since it + * has no bias issues. + */ + if (unlikely(ones == 1)) + return __rte_rand_lfsr258(state) & (upper_bound - 1); + + /* The approach to avoiding bias is to create a mask that + * stretches beyond the request value range, and up to the + * next power-of-2. In case the masked generated random value + * is equal to or greater than the upper bound, just discard + * the value and generate a new one. 
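+ * Worked example: upper_bound = 100 has seven significant bits, so
+ * the mask below becomes 127; a draw is kept with probability
+ * 100/128, i.e. on average fewer than two LFSR iterations per call.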
+ */ + + leading_zeros = __builtin_clzll(upper_bound); + mask >>= leading_zeros; + + do { + res = __rte_rand_lfsr258(state) & mask; + } while (unlikely(res >= upper_bound)); + + return res; +} + +static uint64_t +__rte_random_initial_seed(void) +{ +#ifdef RTE_LIBEAL_USE_GETENTROPY + int ge_rc; + uint64_t ge_seed; + + ge_rc = getentropy(&ge_seed, sizeof(ge_seed)); + + if (ge_rc == 0) + return ge_seed; +#endif +#ifdef RTE_MACHINE_CPUFLAG_RDSEED + unsigned int rdseed_low; + unsigned int rdseed_high; + + /* first fallback: rdseed instruction, if available */ + if (_rdseed32_step(&rdseed_low) == 1 && + _rdseed32_step(&rdseed_high) == 1) + return (uint64_t)rdseed_low | ((uint64_t)rdseed_high << 32); +#endif + /* second fallback: seed using rdtsc */ + return rte_get_tsc_cycles(); +} + +RTE_INIT(rte_rand_init) +{ + uint64_t seed; + + seed = __rte_random_initial_seed(); + + rte_srand(seed); +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c b/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c new file mode 100644 index 000000000..42dfa44eb --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_reciprocal.c @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Cavium, Inc + * Copyright(c) Hannes Frederic Sowa + * All rights reserved. + */ + +#include <stdio.h> +#include <stdint.h> + +#include <rte_common.h> + +#include "rte_reciprocal.h" + +struct rte_reciprocal rte_reciprocal_value(uint32_t d) +{ + struct rte_reciprocal R; + uint64_t m; + int l; + + l = rte_fls_u32(d - 1); + m = ((1ULL << 32) * ((1ULL << l) - d)); + m /= d; + + ++m; + R.m = m; + R.sh1 = RTE_MIN(l, 1); + R.sh2 = RTE_MAX(l - 1, 0); + + return R; +} + +/* + * Code taken from Hacker's Delight: + * http://www.hackersdelight.org/hdcodetxt/divlu.c.txt + * License permits inclusion here per: + * http://www.hackersdelight.org/permissions.htm + */ +static uint64_t +divide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) +{ + const uint64_t b = (1ULL << 32); /* Number base (32 bits). */ + uint64_t un1, un0, /* Norm. dividend LSD's. */ + vn1, vn0, /* Norm. divisor digits. */ + q1, q0, /* Quotient digits. */ + un64, un21, un10, /* Dividend digit pairs. */ + rhat; /* A remainder. */ + int s; /* Shift amount for norm. */ + + /* If overflow, set rem. to an impossible value. */ + if (u1 >= v) { + if (r != NULL) + *r = (uint64_t) -1; + return (uint64_t) -1; + } + + /* Count leading zeros. */ + s = __builtin_clzll(v); + if (s > 0) { + v = v << s; + un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31)); + un10 = u0 << s; + } else { + + un64 = u1 | u0; + un10 = u0; + } + + vn1 = v >> 32; + vn0 = v & 0xFFFFFFFF; + + un1 = un10 >> 32; + un0 = un10 & 0xFFFFFFFF; + + q1 = un64/vn1; + rhat = un64 - q1*vn1; +again1: + if (q1 >= b || q1*vn0 > b*rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat < b) + goto again1; + } + + un21 = un64*b + un1 - q1*v; + + q0 = un21/vn1; + rhat = un21 - q0*vn1; +again2: + if (q0 >= b || q0*vn0 > b*rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat < b) + goto again2; + } + + if (r != NULL) + *r = (un21*b + un0 - q0*v) >> s; + return q1*b + q0; +} + +struct rte_reciprocal_u64 +rte_reciprocal_value_u64(uint64_t d) +{ + struct rte_reciprocal_u64 R; + uint64_t m; + uint64_t r; + int l; + + l = 63 - __builtin_clzll(d); + + m = divide_128_div_64_to_64((1ULL << l), 0, d, &r) << 1; + if (r << 1 < r || r << 1 >= d) + m++; + m = (1ULL << l) - d ? m + 1 : 1; + R.m = m; + + R.sh1 = l > 1 ? 1 : l; + R.sh2 = (l > 0) ? l : 0; + R.sh2 -= R.sh2 && (m == 1) ? 
1 : 0; + + return R; +} diff --git a/src/spdk/dpdk/lib/librte_eal/common/rte_service.c b/src/spdk/dpdk/lib/librte_eal/common/rte_service.c new file mode 100644 index 000000000..6123a2124 --- /dev/null +++ b/src/spdk/dpdk/lib/librte_eal/common/rte_service.c @@ -0,0 +1,919 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017 Intel Corporation + */ + +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <limits.h> +#include <string.h> + +#include <rte_compat.h> +#include <rte_service.h> +#include <rte_service_component.h> + +#include <rte_eal.h> +#include <rte_lcore.h> +#include <rte_common.h> +#include <rte_debug.h> +#include <rte_cycles.h> +#include <rte_atomic.h> +#include <rte_memory.h> +#include <rte_malloc.h> +#include <rte_spinlock.h> + +#include "eal_private.h" + +#define RTE_SERVICE_NUM_MAX 64 + +#define SERVICE_F_REGISTERED (1 << 0) +#define SERVICE_F_STATS_ENABLED (1 << 1) +#define SERVICE_F_START_CHECK (1 << 2) + +/* runstates for services and lcores, denoting if they are active or not */ +#define RUNSTATE_STOPPED 0 +#define RUNSTATE_RUNNING 1 + +/* internal representation of a service */ +struct rte_service_spec_impl { + /* public part of the struct */ + struct rte_service_spec spec; + + /* spin lock that when set indicates a service core is currently + * running this service callback. When not set, a core may take the + * lock and then run the service callback. + */ + rte_spinlock_t execute_lock; + + /* API set/get-able variables */ + int8_t app_runstate; + int8_t comp_runstate; + uint8_t internal_flags; + + /* per service statistics */ + /* Indicates how many cores the service is mapped to run on. + * It does not indicate the number of cores the service is running + * on currently. + */ + uint32_t num_mapped_cores; + uint64_t calls; + uint64_t cycles_spent; +} __rte_cache_aligned; + +/* the internal values of a service core */ +struct core_state { + /* map of services IDs are run on this core */ + uint64_t service_mask; + uint8_t runstate; /* running or stopped */ + uint8_t is_service_core; /* set if core is currently a service core */ + uint8_t service_active_on_lcore[RTE_SERVICE_NUM_MAX]; + uint64_t loops; + uint64_t calls_per_service[RTE_SERVICE_NUM_MAX]; +} __rte_cache_aligned; + +static uint32_t rte_service_count; +static struct rte_service_spec_impl *rte_services; +static struct core_state *lcore_states; +static uint32_t rte_service_library_initialized; + +int32_t +rte_service_init(void) +{ + if (rte_service_library_initialized) { + RTE_LOG(NOTICE, EAL, + "service library init() called, init flag %d\n", + rte_service_library_initialized); + return -EALREADY; + } + + rte_services = rte_calloc("rte_services", RTE_SERVICE_NUM_MAX, + sizeof(struct rte_service_spec_impl), + RTE_CACHE_LINE_SIZE); + if (!rte_services) { + RTE_LOG(ERR, EAL, "error allocating rte services array\n"); + goto fail_mem; + } + + lcore_states = rte_calloc("rte_service_core_states", RTE_MAX_LCORE, + sizeof(struct core_state), RTE_CACHE_LINE_SIZE); + if (!lcore_states) { + RTE_LOG(ERR, EAL, "error allocating core states array\n"); + goto fail_mem; + } + + int i; + int count = 0; + struct rte_config *cfg = rte_eal_get_configuration(); + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_config[i].core_role == ROLE_SERVICE) { + if ((unsigned int)i == cfg->master_lcore) + continue; + rte_service_lcore_add(i); + count++; + } + } + + rte_service_library_initialized = 1; + return 0; +fail_mem: + rte_free(rte_services); + rte_free(lcore_states); + return -ENOMEM; +} + +void 
+rte_service_finalize(void) +{ + if (!rte_service_library_initialized) + return; + + rte_service_lcore_reset_all(); + rte_eal_mp_wait_lcore(); + + rte_free(rte_services); + rte_free(lcore_states); + + rte_service_library_initialized = 0; +} + +/* returns 1 if service is registered and has not been unregistered + * Returns 0 if service never registered, or has been unregistered + */ +static inline int +service_valid(uint32_t id) +{ + return !!(rte_services[id].internal_flags & SERVICE_F_REGISTERED); +} + +static struct rte_service_spec_impl * +service_get(uint32_t id) +{ + return &rte_services[id]; +} + +/* validate ID and retrieve service pointer, or return error value */ +#define SERVICE_VALID_GET_OR_ERR_RET(id, service, retval) do { \ + if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) \ + return retval; \ + service = &rte_services[id]; \ +} while (0) + +/* returns 1 if statistics should be collected for service + * Returns 0 if statistics should not be collected for service + */ +static inline int +service_stats_enabled(struct rte_service_spec_impl *impl) +{ + return !!(impl->internal_flags & SERVICE_F_STATS_ENABLED); +} + +static inline int +service_mt_safe(struct rte_service_spec_impl *s) +{ + return !!(s->spec.capabilities & RTE_SERVICE_CAP_MT_SAFE); +} + +int32_t +rte_service_set_stats_enable(uint32_t id, int32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + + if (enabled) + s->internal_flags |= SERVICE_F_STATS_ENABLED; + else + s->internal_flags &= ~(SERVICE_F_STATS_ENABLED); + + return 0; +} + +int32_t +rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + + if (enabled) + s->internal_flags |= SERVICE_F_START_CHECK; + else + s->internal_flags &= ~(SERVICE_F_START_CHECK); + + return 0; +} + +uint32_t +rte_service_get_count(void) +{ + return rte_service_count; +} + +int32_t +rte_service_get_by_name(const char *name, uint32_t *service_id) +{ + if (!service_id) + return -EINVAL; + + int i; + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (service_valid(i) && + strcmp(name, rte_services[i].spec.name) == 0) { + *service_id = i; + return 0; + } + } + + return -ENODEV; +} + +const char * +rte_service_get_name(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, 0); + return s->spec.name; +} + +int32_t +rte_service_probe_capability(uint32_t id, uint32_t capability) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + return !!(s->spec.capabilities & capability); +} + +int32_t +rte_service_component_register(const struct rte_service_spec *spec, + uint32_t *id_ptr) +{ + uint32_t i; + int32_t free_slot = -1; + + if (spec->callback == NULL || strlen(spec->name) == 0) + return -EINVAL; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) { + free_slot = i; + break; + } + } + + if ((free_slot < 0) || (i == RTE_SERVICE_NUM_MAX)) + return -ENOSPC; + + struct rte_service_spec_impl *s = &rte_services[free_slot]; + s->spec = *spec; + s->internal_flags |= SERVICE_F_REGISTERED | SERVICE_F_START_CHECK; + + rte_service_count++; + + if (id_ptr) + *id_ptr = free_slot; + + return 0; +} + +int32_t +rte_service_component_unregister(uint32_t id) +{ + uint32_t i; + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + rte_service_count--; + + s->internal_flags &= ~(SERVICE_F_REGISTERED); + + /* clear the run-bit in all cores */ + for (i = 0; i < 
RTE_MAX_LCORE; i++) + lcore_states[i].service_mask &= ~(UINT64_C(1) << id); + + memset(&rte_services[id], 0, sizeof(struct rte_service_spec_impl)); + + return 0; +} + +int32_t +rte_service_component_runstate_set(uint32_t id, uint32_t runstate) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* comp_runstate act as the guard variable. Use store-release + * memory order. This synchronizes with load-acquire in + * service_run and service_runstate_get function. + */ + if (runstate) + __atomic_store_n(&s->comp_runstate, RUNSTATE_RUNNING, + __ATOMIC_RELEASE); + else + __atomic_store_n(&s->comp_runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return 0; +} + +int32_t +rte_service_runstate_set(uint32_t id, uint32_t runstate) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* app_runstate act as the guard variable. Use store-release + * memory order. This synchronizes with load-acquire in + * service_run runstate_get function. + */ + if (runstate) + __atomic_store_n(&s->app_runstate, RUNSTATE_RUNNING, + __ATOMIC_RELEASE); + else + __atomic_store_n(&s->app_runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return 0; +} + +int32_t +rte_service_runstate_get(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* comp_runstate and app_runstate act as the guard variables. + * Use load-acquire memory order. This synchronizes with + * store-release in service state set functions. + */ + if (__atomic_load_n(&s->comp_runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING && + __atomic_load_n(&s->app_runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING) { + int check_disabled = !(s->internal_flags & + SERVICE_F_START_CHECK); + int lcore_mapped = (__atomic_load_n(&s->num_mapped_cores, + __ATOMIC_RELAXED) > 0); + + return (check_disabled | lcore_mapped); + } else + return 0; + +} + +static inline void +service_runner_do_callback(struct rte_service_spec_impl *s, + struct core_state *cs, uint32_t service_idx) +{ + void *userdata = s->spec.callback_userdata; + + if (service_stats_enabled(s)) { + uint64_t start = rte_rdtsc(); + s->spec.callback(userdata); + uint64_t end = rte_rdtsc(); + s->cycles_spent += end - start; + cs->calls_per_service[service_idx]++; + s->calls++; + } else + s->spec.callback(userdata); +} + + +/* Expects the service 's' is valid. */ +static int32_t +service_run(uint32_t i, struct core_state *cs, uint64_t service_mask, + struct rte_service_spec_impl *s, uint32_t serialize_mt_unsafe) +{ + if (!s) + return -EINVAL; + + /* comp_runstate and app_runstate act as the guard variables. + * Use load-acquire memory order. This synchronizes with + * store-release in service state set functions. 
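+ * The pairing, as a sketch:
+ *   writer: __atomic_store_n(&s->app_runstate, RUNSTATE_RUNNING,
+ *           __ATOMIC_RELEASE);
+ *   reader: __atomic_load_n(&s->app_runstate, __ATOMIC_ACQUIRE);
+ * so anything written before the release store is visible once the
+ * acquire load observes RUNSTATE_RUNNING.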
+ */ + if (__atomic_load_n(&s->comp_runstate, __ATOMIC_ACQUIRE) != + RUNSTATE_RUNNING || + __atomic_load_n(&s->app_runstate, __ATOMIC_ACQUIRE) != + RUNSTATE_RUNNING || + !(service_mask & (UINT64_C(1) << i))) { + cs->service_active_on_lcore[i] = 0; + return -ENOEXEC; + } + + cs->service_active_on_lcore[i] = 1; + + if ((service_mt_safe(s) == 0) && (serialize_mt_unsafe == 1)) { + if (!rte_spinlock_trylock(&s->execute_lock)) + return -EBUSY; + + service_runner_do_callback(s, cs, i); + rte_spinlock_unlock(&s->execute_lock); + } else + service_runner_do_callback(s, cs, i); + + return 0; +} + +int32_t +rte_service_may_be_active(uint32_t id) +{ + uint32_t ids[RTE_MAX_LCORE] = {0}; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + int i; + + if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) + return -EINVAL; + + for (i = 0; i < lcore_count; i++) { + if (lcore_states[i].service_active_on_lcore[id]) + return 1; + } + + return 0; +} + +int32_t +rte_service_run_iter_on_app_lcore(uint32_t id, uint32_t serialize_mt_unsafe) +{ + struct core_state *cs = &lcore_states[rte_lcore_id()]; + struct rte_service_spec_impl *s; + + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + /* Increment num_mapped_cores to reflect that this core is + * now mapped capable of running the service. + */ + __atomic_add_fetch(&s->num_mapped_cores, 1, __ATOMIC_RELAXED); + + int ret = service_run(id, cs, UINT64_MAX, s, serialize_mt_unsafe); + + __atomic_sub_fetch(&s->num_mapped_cores, 1, __ATOMIC_RELAXED); + + return ret; +} + +static int32_t +service_runner_func(void *arg) +{ + RTE_SET_USED(arg); + uint32_t i; + const int lcore = rte_lcore_id(); + struct core_state *cs = &lcore_states[lcore]; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + while (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING) { + const uint64_t service_mask = cs->service_mask; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + /* return value ignored as no change to code flow */ + service_run(i, cs, service_mask, service_get(i), 1); + } + + cs->loops++; + } + + lcore_config[lcore].state = WAIT; + + return 0; +} + +int32_t +rte_service_lcore_count(void) +{ + int32_t count = 0; + uint32_t i; + for (i = 0; i < RTE_MAX_LCORE; i++) + count += lcore_states[i].is_service_core; + return count; +} + +int32_t +rte_service_lcore_list(uint32_t array[], uint32_t n) +{ + uint32_t count = rte_service_lcore_count(); + if (count > n) + return -ENOMEM; + + if (!array) + return -EINVAL; + + uint32_t i; + uint32_t idx = 0; + for (i = 0; i < RTE_MAX_LCORE; i++) { + struct core_state *cs = &lcore_states[i]; + if (cs->is_service_core) { + array[idx] = i; + idx++; + } + } + + return count; +} + +int32_t +rte_service_lcore_count_services(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + return __builtin_popcountll(cs->service_mask); +} + +int32_t +rte_service_start_with_defaults(void) +{ + /* create a default mapping from cores to services, then start the + * services to make them transparent to unaware applications. 
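+ * A registration sketch ("my_svc" and cb are application-provided
+ * examples, not part of this file):
+ *   struct rte_service_spec spec = { .name = "my_svc",
+ *           .callback = cb };
+ *   uint32_t id;
+ *   rte_service_component_register(&spec, &id);
+ *   rte_service_component_runstate_set(id, 1);
+ *   rte_service_start_with_defaults();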
+ */ + uint32_t i; + int ret; + uint32_t count = rte_service_get_count(); + + int32_t lcore_iter = 0; + uint32_t ids[RTE_MAX_LCORE] = {0}; + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + + if (lcore_count == 0) + return -ENOTSUP; + + for (i = 0; (int)i < lcore_count; i++) + rte_service_lcore_start(ids[i]); + + for (i = 0; i < count; i++) { + /* do 1:1 core mapping here, with each service getting + * assigned a single core by default. Adding multiple services + * should multiplex to a single core, or 1:1 if there are the + * same amount of services as service-cores + */ + ret = rte_service_map_lcore_set(i, ids[lcore_iter], 1); + if (ret) + return -ENODEV; + + lcore_iter++; + if (lcore_iter >= lcore_count) + lcore_iter = 0; + + ret = rte_service_runstate_set(i, 1); + if (ret) + return -ENOEXEC; + } + + return 0; +} + +static int32_t +service_update(uint32_t sid, uint32_t lcore, uint32_t *set, uint32_t *enabled) +{ + /* validate ID, or return error value */ + if (sid >= RTE_SERVICE_NUM_MAX || !service_valid(sid) || + lcore >= RTE_MAX_LCORE || !lcore_states[lcore].is_service_core) + return -EINVAL; + + uint64_t sid_mask = UINT64_C(1) << sid; + if (set) { + uint64_t lcore_mapped = lcore_states[lcore].service_mask & + sid_mask; + + if (*set && !lcore_mapped) { + lcore_states[lcore].service_mask |= sid_mask; + __atomic_add_fetch(&rte_services[sid].num_mapped_cores, + 1, __ATOMIC_RELAXED); + } + if (!*set && lcore_mapped) { + lcore_states[lcore].service_mask &= ~(sid_mask); + __atomic_sub_fetch(&rte_services[sid].num_mapped_cores, + 1, __ATOMIC_RELAXED); + } + } + + if (enabled) + *enabled = !!(lcore_states[lcore].service_mask & (sid_mask)); + + return 0; +} + +int32_t +rte_service_map_lcore_set(uint32_t id, uint32_t lcore, uint32_t enabled) +{ + uint32_t on = enabled > 0; + return service_update(id, lcore, &on, 0); +} + +int32_t +rte_service_map_lcore_get(uint32_t id, uint32_t lcore) +{ + uint32_t enabled; + int ret = service_update(id, lcore, 0, &enabled); + if (ret == 0) + return enabled; + return ret; +} + +static void +set_lcore_state(uint32_t lcore, int32_t state) +{ + /* mark core state in hugepage backed config */ + struct rte_config *cfg = rte_eal_get_configuration(); + cfg->lcore_role[lcore] = state; + + /* mark state in process local lcore_config */ + lcore_config[lcore].core_role = state; + + /* update per-lcore optimized state tracking */ + lcore_states[lcore].is_service_core = (state == ROLE_SERVICE); +} + +int32_t +rte_service_lcore_reset_all(void) +{ + /* loop over cores, reset all to mask 0 */ + uint32_t i; + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_states[i].is_service_core) { + lcore_states[i].service_mask = 0; + set_lcore_state(i, ROLE_RTE); + /* runstate act as guard variable Use + * store-release memory order here to synchronize + * with load-acquire in runstate read functions. + */ + __atomic_store_n(&lcore_states[i].runstate, + RUNSTATE_STOPPED, __ATOMIC_RELEASE); + } + } + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) + __atomic_store_n(&rte_services[i].num_mapped_cores, 0, + __ATOMIC_RELAXED); + + return 0; +} + +int32_t +rte_service_lcore_add(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + if (lcore_states[lcore].is_service_core) + return -EALREADY; + + set_lcore_state(lcore, ROLE_SERVICE); + + /* ensure that after adding a core the mask and state are defaults */ + lcore_states[lcore].service_mask = 0; + /* Use store-release memory order here to synchronize with + * load-acquire in runstate read functions. 
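+ * Manual alternative to rte_service_start_with_defaults(), as a
+ * sketch (id and lcore are application-chosen):
+ *   rte_service_lcore_add(lcore);
+ *   rte_service_map_lcore_set(id, lcore, 1);
+ *   rte_service_lcore_start(lcore);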
+ */ + __atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return rte_eal_wait_lcore(lcore); +} + +int32_t +rte_service_lcore_del(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -EINVAL; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + if (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) != + RUNSTATE_STOPPED) + return -EBUSY; + + set_lcore_state(lcore, ROLE_RTE); + + rte_smp_wmb(); + return 0; +} + +int32_t +rte_service_lcore_start(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + struct core_state *cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -EINVAL; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + if (__atomic_load_n(&cs->runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_RUNNING) + return -EALREADY; + + /* set core to run state first, and then launch otherwise it will + * return immediately as runstate keeps it in the service poll loop + */ + /* Use load-acquire memory order here to synchronize with + * store-release in runstate update functions. + */ + __atomic_store_n(&cs->runstate, RUNSTATE_RUNNING, __ATOMIC_RELEASE); + + int ret = rte_eal_remote_launch(service_runner_func, 0, lcore); + /* returns -EBUSY if the core is already launched, 0 on success */ + return ret; +} + +int32_t +rte_service_lcore_stop(uint32_t lcore) +{ + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + /* runstate act as the guard variable. Use load-acquire + * memory order here to synchronize with store-release + * in runstate update functions. + */ + if (__atomic_load_n(&lcore_states[lcore].runstate, __ATOMIC_ACQUIRE) == + RUNSTATE_STOPPED) + return -EALREADY; + + uint32_t i; + uint64_t service_mask = lcore_states[lcore].service_mask; + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + int32_t enabled = service_mask & (UINT64_C(1) << i); + int32_t service_running = rte_service_runstate_get(i); + int32_t only_core = (1 == + __atomic_load_n(&rte_services[i].num_mapped_cores, + __ATOMIC_RELAXED)); + + /* if the core is mapped, and the service is running, and this + * is the only core that is mapped, the service would cease to + * run if this core stopped, so fail instead. + */ + if (enabled && service_running && only_core) + return -EBUSY; + } + + /* Use store-release memory order here to synchronize with + * load-acquire in runstate read functions. 
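+ * To avoid the -EBUSY case above, unmap the service first (sketch;
+ * id and lcore as in the mapping calls elsewhere in this file):
+ *   rte_service_map_lcore_set(id, lcore, 0);
+ *   rte_service_lcore_stop(lcore);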
+ */ + __atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED, + __ATOMIC_RELEASE); + + return 0; +} + +int32_t +rte_service_attr_get(uint32_t id, uint32_t attr_id, uint64_t *attr_value) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + if (!attr_value) + return -EINVAL; + + switch (attr_id) { + case RTE_SERVICE_ATTR_CYCLES: + *attr_value = s->cycles_spent; + return 0; + case RTE_SERVICE_ATTR_CALL_COUNT: + *attr_value = s->calls; + return 0; + default: + return -EINVAL; + } +} + +int32_t +rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + uint64_t *attr_value) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE || !attr_value) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + switch (attr_id) { + case RTE_SERVICE_LCORE_ATTR_LOOPS: + *attr_value = cs->loops; + return 0; + default: + return -EINVAL; + } +} + +static void +service_dump_one(FILE *f, struct rte_service_spec_impl *s, uint32_t reset) +{ + /* avoid divide by zero */ + int calls = 1; + if (s->calls != 0) + calls = s->calls; + + if (reset) { + s->cycles_spent = 0; + s->calls = 0; + return; + } + + if (f == NULL) + return; + + fprintf(f, " %s: stats %d\tcalls %"PRIu64"\tcycles %" + PRIu64"\tavg: %"PRIu64"\n", + s->spec.name, service_stats_enabled(s), s->calls, + s->cycles_spent, s->cycles_spent / calls); +} + +int32_t +rte_service_attr_reset_all(uint32_t id) +{ + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + int reset = 1; + service_dump_one(NULL, s, reset); + return 0; +} + +int32_t +rte_service_lcore_attr_reset_all(uint32_t lcore) +{ + struct core_state *cs; + + if (lcore >= RTE_MAX_LCORE) + return -EINVAL; + + cs = &lcore_states[lcore]; + if (!cs->is_service_core) + return -ENOTSUP; + + cs->loops = 0; + + return 0; +} + +static void +service_dump_calls_per_lcore(FILE *f, uint32_t lcore, uint32_t reset) +{ + uint32_t i; + struct core_state *cs = &lcore_states[lcore]; + + fprintf(f, "%02d\t", lcore); + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + fprintf(f, "%"PRIu64"\t", cs->calls_per_service[i]); + if (reset) + cs->calls_per_service[i] = 0; + } + fprintf(f, "\n"); +} + +int32_t +rte_service_dump(FILE *f, uint32_t id) +{ + uint32_t i; + int print_one = (id != UINT32_MAX); + + /* print only the specified service */ + if (print_one) { + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + fprintf(f, "Service %s Summary\n", s->spec.name); + uint32_t reset = 0; + service_dump_one(f, s, reset); + return 0; + } + + /* print all services, as UINT32_MAX was passed as id */ + fprintf(f, "Services Summary\n"); + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { + if (!service_valid(i)) + continue; + uint32_t reset = 0; + service_dump_one(f, &rte_services[i], reset); + } + + fprintf(f, "Service Cores Summary\n"); + for (i = 0; i < RTE_MAX_LCORE; i++) { + if (lcore_config[i].core_role != ROLE_SERVICE) + continue; + + uint32_t reset = 0; + service_dump_calls_per_lcore(f, i, reset); + } + + return 0; +} |