From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 20:24:20 +0200 Subject: Adding upstream version 14.2.21. Signed-off-by: Daniel Baumann --- src/seastar/dpdk/drivers/event/sw/Makefile | 62 ++ src/seastar/dpdk/drivers/event/sw/event_ring.h | 185 +++++ src/seastar/dpdk/drivers/event/sw/iq_ring.h | 176 +++++ .../drivers/event/sw/rte_pmd_evdev_sw_version.map | 3 + src/seastar/dpdk/drivers/event/sw/sw_evdev.c | 833 +++++++++++++++++++++ src/seastar/dpdk/drivers/event/sw/sw_evdev.h | 318 ++++++++ .../dpdk/drivers/event/sw/sw_evdev_scheduler.c | 601 +++++++++++++++ .../dpdk/drivers/event/sw/sw_evdev_worker.c | 185 +++++ .../dpdk/drivers/event/sw/sw_evdev_xstats.c | 674 +++++++++++++++++ 9 files changed, 3037 insertions(+) create mode 100644 src/seastar/dpdk/drivers/event/sw/Makefile create mode 100644 src/seastar/dpdk/drivers/event/sw/event_ring.h create mode 100644 src/seastar/dpdk/drivers/event/sw/iq_ring.h create mode 100644 src/seastar/dpdk/drivers/event/sw/rte_pmd_evdev_sw_version.map create mode 100644 src/seastar/dpdk/drivers/event/sw/sw_evdev.c create mode 100644 src/seastar/dpdk/drivers/event/sw/sw_evdev.h create mode 100644 src/seastar/dpdk/drivers/event/sw/sw_evdev_scheduler.c create mode 100644 src/seastar/dpdk/drivers/event/sw/sw_evdev_worker.c create mode 100644 src/seastar/dpdk/drivers/event/sw/sw_evdev_xstats.c (limited to 'src/seastar/dpdk/drivers/event/sw') diff --git a/src/seastar/dpdk/drivers/event/sw/Makefile b/src/seastar/dpdk/drivers/event/sw/Makefile new file mode 100644 index 00000000..857a87cc --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/Makefile @@ -0,0 +1,62 @@ +# BSD LICENSE +# +# Copyright(c) 2016-2017 Intel Corporation. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +# library name +LIB = librte_pmd_sw_event.a + +# build flags +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) +# for older GCC versions, allow us to initialize an event using +# designated initializers. 
+ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +ifeq ($(shell test $(GCC_VERSION) -le 50 && echo 1), 1) +CFLAGS += -Wno-missing-field-initializers +endif +endif + +# library version +LIBABIVER := 1 + +# versioning export map +EXPORT_MAP := rte_pmd_evdev_sw_version.map + +# library source files +SRCS-$(CONFIG_RTE_LIBRTE_PMD_SW_EVENTDEV) += sw_evdev.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_SW_EVENTDEV) += sw_evdev_worker.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_SW_EVENTDEV) += sw_evdev_scheduler.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_SW_EVENTDEV) += sw_evdev_xstats.c + +# export include files +SYMLINK-y-include += + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/seastar/dpdk/drivers/event/sw/event_ring.h b/src/seastar/dpdk/drivers/event/sw/event_ring.h new file mode 100644 index 00000000..cdaee95d --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/event_ring.h @@ -0,0 +1,185 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Generic ring structure for passing events from one core to another. + * + * Used by the software scheduler for the producer and consumer rings for + * each port, i.e. for passing events from worker cores to scheduler and + * vice-versa. Designed for single-producer, single-consumer use with two + * cores working on each ring. + */ + +#ifndef _EVENT_RING_ +#define _EVENT_RING_ + +#include + +#include +#include +#include + +#define QE_RING_NAMESIZE 32 + +struct qe_ring { + char name[QE_RING_NAMESIZE] __rte_cache_aligned; + uint32_t ring_size; /* size of memory block allocated to the ring */ + uint32_t mask; /* mask for read/write values == ring_size -1 */ + uint32_t size; /* actual usable space in the ring */ + volatile uint32_t write_idx __rte_cache_aligned; + volatile uint32_t read_idx __rte_cache_aligned; + + struct rte_event ring[0] __rte_cache_aligned; +}; + +#ifndef force_inline +#define force_inline inline __attribute__((always_inline)) +#endif + +static inline struct qe_ring * +qe_ring_create(const char *name, unsigned int size, unsigned int socket_id) +{ + struct qe_ring *retval; + const uint32_t ring_size = rte_align32pow2(size + 1); + size_t memsize = sizeof(*retval) + + (ring_size * sizeof(retval->ring[0])); + + retval = rte_zmalloc_socket(NULL, memsize, 0, socket_id); + if (retval == NULL) + goto end; + + snprintf(retval->name, sizeof(retval->name), "EVDEV_RG_%s", name); + retval->ring_size = ring_size; + 
retval->mask = ring_size - 1; + retval->size = size; +end: + return retval; +} + +static inline void +qe_ring_destroy(struct qe_ring *r) +{ + rte_free(r); +} + +static force_inline unsigned int +qe_ring_count(const struct qe_ring *r) +{ + return r->write_idx - r->read_idx; +} + +static force_inline unsigned int +qe_ring_free_count(const struct qe_ring *r) +{ + return r->size - qe_ring_count(r); +} + +static force_inline unsigned int +qe_ring_enqueue_burst(struct qe_ring *r, const struct rte_event *qes, + unsigned int nb_qes, uint16_t *free_count) +{ + const uint32_t size = r->size; + const uint32_t mask = r->mask; + const uint32_t read = r->read_idx; + uint32_t write = r->write_idx; + const uint32_t space = read + size - write; + uint32_t i; + + if (space < nb_qes) + nb_qes = space; + + for (i = 0; i < nb_qes; i++, write++) + r->ring[write & mask] = qes[i]; + + rte_smp_wmb(); + + if (nb_qes != 0) + r->write_idx = write; + + *free_count = space - nb_qes; + + return nb_qes; +} + +static force_inline unsigned int +qe_ring_enqueue_burst_with_ops(struct qe_ring *r, const struct rte_event *qes, + unsigned int nb_qes, uint8_t *ops) +{ + const uint32_t size = r->size; + const uint32_t mask = r->mask; + const uint32_t read = r->read_idx; + uint32_t write = r->write_idx; + const uint32_t space = read + size - write; + uint32_t i; + + if (space < nb_qes) + nb_qes = space; + + for (i = 0; i < nb_qes; i++, write++) { + r->ring[write & mask] = qes[i]; + r->ring[write & mask].op = ops[i]; + } + + rte_smp_wmb(); + + if (nb_qes != 0) + r->write_idx = write; + + return nb_qes; +} + +static force_inline unsigned int +qe_ring_dequeue_burst(struct qe_ring *r, struct rte_event *qes, + unsigned int nb_qes) +{ + const uint32_t mask = r->mask; + uint32_t read = r->read_idx; + const uint32_t write = r->write_idx; + const uint32_t items = write - read; + uint32_t i; + + if (items < nb_qes) + nb_qes = items; + + + for (i = 0; i < nb_qes; i++, read++) + qes[i] = r->ring[read & mask]; + + 
rte_smp_rmb(); + + if (nb_qes != 0) + r->read_idx += nb_qes; + + return nb_qes; +} + +#endif diff --git a/src/seastar/dpdk/drivers/event/sw/iq_ring.h b/src/seastar/dpdk/drivers/event/sw/iq_ring.h new file mode 100644 index 00000000..d480d156 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/iq_ring.h @@ -0,0 +1,176 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Ring structure definitions used for the internal ring buffers of the + * SW eventdev implementation. These are designed for single-core use only. + */ +#ifndef _IQ_RING_ +#define _IQ_RING_ + +#include + +#include +#include +#include +#include + +#define IQ_RING_NAMESIZE 12 +#define QID_IQ_DEPTH 512 +#define QID_IQ_MASK (uint16_t)(QID_IQ_DEPTH - 1) + +struct iq_ring { + char name[IQ_RING_NAMESIZE] __rte_cache_aligned; + uint16_t write_idx; + uint16_t read_idx; + + struct rte_event ring[QID_IQ_DEPTH]; +}; + +#ifndef force_inline +#define force_inline inline __attribute__((always_inline)) +#endif + +static inline struct iq_ring * +iq_ring_create(const char *name, unsigned int socket_id) +{ + struct iq_ring *retval; + + retval = rte_malloc_socket(NULL, sizeof(*retval), 0, socket_id); + if (retval == NULL) + goto end; + + snprintf(retval->name, sizeof(retval->name), "%s", name); + retval->write_idx = retval->read_idx = 0; +end: + return retval; +} + +static inline void +iq_ring_destroy(struct iq_ring *r) +{ + rte_free(r); +} + +static force_inline uint16_t +iq_ring_count(const struct iq_ring *r) +{ + return r->write_idx - r->read_idx; +} + +static force_inline uint16_t +iq_ring_free_count(const struct iq_ring *r) +{ + return QID_IQ_MASK - iq_ring_count(r); +} + +static force_inline uint16_t +iq_ring_enqueue_burst(struct iq_ring *r, struct rte_event *qes, uint16_t nb_qes) +{ + const uint16_t read = r->read_idx; + uint16_t write = r->write_idx; + const uint16_t space = read + QID_IQ_MASK - write; + uint16_t i; + + if (space < nb_qes) + nb_qes = space; + + for (i = 0; i < nb_qes; i++, write++) + r->ring[write & QID_IQ_MASK] = qes[i]; + + r->write_idx = write; + + return nb_qes; +} + +static force_inline uint16_t +iq_ring_dequeue_burst(struct iq_ring *r, struct rte_event *qes, uint16_t nb_qes) +{ + uint16_t read = r->read_idx; + const uint16_t write = r->write_idx; + const uint16_t items = write - read; + uint16_t i; + + for (i = 0; i < nb_qes; i++, read++) + 
qes[i] = r->ring[read & QID_IQ_MASK]; + + if (items < nb_qes) + nb_qes = items; + + r->read_idx += nb_qes; + + return nb_qes; +} + +/* assumes there is space, from a previous dequeue_burst */ +static force_inline uint16_t +iq_ring_put_back(struct iq_ring *r, struct rte_event *qes, uint16_t nb_qes) +{ + uint16_t i, read = r->read_idx; + + for (i = nb_qes; i-- > 0; ) + r->ring[--read & QID_IQ_MASK] = qes[i]; + + r->read_idx = read; + return nb_qes; +} + +static force_inline const struct rte_event * +iq_ring_peek(const struct iq_ring *r) +{ + return &r->ring[r->read_idx & QID_IQ_MASK]; +} + +static force_inline void +iq_ring_pop(struct iq_ring *r) +{ + r->read_idx++; +} + +static force_inline int +iq_ring_enqueue(struct iq_ring *r, const struct rte_event *qe) +{ + const uint16_t read = r->read_idx; + const uint16_t write = r->write_idx; + const uint16_t space = read + QID_IQ_MASK - write; + + if (space == 0) + return -1; + + r->ring[write & QID_IQ_MASK] = *qe; + + r->write_idx = write + 1; + + return 0; +} + +#endif diff --git a/src/seastar/dpdk/drivers/event/sw/rte_pmd_evdev_sw_version.map b/src/seastar/dpdk/drivers/event/sw/rte_pmd_evdev_sw_version.map new file mode 100644 index 00000000..5352e7e3 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/rte_pmd_evdev_sw_version.map @@ -0,0 +1,3 @@ +DPDK_17.05 { + local: *; +}; diff --git a/src/seastar/dpdk/drivers/event/sw/sw_evdev.c b/src/seastar/dpdk/drivers/event/sw/sw_evdev.c new file mode 100644 index 00000000..a31aaa66 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/sw_evdev.c @@ -0,0 +1,833 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "sw_evdev.h" +#include "iq_ring.h" +#include "event_ring.h" + +#define EVENTDEV_NAME_SW_PMD event_sw +#define NUMA_NODE_ARG "numa_node" +#define SCHED_QUANTA_ARG "sched_quanta" +#define CREDIT_QUANTA_ARG "credit_quanta" + +static void +sw_info_get(struct rte_eventdev *dev, struct rte_event_dev_info *info); + +static int +sw_port_link(struct rte_eventdev *dev, void *port, const uint8_t queues[], + const uint8_t priorities[], uint16_t num) +{ + struct sw_port *p = port; + struct sw_evdev *sw = sw_pmd_priv(dev); + int i; + + RTE_SET_USED(priorities); + for (i = 0; i < num; i++) { + struct sw_qid *q = &sw->qids[queues[i]]; + + /* check for qid map overflow */ + if (q->cq_num_mapped_cqs >= RTE_DIM(q->cq_map)) { + rte_errno = -EDQUOT; + break; + } + + if (p->is_directed && p->num_qids_mapped > 0) { + rte_errno = -EDQUOT; + break; + } + + if (q->type == SW_SCHED_TYPE_DIRECT) { + /* check directed qids only map to one port */ + if (p->num_qids_mapped > 0) { + rte_errno = -EDQUOT; + break; + } + /* check port only takes a directed flow */ + if (num > 1) { + rte_errno = -EDQUOT; + break; + } + + p->is_directed = 1; + p->num_qids_mapped = 1; + } else if (q->type == RTE_SCHED_TYPE_ORDERED) { + p->num_ordered_qids++; + p->num_qids_mapped++; + } else if (q->type == RTE_SCHED_TYPE_ATOMIC) { + p->num_qids_mapped++; + } + + q->cq_map[q->cq_num_mapped_cqs] = p->id; + rte_smp_wmb(); + q->cq_num_mapped_cqs++; + } + return i; +} + +static int +sw_port_unlink(struct rte_eventdev *dev, void *port, uint8_t queues[], + uint16_t nb_unlinks) +{ + struct sw_port *p = port; + struct sw_evdev *sw = sw_pmd_priv(dev); + unsigned int i, j; + + int unlinked = 0; + for (i = 0; i < nb_unlinks; i++) { + struct sw_qid *q = &sw->qids[queues[i]]; + for (j = 0; j < q->cq_num_mapped_cqs; j++) { + if (q->cq_map[j] == p->id) { + q->cq_map[j] = + q->cq_map[q->cq_num_mapped_cqs - 1]; + rte_smp_wmb(); + q->cq_num_mapped_cqs--; + 
unlinked++; + + p->num_qids_mapped--; + + if (q->type == RTE_SCHED_TYPE_ORDERED) + p->num_ordered_qids--; + + continue; + } + } + } + return unlinked; +} + +static int +sw_port_setup(struct rte_eventdev *dev, uint8_t port_id, + const struct rte_event_port_conf *conf) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + struct sw_port *p = &sw->ports[port_id]; + char buf[QE_RING_NAMESIZE]; + unsigned int i; + + struct rte_event_dev_info info; + sw_info_get(dev, &info); + + /* detect re-configuring and return credits to instance if needed */ + if (p->initialized) { + /* taking credits from pool is done one quanta at a time, and + * credits may be spend (counted in p->inflights) or still + * available in the port (p->inflight_credits). We must return + * the sum to no leak credits + */ + int possible_inflights = p->inflight_credits + p->inflights; + rte_atomic32_sub(&sw->inflights, possible_inflights); + } + + *p = (struct sw_port){0}; /* zero entire structure */ + p->id = port_id; + p->sw = sw; + + snprintf(buf, sizeof(buf), "sw%d_%s", dev->data->dev_id, + "rx_worker_ring"); + p->rx_worker_ring = qe_ring_create(buf, MAX_SW_PROD_Q_DEPTH, + dev->data->socket_id); + if (p->rx_worker_ring == NULL) { + SW_LOG_ERR("Error creating RX worker ring for port %d\n", + port_id); + return -1; + } + + p->inflight_max = conf->new_event_threshold; + + snprintf(buf, sizeof(buf), "sw%d_%s", dev->data->dev_id, + "cq_worker_ring"); + p->cq_worker_ring = qe_ring_create(buf, conf->dequeue_depth, + dev->data->socket_id); + if (p->cq_worker_ring == NULL) { + qe_ring_destroy(p->rx_worker_ring); + SW_LOG_ERR("Error creating CQ worker ring for port %d\n", + port_id); + return -1; + } + sw->cq_ring_space[port_id] = conf->dequeue_depth; + + /* set hist list contents to empty */ + for (i = 0; i < SW_PORT_HIST_LIST; i++) { + p->hist_list[i].fid = -1; + p->hist_list[i].qid = -1; + } + dev->data->ports[port_id] = p; + + rte_smp_wmb(); + p->initialized = 1; + return 0; +} + +static void 
+sw_port_release(void *port) +{ + struct sw_port *p = (void *)port; + if (p == NULL) + return; + + qe_ring_destroy(p->rx_worker_ring); + qe_ring_destroy(p->cq_worker_ring); + memset(p, 0, sizeof(*p)); +} + +static int32_t +qid_init(struct sw_evdev *sw, unsigned int idx, int type, + const struct rte_event_queue_conf *queue_conf) +{ + unsigned int i; + int dev_id = sw->data->dev_id; + int socket_id = sw->data->socket_id; + char buf[IQ_RING_NAMESIZE]; + struct sw_qid *qid = &sw->qids[idx]; + + for (i = 0; i < SW_IQS_MAX; i++) { + snprintf(buf, sizeof(buf), "q_%u_iq_%d", idx, i); + qid->iq[i] = iq_ring_create(buf, socket_id); + if (!qid->iq[i]) { + SW_LOG_DBG("ring create failed"); + goto cleanup; + } + } + + /* Initialize the FID structures to no pinning (-1), and zero packets */ + const struct sw_fid_t fid = {.cq = -1, .pcount = 0}; + for (i = 0; i < RTE_DIM(qid->fids); i++) + qid->fids[i] = fid; + + qid->id = idx; + qid->type = type; + qid->priority = queue_conf->priority; + + if (qid->type == RTE_SCHED_TYPE_ORDERED) { + char ring_name[RTE_RING_NAMESIZE]; + uint32_t window_size; + + /* rte_ring and window_size_mask require require window_size to + * be a power-of-2. 
+ */ + window_size = rte_align32pow2( + queue_conf->nb_atomic_order_sequences); + + qid->window_size = window_size - 1; + + if (!window_size) { + SW_LOG_DBG( + "invalid reorder_window_size for ordered queue\n" + ); + goto cleanup; + } + + snprintf(buf, sizeof(buf), "sw%d_iq_%d_rob", dev_id, i); + qid->reorder_buffer = rte_zmalloc_socket(buf, + window_size * sizeof(qid->reorder_buffer[0]), + 0, socket_id); + if (!qid->reorder_buffer) { + SW_LOG_DBG("reorder_buffer malloc failed\n"); + goto cleanup; + } + + memset(&qid->reorder_buffer[0], + 0, + window_size * sizeof(qid->reorder_buffer[0])); + + snprintf(ring_name, sizeof(ring_name), "sw%d_q%d_freelist", + dev_id, idx); + + /* lookup the ring, and if it already exists, free it */ + struct rte_ring *cleanup = rte_ring_lookup(ring_name); + if (cleanup) + rte_ring_free(cleanup); + + qid->reorder_buffer_freelist = rte_ring_create(ring_name, + window_size, + socket_id, + RING_F_SP_ENQ | RING_F_SC_DEQ); + if (!qid->reorder_buffer_freelist) { + SW_LOG_DBG("freelist ring create failed"); + goto cleanup; + } + + /* Populate the freelist with reorder buffer entries. Enqueue + * 'window_size - 1' entries because the rte_ring holds only + * that many. 
+ */ + for (i = 0; i < window_size - 1; i++) { + if (rte_ring_sp_enqueue(qid->reorder_buffer_freelist, + &qid->reorder_buffer[i]) < 0) + goto cleanup; + } + + qid->reorder_buffer_index = 0; + qid->cq_next_tx = 0; + } + + qid->initialized = 1; + + return 0; + +cleanup: + for (i = 0; i < SW_IQS_MAX; i++) { + if (qid->iq[i]) + iq_ring_destroy(qid->iq[i]); + } + + if (qid->reorder_buffer) { + rte_free(qid->reorder_buffer); + qid->reorder_buffer = NULL; + } + + if (qid->reorder_buffer_freelist) { + rte_ring_free(qid->reorder_buffer_freelist); + qid->reorder_buffer_freelist = NULL; + } + + return -EINVAL; +} + +static int +sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id, + const struct rte_event_queue_conf *conf) +{ + int type; + + /* SINGLE_LINK can be OR-ed with other types, so handle first */ + if (RTE_EVENT_QUEUE_CFG_SINGLE_LINK & conf->event_queue_cfg) { + type = SW_SCHED_TYPE_DIRECT; + } else { + switch (conf->event_queue_cfg) { + case RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY: + type = RTE_SCHED_TYPE_ATOMIC; + break; + case RTE_EVENT_QUEUE_CFG_ORDERED_ONLY: + type = RTE_SCHED_TYPE_ORDERED; + break; + case RTE_EVENT_QUEUE_CFG_PARALLEL_ONLY: + type = RTE_SCHED_TYPE_PARALLEL; + break; + case RTE_EVENT_QUEUE_CFG_ALL_TYPES: + SW_LOG_ERR("QUEUE_CFG_ALL_TYPES not supported\n"); + return -ENOTSUP; + default: + SW_LOG_ERR("Unknown queue type %d requested\n", + conf->event_queue_cfg); + return -EINVAL; + } + } + + struct sw_evdev *sw = sw_pmd_priv(dev); + return qid_init(sw, queue_id, type, conf); +} + +static void +sw_queue_release(struct rte_eventdev *dev, uint8_t id) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + struct sw_qid *qid = &sw->qids[id]; + uint32_t i; + + for (i = 0; i < SW_IQS_MAX; i++) + iq_ring_destroy(qid->iq[i]); + + if (qid->type == RTE_SCHED_TYPE_ORDERED) { + rte_free(qid->reorder_buffer); + rte_ring_free(qid->reorder_buffer_freelist); + } + memset(qid, 0, sizeof(*qid)); +} + +static void +sw_queue_def_conf(struct rte_eventdev *dev, uint8_t queue_id, + 
struct rte_event_queue_conf *conf) +{ + RTE_SET_USED(dev); + RTE_SET_USED(queue_id); + + static const struct rte_event_queue_conf default_conf = { + .nb_atomic_flows = 4096, + .nb_atomic_order_sequences = 1, + .event_queue_cfg = RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY, + .priority = RTE_EVENT_DEV_PRIORITY_NORMAL, + }; + + *conf = default_conf; +} + +static void +sw_port_def_conf(struct rte_eventdev *dev, uint8_t port_id, + struct rte_event_port_conf *port_conf) +{ + RTE_SET_USED(dev); + RTE_SET_USED(port_id); + + port_conf->new_event_threshold = 1024; + port_conf->dequeue_depth = 16; + port_conf->enqueue_depth = 16; +} + +static int +sw_dev_configure(const struct rte_eventdev *dev) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + const struct rte_eventdev_data *data = dev->data; + const struct rte_event_dev_config *conf = &data->dev_conf; + + sw->qid_count = conf->nb_event_queues; + sw->port_count = conf->nb_event_ports; + sw->nb_events_limit = conf->nb_events_limit; + rte_atomic32_set(&sw->inflights, 0); + + if (conf->event_dev_cfg & RTE_EVENT_DEV_CFG_PER_DEQUEUE_TIMEOUT) + return -ENOTSUP; + + return 0; +} + +static void +sw_info_get(struct rte_eventdev *dev, struct rte_event_dev_info *info) +{ + RTE_SET_USED(dev); + + static const struct rte_event_dev_info evdev_sw_info = { + .driver_name = SW_PMD_NAME, + .max_event_queues = RTE_EVENT_MAX_QUEUES_PER_DEV, + .max_event_queue_flows = SW_QID_NUM_FIDS, + .max_event_queue_priority_levels = SW_Q_PRIORITY_MAX, + .max_event_priority_levels = SW_IQS_MAX, + .max_event_ports = SW_PORTS_MAX, + .max_event_port_dequeue_depth = MAX_SW_CONS_Q_DEPTH, + .max_event_port_enqueue_depth = MAX_SW_PROD_Q_DEPTH, + .max_num_events = SW_INFLIGHT_EVENTS_TOTAL, + .event_dev_cap = (RTE_EVENT_DEV_CAP_QUEUE_QOS | + RTE_EVENT_DEV_CAP_EVENT_QOS), + }; + + *info = evdev_sw_info; +} + +static void +sw_dump(struct rte_eventdev *dev, FILE *f) +{ + const struct sw_evdev *sw = sw_pmd_priv(dev); + + static const char * const q_type_strings[] = { + "Ordered", 
"Atomic", "Parallel", "Directed" + }; + uint32_t i; + fprintf(f, "EventDev %s: ports %d, qids %d\n", "todo-fix-name", + sw->port_count, sw->qid_count); + + fprintf(f, "\trx %"PRIu64"\n\tdrop %"PRIu64"\n\ttx %"PRIu64"\n", + sw->stats.rx_pkts, sw->stats.rx_dropped, sw->stats.tx_pkts); + fprintf(f, "\tsched calls: %"PRIu64"\n", sw->sched_called); + fprintf(f, "\tsched cq/qid call: %"PRIu64"\n", sw->sched_cq_qid_called); + fprintf(f, "\tsched no IQ enq: %"PRIu64"\n", sw->sched_no_iq_enqueues); + fprintf(f, "\tsched no CQ enq: %"PRIu64"\n", sw->sched_no_cq_enqueues); + uint32_t inflights = rte_atomic32_read(&sw->inflights); + uint32_t credits = sw->nb_events_limit - inflights; + fprintf(f, "\tinflight %d, credits: %d\n", inflights, credits); + +#define COL_RED "\x1b[31m" +#define COL_RESET "\x1b[0m" + + for (i = 0; i < sw->port_count; i++) { + int max, j; + const struct sw_port *p = &sw->ports[i]; + if (!p->initialized) { + fprintf(f, " %sPort %d not initialized.%s\n", + COL_RED, i, COL_RESET); + continue; + } + fprintf(f, " Port %d %s\n", i, + p->is_directed ? " (SingleCons)" : ""); + fprintf(f, "\trx %"PRIu64"\tdrop %"PRIu64"\ttx %"PRIu64 + "\t%sinflight %d%s\n", sw->ports[i].stats.rx_pkts, + sw->ports[i].stats.rx_dropped, + sw->ports[i].stats.tx_pkts, + (p->inflights == p->inflight_max) ? + COL_RED : COL_RESET, + sw->ports[i].inflights, COL_RESET); + + fprintf(f, "\tMax New: %u" + "\tAvg cycles PP: %"PRIu64"\tCredits: %u\n", + sw->ports[i].inflight_max, + sw->ports[i].avg_pkt_ticks, + sw->ports[i].inflight_credits); + fprintf(f, "\tReceive burst distribution:\n"); + float zp_percent = p->zero_polls * 100.0 / p->total_polls; + fprintf(f, zp_percent < 10 ? 
"\t\t0:%.02f%% " : "\t\t0:%.0f%% ", + zp_percent); + for (max = (int)RTE_DIM(p->poll_buckets); max-- > 0;) + if (p->poll_buckets[max] != 0) + break; + for (j = 0; j <= max; j++) { + if (p->poll_buckets[j] != 0) { + float poll_pc = p->poll_buckets[j] * 100.0 / + p->total_polls; + fprintf(f, "%u-%u:%.02f%% ", + ((j << SW_DEQ_STAT_BUCKET_SHIFT) + 1), + ((j+1) << SW_DEQ_STAT_BUCKET_SHIFT), + poll_pc); + } + } + fprintf(f, "\n"); + + if (p->rx_worker_ring) { + uint64_t used = qe_ring_count(p->rx_worker_ring); + uint64_t space = qe_ring_free_count(p->rx_worker_ring); + const char *col = (space == 0) ? COL_RED : COL_RESET; + fprintf(f, "\t%srx ring used: %4"PRIu64"\tfree: %4" + PRIu64 COL_RESET"\n", col, used, space); + } else + fprintf(f, "\trx ring not initialized.\n"); + + if (p->cq_worker_ring) { + uint64_t used = qe_ring_count(p->cq_worker_ring); + uint64_t space = qe_ring_free_count(p->cq_worker_ring); + const char *col = (space == 0) ? COL_RED : COL_RESET; + fprintf(f, "\t%scq ring used: %4"PRIu64"\tfree: %4" + PRIu64 COL_RESET"\n", col, used, space); + } else + fprintf(f, "\tcq ring not initialized.\n"); + } + + for (i = 0; i < sw->qid_count; i++) { + const struct sw_qid *qid = &sw->qids[i]; + if (!qid->initialized) { + fprintf(f, " %sQueue %d not initialized.%s\n", + COL_RED, i, COL_RESET); + continue; + } + int affinities_per_port[SW_PORTS_MAX] = {0}; + uint32_t inflights = 0; + + fprintf(f, " Queue %d (%s)\n", i, q_type_strings[qid->type]); + fprintf(f, "\trx %"PRIu64"\tdrop %"PRIu64"\ttx %"PRIu64"\n", + qid->stats.rx_pkts, qid->stats.rx_dropped, + qid->stats.tx_pkts); + if (qid->type == RTE_SCHED_TYPE_ORDERED) { + struct rte_ring *rob_buf_free = + qid->reorder_buffer_freelist; + if (rob_buf_free) + fprintf(f, "\tReorder entries in use: %u\n", + rte_ring_free_count(rob_buf_free)); + else + fprintf(f, + "\tReorder buffer not initialized\n"); + } + + uint32_t flow; + for (flow = 0; flow < RTE_DIM(qid->fids); flow++) + if (qid->fids[flow].cq != -1) { + 
affinities_per_port[qid->fids[flow].cq]++; + inflights += qid->fids[flow].pcount; + } + + uint32_t cq; + fprintf(f, "\tInflights: %u\tFlows pinned per port: ", + inflights); + for (cq = 0; cq < sw->port_count; cq++) + fprintf(f, "%d ", affinities_per_port[cq]); + fprintf(f, "\n"); + + uint32_t iq; + uint32_t iq_printed = 0; + for (iq = 0; iq < SW_IQS_MAX; iq++) { + if (!qid->iq[iq]) { + fprintf(f, "\tiq %d is not initialized.\n", iq); + iq_printed = 1; + continue; + } + uint32_t used = iq_ring_count(qid->iq[iq]); + uint32_t free = iq_ring_free_count(qid->iq[iq]); + const char *col = (free == 0) ? COL_RED : COL_RESET; + if (used > 0) { + fprintf(f, "\t%siq %d: Used %d\tFree %d" + COL_RESET"\n", col, iq, used, free); + iq_printed = 1; + } + } + if (iq_printed == 0) + fprintf(f, "\t-- iqs empty --\n"); + } +} + +static int +sw_start(struct rte_eventdev *dev) +{ + unsigned int i, j; + struct sw_evdev *sw = sw_pmd_priv(dev); + /* check all ports are set up */ + for (i = 0; i < sw->port_count; i++) + if (sw->ports[i].rx_worker_ring == NULL) { + SW_LOG_ERR("Port %d not configured\n", i); + return -ESTALE; + } + + /* check all queues are configured and mapped to ports*/ + for (i = 0; i < sw->qid_count; i++) + if (sw->qids[i].iq[0] == NULL || + sw->qids[i].cq_num_mapped_cqs == 0) { + SW_LOG_ERR("Queue %d not configured\n", i); + return -ENOLINK; + } + + /* build up our prioritized array of qids */ + /* We don't use qsort here, as if all/multiple entries have the same + * priority, the result is non-deterministic. From "man 3 qsort": + * "If two members compare as equal, their order in the sorted + * array is undefined." 
+ */ + uint32_t qidx = 0; + for (j = 0; j <= RTE_EVENT_DEV_PRIORITY_LOWEST; j++) { + for (i = 0; i < sw->qid_count; i++) { + if (sw->qids[i].priority == j) { + sw->qids_prioritized[qidx] = &sw->qids[i]; + qidx++; + } + } + } + + if (sw_xstats_init(sw) < 0) + return -EINVAL; + + rte_smp_wmb(); + sw->started = 1; + + return 0; +} + +static void +sw_stop(struct rte_eventdev *dev) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + sw_xstats_uninit(sw); + sw->started = 0; + rte_smp_wmb(); +} + +static int +sw_close(struct rte_eventdev *dev) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + uint32_t i; + + for (i = 0; i < sw->qid_count; i++) + sw_queue_release(dev, i); + sw->qid_count = 0; + + for (i = 0; i < sw->port_count; i++) + sw_port_release(&sw->ports[i]); + sw->port_count = 0; + + memset(&sw->stats, 0, sizeof(sw->stats)); + sw->sched_called = 0; + sw->sched_no_iq_enqueues = 0; + sw->sched_no_cq_enqueues = 0; + sw->sched_cq_qid_called = 0; + + return 0; +} + +static int +assign_numa_node(const char *key __rte_unused, const char *value, void *opaque) +{ + int *socket_id = opaque; + *socket_id = atoi(value); + if (*socket_id >= RTE_MAX_NUMA_NODES) + return -1; + return 0; +} + +static int +set_sched_quanta(const char *key __rte_unused, const char *value, void *opaque) +{ + int *quanta = opaque; + *quanta = atoi(value); + if (*quanta < 0 || *quanta >= 4096) + return -1; + return 0; +} + +static int +set_credit_quanta(const char *key __rte_unused, const char *value, void *opaque) +{ + int *credit = opaque; + *credit = atoi(value); + if (*credit < 0 || *credit >= 128) + return -1; + return 0; +} + +static int +sw_probe(struct rte_vdev_device *vdev) +{ + static const struct rte_eventdev_ops evdev_sw_ops = { + .dev_configure = sw_dev_configure, + .dev_infos_get = sw_info_get, + .dev_close = sw_close, + .dev_start = sw_start, + .dev_stop = sw_stop, + .dump = sw_dump, + + .queue_def_conf = sw_queue_def_conf, + .queue_setup = sw_queue_setup, + .queue_release = sw_queue_release, + 
.port_def_conf = sw_port_def_conf, + .port_setup = sw_port_setup, + .port_release = sw_port_release, + .port_link = sw_port_link, + .port_unlink = sw_port_unlink, + + .xstats_get = sw_xstats_get, + .xstats_get_names = sw_xstats_get_names, + .xstats_get_by_name = sw_xstats_get_by_name, + .xstats_reset = sw_xstats_reset, + }; + + static const char *const args[] = { + NUMA_NODE_ARG, + SCHED_QUANTA_ARG, + CREDIT_QUANTA_ARG, + NULL + }; + const char *name; + const char *params; + struct rte_eventdev *dev; + struct sw_evdev *sw; + int socket_id = rte_socket_id(); + int sched_quanta = SW_DEFAULT_SCHED_QUANTA; + int credit_quanta = SW_DEFAULT_CREDIT_QUANTA; + + name = rte_vdev_device_name(vdev); + params = rte_vdev_device_args(vdev); + if (params != NULL && params[0] != '\0') { + struct rte_kvargs *kvlist = rte_kvargs_parse(params, args); + + if (!kvlist) { + SW_LOG_INFO( + "Ignoring unsupported parameters when creating device '%s'\n", + name); + } else { + int ret = rte_kvargs_process(kvlist, NUMA_NODE_ARG, + assign_numa_node, &socket_id); + if (ret != 0) { + SW_LOG_ERR( + "%s: Error parsing numa node parameter", + name); + rte_kvargs_free(kvlist); + return ret; + } + + ret = rte_kvargs_process(kvlist, SCHED_QUANTA_ARG, + set_sched_quanta, &sched_quanta); + if (ret != 0) { + SW_LOG_ERR( + "%s: Error parsing sched quanta parameter", + name); + rte_kvargs_free(kvlist); + return ret; + } + + ret = rte_kvargs_process(kvlist, CREDIT_QUANTA_ARG, + set_credit_quanta, &credit_quanta); + if (ret != 0) { + SW_LOG_ERR( + "%s: Error parsing credit quanta parameter", + name); + rte_kvargs_free(kvlist); + return ret; + } + + rte_kvargs_free(kvlist); + } + } + + SW_LOG_INFO( + "Creating eventdev sw device %s, numa_node=%d, sched_quanta=%d, credit_quanta=%d\n", + name, socket_id, sched_quanta, credit_quanta); + + dev = rte_event_pmd_vdev_init(name, + sizeof(struct sw_evdev), socket_id); + if (dev == NULL) { + SW_LOG_ERR("eventdev vdev init() failed"); + return -EFAULT; + } + dev->dev_ops 
= &evdev_sw_ops; + dev->enqueue = sw_event_enqueue; + dev->enqueue_burst = sw_event_enqueue_burst; + dev->dequeue = sw_event_dequeue; + dev->dequeue_burst = sw_event_dequeue_burst; + dev->schedule = sw_event_schedule; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + sw = dev->data->dev_private; + sw->data = dev->data; + + /* copy values passed from vdev command line to instance */ + sw->credit_update_quanta = credit_quanta; + sw->sched_quanta = sched_quanta; + + return 0; +} + +static int +sw_remove(struct rte_vdev_device *vdev) +{ + const char *name; + + name = rte_vdev_device_name(vdev); + if (name == NULL) + return -EINVAL; + + SW_LOG_INFO("Closing eventdev sw device %s\n", name); + + return rte_event_pmd_vdev_uninit(name); +} + +static struct rte_vdev_driver evdev_sw_pmd_drv = { + .probe = sw_probe, + .remove = sw_remove +}; + +RTE_PMD_REGISTER_VDEV(EVENTDEV_NAME_SW_PMD, evdev_sw_pmd_drv); +RTE_PMD_REGISTER_PARAM_STRING(event_sw, NUMA_NODE_ARG "= " + SCHED_QUANTA_ARG "=" CREDIT_QUANTA_ARG "="); diff --git a/src/seastar/dpdk/drivers/event/sw/sw_evdev.h b/src/seastar/dpdk/drivers/event/sw/sw_evdev.h new file mode 100644 index 00000000..61c671d6 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/sw_evdev.h @@ -0,0 +1,318 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SW_EVDEV_H_ +#define _SW_EVDEV_H_ + +#include +#include +#include + +#define SW_DEFAULT_CREDIT_QUANTA 32 +#define SW_DEFAULT_SCHED_QUANTA 128 +#define SW_QID_NUM_FIDS 16384 +#define SW_IQS_MAX 4 +#define SW_Q_PRIORITY_MAX 255 +#define SW_PORTS_MAX 64 +#define MAX_SW_CONS_Q_DEPTH 128 +#define SW_INFLIGHT_EVENTS_TOTAL 4096 +/* allow for lots of over-provisioning */ +#define MAX_SW_PROD_Q_DEPTH 4096 +#define SW_FRAGMENTS_MAX 16 + +/* report dequeue burst sizes in buckets */ +#define SW_DEQ_STAT_BUCKET_SHIFT 2 +/* how many packets pulled from port by sched */ +#define SCHED_DEQUEUE_BURST_SIZE 32 + +#define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our history list */ +#define NUM_SAMPLES 64 /* how many data points use for average stats */ + +#define EVENTDEV_NAME_SW_PMD event_sw +#define SW_PMD_NAME RTE_STR(event_sw) + +#define SW_SCHED_TYPE_DIRECT (RTE_SCHED_TYPE_PARALLEL + 1) + +#define SW_NUM_POLL_BUCKETS (MAX_SW_CONS_Q_DEPTH >> SW_DEQ_STAT_BUCKET_SHIFT) + +enum { + QE_FLAG_VALID_SHIFT = 0, + QE_FLAG_COMPLETE_SHIFT, + QE_FLAG_NOT_EOP_SHIFT, + _QE_FLAG_COUNT +}; + +#define QE_FLAG_VALID (1 << QE_FLAG_VALID_SHIFT) /* for NEW FWD, FRAG */ +#define QE_FLAG_COMPLETE (1 << QE_FLAG_COMPLETE_SHIFT) /* set for FWD, DROP */ +#define QE_FLAG_NOT_EOP (1 << QE_FLAG_NOT_EOP_SHIFT) /* set for FRAG only */ + +static const uint8_t sw_qe_flag_map[] = { + QE_FLAG_VALID /* NEW Event */, + QE_FLAG_VALID | QE_FLAG_COMPLETE /* FWD Event */, + QE_FLAG_COMPLETE /* RELEASE Event */, + + /* Values which can be used for future support for partial + * events, i.e. where one event comes back to the scheduler + * as multiple which need to be tracked together + */ + QE_FLAG_VALID | QE_FLAG_COMPLETE | QE_FLAG_NOT_EOP, +}; + +#ifdef RTE_LIBRTE_PMD_EVDEV_SW_DEBUG +#define SW_LOG_INFO(fmt, args...) \ + RTE_LOG(INFO, EVENTDEV, "[%s] %s() line %u: " fmt "\n", \ + SW_PMD_NAME, \ + __func__, __LINE__, ## args) + +#define SW_LOG_DBG(fmt, args...) 
\ + RTE_LOG(DEBUG, EVENTDEV, "[%s] %s() line %u: " fmt "\n", \ + SW_PMD_NAME, \ + __func__, __LINE__, ## args) +#else +#define SW_LOG_INFO(fmt, args...) +#define SW_LOG_DBG(fmt, args...) +#endif + +#define SW_LOG_ERR(fmt, args...) \ + RTE_LOG(ERR, EVENTDEV, "[%s] %s() line %u: " fmt "\n", \ + SW_PMD_NAME, \ + __func__, __LINE__, ## args) + +/* Records basic event stats at a given point. Used in port and qid structs */ +struct sw_point_stats { + uint64_t rx_pkts; + uint64_t rx_dropped; + uint64_t tx_pkts; +}; + +/* structure used to track what port a flow (FID) is pinned to */ +struct sw_fid_t { + /* which CQ this FID is currently pinned to */ + int32_t cq; + /* number of packets gone to the CQ with this FID */ + uint32_t pcount; +}; + +struct reorder_buffer_entry { + uint16_t num_fragments; /**< Number of packet fragments */ + uint16_t fragment_index; /**< Points to the oldest valid frag */ + uint8_t ready; /**< Entry is ready to be reordered */ + struct rte_event fragments[SW_FRAGMENTS_MAX]; +}; + +struct sw_qid { + /* set when the QID has been initialized */ + uint8_t initialized; + /* The type of this QID */ + int8_t type; + /* Integer ID representing the queue. This is used in history lists, + * to identify the stage of processing. 
+ */ + uint32_t id; + struct sw_point_stats stats; + + /* Internal priority rings for packets */ + struct iq_ring *iq[SW_IQS_MAX]; + uint32_t iq_pkt_mask; /* A mask to indicate packets in an IQ */ + uint64_t iq_pkt_count[SW_IQS_MAX]; + + /* Information on what CQs are polling this IQ */ + uint32_t cq_num_mapped_cqs; + uint32_t cq_next_tx; /* cq to write next (non-atomic) packet */ + uint32_t cq_map[SW_PORTS_MAX]; + + /* Track flow ids for atomic load balancing */ + struct sw_fid_t fids[SW_QID_NUM_FIDS]; + + /* Track packet order for reordering when needed */ + struct reorder_buffer_entry *reorder_buffer; /*< pkts await reorder */ + struct rte_ring *reorder_buffer_freelist; /* available reorder slots */ + uint32_t reorder_buffer_index; /* oldest valid reorder buffer entry */ + uint32_t window_size; /* Used to wrap reorder_buffer_index */ + + uint8_t priority; +}; + +struct sw_hist_list_entry { + int32_t qid; + int32_t fid; + struct reorder_buffer_entry *rob_entry; +}; + +struct sw_evdev; + +struct sw_port { + /* new enqueue / dequeue API doesn't have an instance pointer, only the + * pointer to the port being enqueue/dequeued from + */ + struct sw_evdev *sw; + + /* set when the port is initialized */ + uint8_t initialized; + /* A numeric ID for the port */ + uint8_t id; + + int16_t is_directed; /** Takes from a single directed QID */ + /** + * For loadbalanced we can optimise pulling packets from + * producers if there is no reordering involved + */ + int16_t num_ordered_qids; + + /** Ring and buffer for pulling events from workers for scheduling */ + struct qe_ring *rx_worker_ring __rte_cache_aligned; + /** Ring and buffer for pushing packets to workers after scheduling */ + struct qe_ring *cq_worker_ring; + + /* hole */ + + /* num releases yet to be completed on this port */ + uint16_t outstanding_releases __rte_cache_aligned; + uint16_t inflight_max; /* app requested max inflights for this port */ + uint16_t inflight_credits; /* num credits this port has right 
now */ + + uint16_t last_dequeue_burst_sz; /* how big the burst was */ + uint64_t last_dequeue_ticks; /* used to track burst processing time */ + uint64_t avg_pkt_ticks; /* tracks average over NUM_SAMPLES burst */ + uint64_t total_polls; /* how many polls were counted in stats */ + uint64_t zero_polls; /* tracks polls returning nothing */ + uint32_t poll_buckets[SW_NUM_POLL_BUCKETS]; + /* bucket values in 4s for shorter reporting */ + + /* History list structs, containing info on pkts egressed to worker */ + uint16_t hist_head __rte_cache_aligned; + uint16_t hist_tail; + uint16_t inflights; + struct sw_hist_list_entry hist_list[SW_PORT_HIST_LIST]; + + /* track packets in and out of this port */ + struct sw_point_stats stats; + + + uint32_t pp_buf_start; + uint32_t pp_buf_count; + uint16_t cq_buf_count; + struct rte_event pp_buf[SCHED_DEQUEUE_BURST_SIZE]; + struct rte_event cq_buf[MAX_SW_CONS_Q_DEPTH]; + + uint8_t num_qids_mapped; +}; + +struct sw_evdev { + struct rte_eventdev_data *data; + + uint32_t port_count; + uint32_t qid_count; + uint32_t xstats_count; + struct sw_xstats_entry *xstats; + uint32_t xstats_count_mode_dev; + uint32_t xstats_count_mode_port; + uint32_t xstats_count_mode_queue; + + /* Contains all ports - load balanced and directed */ + struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned; + + rte_atomic32_t inflights __rte_cache_aligned; + + /* + * max events in this instance. Cached here for performance. 
+ * (also available in data->conf.nb_events_limit) + */ + uint32_t nb_events_limit; + + /* Internal queues - one per logical queue */ + struct sw_qid qids[RTE_EVENT_MAX_QUEUES_PER_DEV] __rte_cache_aligned; + + /* Cache how many packets are in each cq */ + uint16_t cq_ring_space[SW_PORTS_MAX] __rte_cache_aligned; + + /* Array of pointers to load-balanced QIDs sorted by priority level */ + struct sw_qid *qids_prioritized[RTE_EVENT_MAX_QUEUES_PER_DEV]; + + /* Stats */ + struct sw_point_stats stats __rte_cache_aligned; + uint64_t sched_called; + int32_t sched_quanta; + uint64_t sched_no_iq_enqueues; + uint64_t sched_no_cq_enqueues; + uint64_t sched_cq_qid_called; + + uint8_t started; + uint32_t credit_update_quanta; + + /* store num stats and offset of the stats for each port */ + uint16_t xstats_count_per_port[SW_PORTS_MAX]; + uint16_t xstats_offset_for_port[SW_PORTS_MAX]; + /* store num stats and offset of the stats for each queue */ + uint16_t xstats_count_per_qid[RTE_EVENT_MAX_QUEUES_PER_DEV]; + uint16_t xstats_offset_for_qid[RTE_EVENT_MAX_QUEUES_PER_DEV]; +}; + +static inline struct sw_evdev * +sw_pmd_priv(const struct rte_eventdev *eventdev) +{ + return eventdev->data->dev_private; +} + +static inline const struct sw_evdev * +sw_pmd_priv_const(const struct rte_eventdev *eventdev) +{ + return eventdev->data->dev_private; +} + +uint16_t sw_event_enqueue(void *port, const struct rte_event *ev); +uint16_t sw_event_enqueue_burst(void *port, const struct rte_event ev[], + uint16_t num); + +uint16_t sw_event_dequeue(void *port, struct rte_event *ev, uint64_t wait); +uint16_t sw_event_dequeue_burst(void *port, struct rte_event *ev, uint16_t num, + uint64_t wait); +void sw_event_schedule(struct rte_eventdev *dev); +int sw_xstats_init(struct sw_evdev *dev); +int sw_xstats_uninit(struct sw_evdev *dev); +int sw_xstats_get_names(const struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + struct rte_event_dev_xstats_name *xstats_names, + 
unsigned int *ids, unsigned int size); +int sw_xstats_get(const struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + const unsigned int ids[], uint64_t values[], unsigned int n); +uint64_t sw_xstats_get_by_name(const struct rte_eventdev *dev, + const char *name, unsigned int *id); +int sw_xstats_reset(struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, + int16_t queue_port_id, + const uint32_t ids[], + uint32_t nb_ids); + + +#endif /* _SW_EVDEV_H_ */ diff --git a/src/seastar/dpdk/drivers/event/sw/sw_evdev_scheduler.c b/src/seastar/dpdk/drivers/event/sw/sw_evdev_scheduler.c new file mode 100644 index 00000000..a333a6f0 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/sw_evdev_scheduler.c @@ -0,0 +1,601 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "sw_evdev.h" +#include "iq_ring.h" +#include "event_ring.h" + +#define SW_IQS_MASK (SW_IQS_MAX-1) + +/* Retrieve the highest priority IQ or -1 if no pkts available. Doing the + * CLZ twice is faster than caching the value due to data dependencies + */ +#define PKT_MASK_TO_IQ(pkts) \ + (__builtin_ctz(pkts | (1 << SW_IQS_MAX))) + +#if SW_IQS_MAX != 4 +#error Misconfigured PRIO_TO_IQ caused by SW_IQS_MAX value change +#endif +#define PRIO_TO_IQ(prio) (prio >> 6) + +#define MAX_PER_IQ_DEQUEUE 48 +#define FLOWID_MASK (SW_QID_NUM_FIDS-1) +/* use cheap bit mixing, we only need to lose a few bits */ +#define SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK) + +static inline uint32_t +sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid, + uint32_t iq_num, unsigned int count) +{ + struct rte_event qes[MAX_PER_IQ_DEQUEUE]; /* count <= MAX */ + struct rte_event blocked_qes[MAX_PER_IQ_DEQUEUE]; + uint32_t nb_blocked = 0; + uint32_t i; + + if (count > MAX_PER_IQ_DEQUEUE) + count = MAX_PER_IQ_DEQUEUE; + + /* This is the QID ID. 
The QID ID is static, hence it can be + * used to identify the stage of processing in history lists etc + */ + uint32_t qid_id = qid->id; + + iq_ring_dequeue_burst(qid->iq[iq_num], qes, count); + for (i = 0; i < count; i++) { + const struct rte_event *qe = &qes[i]; + const uint16_t flow_id = SW_HASH_FLOWID(qes[i].flow_id); + struct sw_fid_t *fid = &qid->fids[flow_id]; + int cq = fid->cq; + + if (cq < 0) { + uint32_t cq_idx = qid->cq_next_tx++; + if (qid->cq_next_tx == qid->cq_num_mapped_cqs) + qid->cq_next_tx = 0; + cq = qid->cq_map[cq_idx]; + + /* find least used */ + int cq_free_cnt = sw->cq_ring_space[cq]; + for (cq_idx = 0; cq_idx < qid->cq_num_mapped_cqs; + cq_idx++) { + int test_cq = qid->cq_map[cq_idx]; + int test_cq_free = sw->cq_ring_space[test_cq]; + if (test_cq_free > cq_free_cnt) { + cq = test_cq; + cq_free_cnt = test_cq_free; + } + } + + fid->cq = cq; /* this pins early */ + } + + if (sw->cq_ring_space[cq] == 0 || + sw->ports[cq].inflights == SW_PORT_HIST_LIST) { + blocked_qes[nb_blocked++] = *qe; + continue; + } + + struct sw_port *p = &sw->ports[cq]; + + /* at this point we can queue up the packet on the cq_buf */ + fid->pcount++; + p->cq_buf[p->cq_buf_count++] = *qe; + p->inflights++; + sw->cq_ring_space[cq]--; + + int head = (p->hist_head++ & (SW_PORT_HIST_LIST-1)); + p->hist_list[head].fid = flow_id; + p->hist_list[head].qid = qid_id; + + p->stats.tx_pkts++; + qid->stats.tx_pkts++; + + /* if we just filled in the last slot, flush the buffer */ + if (sw->cq_ring_space[cq] == 0) { + struct qe_ring *worker = p->cq_worker_ring; + qe_ring_enqueue_burst(worker, p->cq_buf, + p->cq_buf_count, + &sw->cq_ring_space[cq]); + p->cq_buf_count = 0; + } + } + iq_ring_put_back(qid->iq[iq_num], blocked_qes, nb_blocked); + + return count - nb_blocked; +} + +static inline uint32_t +sw_schedule_parallel_to_cq(struct sw_evdev *sw, struct sw_qid * const qid, + uint32_t iq_num, unsigned int count, int keep_order) +{ + uint32_t i; + uint32_t cq_idx = qid->cq_next_tx; + + 
/* This is the QID ID. The QID ID is static, hence it can be + * used to identify the stage of processing in history lists etc + */ + uint32_t qid_id = qid->id; + + if (count > MAX_PER_IQ_DEQUEUE) + count = MAX_PER_IQ_DEQUEUE; + + if (keep_order) + /* only schedule as many as we have reorder buffer entries */ + count = RTE_MIN(count, + rte_ring_count(qid->reorder_buffer_freelist)); + + for (i = 0; i < count; i++) { + const struct rte_event *qe = iq_ring_peek(qid->iq[iq_num]); + uint32_t cq_check_count = 0; + uint32_t cq; + + /* + * for parallel, just send to next available CQ in round-robin + * fashion. So scan for an available CQ. If all CQs are full + * just return and move on to next QID + */ + do { + if (++cq_check_count > qid->cq_num_mapped_cqs) + goto exit; + cq = qid->cq_map[cq_idx]; + if (++cq_idx == qid->cq_num_mapped_cqs) + cq_idx = 0; + } while (qe_ring_free_count(sw->ports[cq].cq_worker_ring) == 0 || + sw->ports[cq].inflights == SW_PORT_HIST_LIST); + + struct sw_port *p = &sw->ports[cq]; + if (sw->cq_ring_space[cq] == 0 || + p->inflights == SW_PORT_HIST_LIST) + break; + + sw->cq_ring_space[cq]--; + + qid->stats.tx_pkts++; + + const int head = (p->hist_head & (SW_PORT_HIST_LIST-1)); + p->hist_list[head].fid = SW_HASH_FLOWID(qe->flow_id); + p->hist_list[head].qid = qid_id; + + if (keep_order) + rte_ring_sc_dequeue(qid->reorder_buffer_freelist, + (void *)&p->hist_list[head].rob_entry); + + sw->ports[cq].cq_buf[sw->ports[cq].cq_buf_count++] = *qe; + iq_ring_pop(qid->iq[iq_num]); + + rte_compiler_barrier(); + p->inflights++; + p->stats.tx_pkts++; + p->hist_head++; + } +exit: + qid->cq_next_tx = cq_idx; + return i; +} + +static uint32_t +sw_schedule_dir_to_cq(struct sw_evdev *sw, struct sw_qid * const qid, + uint32_t iq_num, unsigned int count __rte_unused) +{ + uint32_t cq_id = qid->cq_map[0]; + struct sw_port *port = &sw->ports[cq_id]; + + /* get max burst enq size for cq_ring */ + uint32_t count_free = sw->cq_ring_space[cq_id]; + if (count_free == 0) + 
return 0;
+
+	/* burst dequeue from the QID IQ ring */
+	struct iq_ring *ring = qid->iq[iq_num];
+	uint32_t ret = iq_ring_dequeue_burst(ring,
+			&port->cq_buf[port->cq_buf_count], count_free);
+	port->cq_buf_count += ret;
+
+	/* Update QID, Port and Total TX stats */
+	qid->stats.tx_pkts += ret;
+	port->stats.tx_pkts += ret;
+
+	/* Subtract credits from cached value */
+	sw->cq_ring_space[cq_id] -= ret;
+
+	return ret;
+}
+/* Move events from the QID internal queues (IQs) towards the port CQs.
+ *
+ * Visits the QIDs in the order stored in sw->qids_prioritized and, for
+ * each, drains the IQ selected by PKT_MASK_TO_IQ through the scheduling
+ * routine that matches the QID type (directed, atomic, or parallel, with
+ * order tracking requested for ordered QIDs).
+ *
+ * Returns the number of events scheduled in this pass.
+ */
+static uint32_t
+sw_schedule_qid_to_cq(struct sw_evdev *sw)
+{
+	uint32_t pkts = 0;
+	uint32_t qid_idx;
+
+	sw->sched_cq_qid_called++;
+
+	for (qid_idx = 0; qid_idx < sw->qid_count; qid_idx++) {
+		struct sw_qid *qid = sw->qids_prioritized[qid_idx];
+
+		int type = qid->type;
+		int iq_num = PKT_MASK_TO_IQ(qid->iq_pkt_mask);
+
+		/* PKT_MASK_TO_IQ yields SW_IQS_MAX when no bit of
+		 * iq_pkt_mask is set, i.e. no IQ of this QID is flagged as
+		 * holding packets: nothing to schedule here
+		 */
+		if (iq_num >= SW_IQS_MAX)
+			continue;
+
+		uint32_t pkts_done = 0;
+		uint32_t count = iq_ring_count(qid->iq[iq_num]);
+
+		if (count > 0) {
+			if (type == SW_SCHED_TYPE_DIRECT)
+				pkts_done += sw_schedule_dir_to_cq(sw, qid,
+						iq_num, count);
+			else if (type == RTE_SCHED_TYPE_ATOMIC)
+				pkts_done += sw_schedule_atomic_to_cq(sw, qid,
+						iq_num, count);
+			else
+				pkts_done += sw_schedule_parallel_to_cq(sw, qid,
+						iq_num, count,
+						type == RTE_SCHED_TYPE_ORDERED);
+		}
+
+		/* Check if the IQ that was polled is now empty, and unset it
+		 * in the IQ mask if it is empty. all_done is 0 or 1, so the
+		 * shift below clears at most the bit of the polled IQ.
+		 */
+		int all_done = (pkts_done == count);
+
+		qid->iq_pkt_mask &= ~(all_done << (iq_num));
+		pkts += pkts_done;
+	}
+
+	return pkts;
+}
+
+/* This function will perform re-ordering of packets, and injecting into
+ * the appropriate QID IQ. As LB and DIR QIDs are in the same array, but *NOT*
+ * contiguous in that array, this function accepts a "range" of QIDs to scan.
+ */
+static uint16_t
+sw_schedule_reorder(struct sw_evdev *sw, int qid_start, int qid_end)
+{
+	/* Perform egress reordering */
+	struct rte_event *qe;
+	uint32_t pkts_iter = 0;
+
+	for (; qid_start < qid_end; qid_start++) {
+		struct sw_qid *qid = &sw->qids[qid_start];
+		int i, num_entries_in_use;
+
+		if (qid->type != RTE_SCHED_TYPE_ORDERED)
+			continue;
+		/* free slots in the freelist ring are taken as the number of
+		 * reorder entries currently handed out
+		 */
+		num_entries_in_use = rte_ring_free_count(
+					qid->reorder_buffer_freelist);
+
+		for (i = 0; i < num_entries_in_use; i++) {
+			struct reorder_buffer_entry *entry;
+			int j;
+
+			entry = &qid->reorder_buffer[qid->reorder_buffer_index];
+			/* entries are released strictly in index order: stop
+			 * at the first one that is not ready yet
+			 */
+			if (!entry->ready)
+				break;
+
+			for (j = 0; j < entry->num_fragments; j++) {
+				uint16_t dest_qid;
+				uint16_t dest_iq;
+
+				int idx = entry->fragment_index + j;
+				qe = &entry->fragments[idx];
+
+				dest_qid = qe->queue_id;
+				dest_iq = PRIO_TO_IQ(qe->priority);
+				/* unknown destination queue: count a drop and
+				 * move on to the next fragment
+				 */
+				if (dest_qid >= sw->qid_count) {
+					sw->stats.rx_dropped++;
+					continue;
+				}
+
+				struct sw_qid *dest_qid_ptr =
+					&sw->qids[dest_qid];
+				const struct iq_ring *dest_iq_ptr =
+					dest_qid_ptr->iq[dest_iq];
+				/* destination IQ full: leave this fragment and
+				 * the ones after it in the entry
+				 */
+				if (iq_ring_free_count(dest_iq_ptr) == 0)
+					break;
+
+				pkts_iter++;
+
+				struct sw_qid *q = &sw->qids[dest_qid];
+				struct iq_ring *r = q->iq[dest_iq];
+
+				/* we checked for space above, so enqueue must
+				 * succeed
+				 */
+				iq_ring_enqueue(r, qe);
+				q->iq_pkt_mask |= (1 << (dest_iq));
+				q->iq_pkt_count[dest_iq]++;
+				q->stats.rx_pkts++;
+			}
+			/* j fragments were consumed above (enqueued or
+			 * dropped); the entry stays ready only if some remain
+			 */
+			entry->ready = (j != entry->num_fragments);
+			entry->num_fragments -= j;
+			entry->fragment_index += j;
+
+			if (!entry->ready) {
+				entry->fragment_index = 0;
+				/* fully drained: hand the entry back to the
+				 * freelist and advance the window index
+				 */
+				rte_ring_sp_enqueue(
+						qid->reorder_buffer_freelist,
+						entry);
+
+				qid->reorder_buffer_index++;
+				qid->reorder_buffer_index %= qid->window_size;
+			}
+		}
+	}
+	return pkts_iter;
+}
+
+static inline void __attribute__((always_inline))
+sw_refill_pp_buf(struct sw_evdev *sw, struct sw_port *port)
+{
+	RTE_SET_USED(sw);
+	struct qe_ring *worker = port->rx_worker_ring;
+	port->pp_buf_start = 0;
+	port->pp_buf_count =
qe_ring_dequeue_burst(worker, port->pp_buf, + RTE_DIM(port->pp_buf)); +} + +static inline uint32_t __attribute__((always_inline)) +__pull_port_lb(struct sw_evdev *sw, uint32_t port_id, int allow_reorder) +{ + static struct reorder_buffer_entry dummy_rob; + uint32_t pkts_iter = 0; + struct sw_port *port = &sw->ports[port_id]; + + /* If shadow ring has 0 pkts, pull from worker ring */ + if (port->pp_buf_count == 0) + sw_refill_pp_buf(sw, port); + + while (port->pp_buf_count) { + const struct rte_event *qe = &port->pp_buf[port->pp_buf_start]; + struct sw_hist_list_entry *hist_entry = NULL; + uint8_t flags = qe->op; + const uint16_t eop = !(flags & QE_FLAG_NOT_EOP); + int needs_reorder = 0; + /* if no-reordering, having PARTIAL == NEW */ + if (!allow_reorder && !eop) + flags = QE_FLAG_VALID; + + /* + * if we don't have space for this packet in an IQ, + * then move on to next queue. Technically, for a + * packet that needs reordering, we don't need to check + * here, but it simplifies things not to special-case + */ + uint32_t iq_num = PRIO_TO_IQ(qe->priority); + struct sw_qid *qid = &sw->qids[qe->queue_id]; + + if ((flags & QE_FLAG_VALID) && + iq_ring_free_count(qid->iq[iq_num]) == 0) + break; + + /* now process based on flags. Note that for directed + * queues, the enqueue_flush masks off all but the + * valid flag. This makes FWD and PARTIAL enqueues just + * NEW type, and makes DROPS no-op calls. 
+ */ + if ((flags & QE_FLAG_COMPLETE) && port->inflights > 0) { + const uint32_t hist_tail = port->hist_tail & + (SW_PORT_HIST_LIST - 1); + + hist_entry = &port->hist_list[hist_tail]; + const uint32_t hist_qid = hist_entry->qid; + const uint32_t hist_fid = hist_entry->fid; + + struct sw_fid_t *fid = + &sw->qids[hist_qid].fids[hist_fid]; + fid->pcount -= eop; + if (fid->pcount == 0) + fid->cq = -1; + + if (allow_reorder) { + /* set reorder ready if an ordered QID */ + uintptr_t rob_ptr = + (uintptr_t)hist_entry->rob_entry; + const uintptr_t valid = (rob_ptr != 0); + needs_reorder = valid; + rob_ptr |= + ((valid - 1) & (uintptr_t)&dummy_rob); + struct reorder_buffer_entry *tmp_rob_ptr = + (struct reorder_buffer_entry *)rob_ptr; + tmp_rob_ptr->ready = eop * needs_reorder; + } + + port->inflights -= eop; + port->hist_tail += eop; + } + if (flags & QE_FLAG_VALID) { + port->stats.rx_pkts++; + + if (allow_reorder && needs_reorder) { + struct reorder_buffer_entry *rob_entry = + hist_entry->rob_entry; + + hist_entry->rob_entry = NULL; + /* Although fragmentation not currently + * supported by eventdev API, we support it + * here. Open: How do we alert the user that + * they've exceeded max frags? 
+ */ + int num_frag = rob_entry->num_fragments; + if (num_frag == SW_FRAGMENTS_MAX) + sw->stats.rx_dropped++; + else { + int idx = rob_entry->num_fragments++; + rob_entry->fragments[idx] = *qe; + } + goto end_qe; + } + + /* Use the iq_num from above to push the QE + * into the qid at the right priority + */ + + qid->iq_pkt_mask |= (1 << (iq_num)); + iq_ring_enqueue(qid->iq[iq_num], qe); + qid->iq_pkt_count[iq_num]++; + qid->stats.rx_pkts++; + pkts_iter++; + } + +end_qe: + port->pp_buf_start++; + port->pp_buf_count--; + } /* while (avail_qes) */ + + return pkts_iter; +} + +static uint32_t +sw_schedule_pull_port_lb(struct sw_evdev *sw, uint32_t port_id) +{ + return __pull_port_lb(sw, port_id, 1); +} + +static uint32_t +sw_schedule_pull_port_no_reorder(struct sw_evdev *sw, uint32_t port_id) +{ + return __pull_port_lb(sw, port_id, 0); +} + +static uint32_t +sw_schedule_pull_port_dir(struct sw_evdev *sw, uint32_t port_id) +{ + uint32_t pkts_iter = 0; + struct sw_port *port = &sw->ports[port_id]; + + /* If shadow ring has 0 pkts, pull from worker ring */ + if (port->pp_buf_count == 0) + sw_refill_pp_buf(sw, port); + + while (port->pp_buf_count) { + const struct rte_event *qe = &port->pp_buf[port->pp_buf_start]; + uint8_t flags = qe->op; + + if ((flags & QE_FLAG_VALID) == 0) + goto end_qe; + + uint32_t iq_num = PRIO_TO_IQ(qe->priority); + struct sw_qid *qid = &sw->qids[qe->queue_id]; + struct iq_ring *iq_ring = qid->iq[iq_num]; + + if (iq_ring_free_count(iq_ring) == 0) + break; /* move to next port */ + + port->stats.rx_pkts++; + + /* Use the iq_num from above to push the QE + * into the qid at the right priority + */ + qid->iq_pkt_mask |= (1 << (iq_num)); + iq_ring_enqueue(iq_ring, qe); + qid->iq_pkt_count[iq_num]++; + qid->stats.rx_pkts++; + pkts_iter++; + +end_qe: + port->pp_buf_start++; + port->pp_buf_count--; + } /* while port->pp_buf_count */ + + return pkts_iter; +} + +void +sw_event_schedule(struct rte_eventdev *dev) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + 
uint32_t in_pkts, out_pkts; + uint32_t out_pkts_total = 0, in_pkts_total = 0; + int32_t sched_quanta = sw->sched_quanta; + uint32_t i; + + sw->sched_called++; + if (!sw->started) + return; + + do { + uint32_t in_pkts_this_iteration = 0; + + /* Pull from rx_ring for ports */ + do { + in_pkts = 0; + for (i = 0; i < sw->port_count; i++) + if (sw->ports[i].is_directed) + in_pkts += sw_schedule_pull_port_dir(sw, i); + else if (sw->ports[i].num_ordered_qids > 0) + in_pkts += sw_schedule_pull_port_lb(sw, i); + else + in_pkts += sw_schedule_pull_port_no_reorder(sw, i); + + /* QID scan for re-ordered */ + in_pkts += sw_schedule_reorder(sw, 0, + sw->qid_count); + in_pkts_this_iteration += in_pkts; + } while (in_pkts > 4 && + (int)in_pkts_this_iteration < sched_quanta); + + out_pkts = 0; + out_pkts += sw_schedule_qid_to_cq(sw); + out_pkts_total += out_pkts; + in_pkts_total += in_pkts_this_iteration; + + if (in_pkts == 0 && out_pkts == 0) + break; + } while ((int)out_pkts_total < sched_quanta); + + /* push all the internal buffered QEs in port->cq_ring to the + * worker cores: aka, do the ring transfers batched. + */ + for (i = 0; i < sw->port_count; i++) { + struct qe_ring *worker = sw->ports[i].cq_worker_ring; + qe_ring_enqueue_burst(worker, sw->ports[i].cq_buf, + sw->ports[i].cq_buf_count, + &sw->cq_ring_space[i]); + sw->ports[i].cq_buf_count = 0; + } + + sw->stats.tx_pkts += out_pkts_total; + sw->stats.rx_pkts += in_pkts_total; + + sw->sched_no_iq_enqueues += (in_pkts_total == 0); + sw->sched_no_cq_enqueues += (out_pkts_total == 0); + +} diff --git a/src/seastar/dpdk/drivers/event/sw/sw_evdev_worker.c b/src/seastar/dpdk/drivers/event/sw/sw_evdev_worker.c new file mode 100644 index 00000000..9cb6bef5 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/sw_evdev_worker.c @@ -0,0 +1,185 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "sw_evdev.h" +#include "event_ring.h" + +#define PORT_ENQUEUE_MAX_BURST_SIZE 64 + +static inline void +sw_event_release(struct sw_port *p, uint8_t index) +{ + /* + * Drops the next outstanding event in our history. Used on dequeue + * to clear any history before dequeuing more events. 
+ */ + RTE_SET_USED(index); + + /* create drop message */ + struct rte_event ev; + ev.op = sw_qe_flag_map[RTE_EVENT_OP_RELEASE]; + + uint16_t free_count; + qe_ring_enqueue_burst(p->rx_worker_ring, &ev, 1, &free_count); + + /* each release returns one credit */ + p->outstanding_releases--; + p->inflight_credits++; +} + +uint16_t +sw_event_enqueue_burst(void *port, const struct rte_event ev[], uint16_t num) +{ + int32_t i; + uint8_t new_ops[PORT_ENQUEUE_MAX_BURST_SIZE]; + struct sw_port *p = port; + struct sw_evdev *sw = (void *)p->sw; + uint32_t sw_inflights = rte_atomic32_read(&sw->inflights); + + if (unlikely(p->inflight_max < sw_inflights)) + return 0; + + if (num > PORT_ENQUEUE_MAX_BURST_SIZE) + num = PORT_ENQUEUE_MAX_BURST_SIZE; + + if (p->inflight_credits < num) { + /* check if event enqueue brings port over max threshold */ + uint32_t credit_update_quanta = sw->credit_update_quanta; + if (sw_inflights + credit_update_quanta > sw->nb_events_limit) + return 0; + + rte_atomic32_add(&sw->inflights, credit_update_quanta); + p->inflight_credits += (credit_update_quanta); + + if (p->inflight_credits < num) + return 0; + } + + for (i = 0; i < num; i++) { + int op = ev[i].op; + int outstanding = p->outstanding_releases > 0; + const uint8_t invalid_qid = (ev[i].queue_id >= sw->qid_count); + + p->inflight_credits -= (op == RTE_EVENT_OP_NEW); + p->inflight_credits += (op == RTE_EVENT_OP_RELEASE) * + outstanding; + + new_ops[i] = sw_qe_flag_map[op]; + new_ops[i] &= ~(invalid_qid << QE_FLAG_VALID_SHIFT); + + /* FWD and RELEASE packets will both resolve to taken (assuming + * correct usage of the API), providing very high correct + * prediction rate. 
+ */ + if ((new_ops[i] & QE_FLAG_COMPLETE) && outstanding) + p->outstanding_releases--; + + /* error case: branch to avoid touching p->stats */ + if (unlikely(invalid_qid)) { + p->stats.rx_dropped++; + p->inflight_credits++; + } + } + + /* returns number of events actually enqueued */ + uint32_t enq = qe_ring_enqueue_burst_with_ops(p->rx_worker_ring, ev, i, + new_ops); + if (p->outstanding_releases == 0 && p->last_dequeue_burst_sz != 0) { + uint64_t burst_ticks = rte_get_timer_cycles() - + p->last_dequeue_ticks; + uint64_t burst_pkt_ticks = + burst_ticks / p->last_dequeue_burst_sz; + p->avg_pkt_ticks -= p->avg_pkt_ticks / NUM_SAMPLES; + p->avg_pkt_ticks += burst_pkt_ticks / NUM_SAMPLES; + p->last_dequeue_ticks = 0; + } + return enq; +} + +uint16_t +sw_event_enqueue(void *port, const struct rte_event *ev) +{ + return sw_event_enqueue_burst(port, ev, 1); +} + +uint16_t +sw_event_dequeue_burst(void *port, struct rte_event *ev, uint16_t num, + uint64_t wait) +{ + RTE_SET_USED(wait); + struct sw_port *p = (void *)port; + struct sw_evdev *sw = (void *)p->sw; + struct qe_ring *ring = p->cq_worker_ring; + uint32_t credit_update_quanta = sw->credit_update_quanta; + + /* check that all previous dequeues have been released */ + if (!p->is_directed) { + uint16_t out_rels = p->outstanding_releases; + uint16_t i; + for (i = 0; i < out_rels; i++) + sw_event_release(p, i); + } + + /* returns number of events actually dequeued */ + uint16_t ndeq = qe_ring_dequeue_burst(ring, ev, num); + if (unlikely(ndeq == 0)) { + p->outstanding_releases = 0; + p->zero_polls++; + p->total_polls++; + goto end; + } + + /* only add credits for directed ports - LB ports send RELEASEs */ + p->inflight_credits += ndeq * p->is_directed; + p->outstanding_releases = ndeq; + p->last_dequeue_burst_sz = ndeq; + p->last_dequeue_ticks = rte_get_timer_cycles(); + p->poll_buckets[(ndeq - 1) >> SW_DEQ_STAT_BUCKET_SHIFT]++; + p->total_polls++; + +end: + if (p->inflight_credits >= credit_update_quanta * 2 && + 
p->inflight_credits > credit_update_quanta + ndeq) { + rte_atomic32_sub(&sw->inflights, credit_update_quanta); + p->inflight_credits -= credit_update_quanta; + } + return ndeq; +} + +uint16_t +sw_event_dequeue(void *port, struct rte_event *ev, uint64_t wait) +{ + return sw_event_dequeue_burst(port, ev, 1, wait); +} diff --git a/src/seastar/dpdk/drivers/event/sw/sw_evdev_xstats.c b/src/seastar/dpdk/drivers/event/sw/sw_evdev_xstats.c new file mode 100644 index 00000000..c7b1abe8 --- /dev/null +++ b/src/seastar/dpdk/drivers/event/sw/sw_evdev_xstats.c @@ -0,0 +1,674 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sw_evdev.h" +#include "iq_ring.h" +#include "event_ring.h" + +enum xstats_type { + /* common stats */ + rx, + tx, + dropped, + inflight, + calls, + credits, + /* device instance specific */ + no_iq_enq, + no_cq_enq, + /* port_specific */ + rx_used, + rx_free, + tx_used, + tx_free, + pkt_cycles, + poll_return, /* for zero-count and used also for port bucket loop */ + /* qid_specific */ + iq_size, + iq_used, + /* qid port mapping specific */ + pinned, +}; + +typedef uint64_t (*xstats_fn)(const struct sw_evdev *dev, + uint16_t obj_idx, /* port or queue id */ + enum xstats_type stat, int extra_arg); + +struct sw_xstats_entry { + struct rte_event_dev_xstats_name name; + xstats_fn fn; + uint16_t obj_idx; + enum xstats_type stat; + enum rte_event_dev_xstats_mode mode; + int extra_arg; + uint8_t reset_allowed; /* when set, this value can be reset */ + uint64_t reset_value; /* an offset to be taken away to emulate resets */ +}; + +static uint64_t +get_dev_stat(const struct sw_evdev *sw, uint16_t obj_idx __rte_unused, + enum xstats_type type, int extra_arg __rte_unused) +{ + switch (type) { + case rx: return sw->stats.rx_pkts; + case tx: return sw->stats.tx_pkts; + case dropped: return sw->stats.rx_dropped; + case calls: return sw->sched_called; + case no_iq_enq: return sw->sched_no_iq_enqueues; + case no_cq_enq: return sw->sched_no_cq_enqueues; + default: return -1; + } +} + +static uint64_t +get_port_stat(const struct 
sw_evdev *sw, uint16_t obj_idx, + enum xstats_type type, int extra_arg __rte_unused) +{ + const struct sw_port *p = &sw->ports[obj_idx]; + + switch (type) { + case rx: return p->stats.rx_pkts; + case tx: return p->stats.tx_pkts; + case dropped: return p->stats.rx_dropped; + case inflight: return p->inflights; + case pkt_cycles: return p->avg_pkt_ticks; + case calls: return p->total_polls; + case credits: return p->inflight_credits; + case poll_return: return p->zero_polls; + case rx_used: return qe_ring_count(p->rx_worker_ring); + case rx_free: return qe_ring_free_count(p->rx_worker_ring); + case tx_used: return qe_ring_count(p->cq_worker_ring); + case tx_free: return qe_ring_free_count(p->cq_worker_ring); + default: return -1; + } +} + +static uint64_t +get_port_bucket_stat(const struct sw_evdev *sw, uint16_t obj_idx, + enum xstats_type type, int extra_arg) +{ + const struct sw_port *p = &sw->ports[obj_idx]; + + switch (type) { + case poll_return: return p->poll_buckets[extra_arg]; + default: return -1; + } +} + +static uint64_t +get_qid_stat(const struct sw_evdev *sw, uint16_t obj_idx, + enum xstats_type type, int extra_arg __rte_unused) +{ + const struct sw_qid *qid = &sw->qids[obj_idx]; + + switch (type) { + case rx: return qid->stats.rx_pkts; + case tx: return qid->stats.tx_pkts; + case dropped: return qid->stats.rx_dropped; + case inflight: + do { + uint64_t infl = 0; + unsigned int i; + for (i = 0; i < RTE_DIM(qid->fids); i++) + infl += qid->fids[i].pcount; + return infl; + } while (0); + break; + case iq_size: return RTE_DIM(qid->iq[0]->ring); + default: return -1; + } +} + +static uint64_t +get_qid_iq_stat(const struct sw_evdev *sw, uint16_t obj_idx, + enum xstats_type type, int extra_arg) +{ + const struct sw_qid *qid = &sw->qids[obj_idx]; + const int iq_idx = extra_arg; + + switch (type) { + case iq_used: return iq_ring_count(qid->iq[iq_idx]); + default: return -1; + } +} + +static uint64_t +get_qid_port_stat(const struct sw_evdev *sw, uint16_t obj_idx, 
+ enum xstats_type type, int extra_arg) +{ + const struct sw_qid *qid = &sw->qids[obj_idx]; + uint16_t port = extra_arg; + + switch (type) { + case pinned: + do { + uint64_t pin = 0; + unsigned int i; + for (i = 0; i < RTE_DIM(qid->fids); i++) + if (qid->fids[i].cq == port) + pin++; + return pin; + } while (0); + break; + default: return -1; + } +} + +int +sw_xstats_init(struct sw_evdev *sw) +{ + /* + * define the stats names and types. Used to build up the device + * xstats array + * There are multiple set of stats: + * - device-level, + * - per-port, + * - per-port-dequeue-burst-sizes + * - per-qid, + * - per-iq + * - per-port-per-qid + * + * For each of these sets, we have three parallel arrays, one for the + * names, the other for the stat type parameter to be passed in the fn + * call to get that stat. The third array allows resetting or not. + * All these arrays must be kept in sync + */ + static const char * const dev_stats[] = { "rx", "tx", "drop", + "sched_calls", "sched_no_iq_enq", "sched_no_cq_enq", + }; + static const enum xstats_type dev_types[] = { rx, tx, dropped, + calls, no_iq_enq, no_cq_enq, + }; + /* all device stats are allowed to be reset */ + + static const char * const port_stats[] = {"rx", "tx", "drop", + "inflight", "avg_pkt_cycles", "credits", + "rx_ring_used", "rx_ring_free", + "cq_ring_used", "cq_ring_free", + "dequeue_calls", "dequeues_returning_0", + }; + static const enum xstats_type port_types[] = { rx, tx, dropped, + inflight, pkt_cycles, credits, + rx_used, rx_free, tx_used, tx_free, + calls, poll_return, + }; + static const uint8_t port_reset_allowed[] = {1, 1, 1, + 0, 1, 0, + 0, 0, 0, 0, + 1, 1, + }; + + static const char * const port_bucket_stats[] = { + "dequeues_returning" }; + static const enum xstats_type port_bucket_types[] = { poll_return }; + /* all bucket dequeues are allowed to be reset, handled in loop below */ + + static const char * const qid_stats[] = {"rx", "tx", "drop", + "inflight", "iq_size" + }; + static const 
enum xstats_type qid_types[] = { rx, tx, dropped, + inflight, iq_size + }; + static const uint8_t qid_reset_allowed[] = {1, 1, 1, + 0, 0 + }; + + static const char * const qid_iq_stats[] = { "used" }; + static const enum xstats_type qid_iq_types[] = { iq_used }; + /* reset allowed */ + + static const char * const qid_port_stats[] = { "pinned_flows" }; + static const enum xstats_type qid_port_types[] = { pinned }; + /* reset allowed */ + /* ---- end of stat definitions ---- */ + + /* check sizes, since a missed comma can lead to strings being + * joined by the compiler. + */ + RTE_BUILD_BUG_ON(RTE_DIM(dev_stats) != RTE_DIM(dev_types)); + RTE_BUILD_BUG_ON(RTE_DIM(port_stats) != RTE_DIM(port_types)); + RTE_BUILD_BUG_ON(RTE_DIM(qid_stats) != RTE_DIM(qid_types)); + RTE_BUILD_BUG_ON(RTE_DIM(qid_iq_stats) != RTE_DIM(qid_iq_types)); + RTE_BUILD_BUG_ON(RTE_DIM(qid_port_stats) != RTE_DIM(qid_port_types)); + RTE_BUILD_BUG_ON(RTE_DIM(port_bucket_stats) != + RTE_DIM(port_bucket_types)); + + RTE_BUILD_BUG_ON(RTE_DIM(port_stats) != RTE_DIM(port_reset_allowed)); + RTE_BUILD_BUG_ON(RTE_DIM(qid_stats) != RTE_DIM(qid_reset_allowed)); + + /* other vars */ + const uint32_t cons_bkt_shift = + (MAX_SW_CONS_Q_DEPTH >> SW_DEQ_STAT_BUCKET_SHIFT); + const unsigned int count = RTE_DIM(dev_stats) + + sw->port_count * RTE_DIM(port_stats) + + sw->port_count * RTE_DIM(port_bucket_stats) * + (cons_bkt_shift + 1) + + sw->qid_count * RTE_DIM(qid_stats) + + sw->qid_count * SW_IQS_MAX * RTE_DIM(qid_iq_stats) + + sw->qid_count * sw->port_count * + RTE_DIM(qid_port_stats); + unsigned int i, port, qid, iq, bkt, stat = 0; + + sw->xstats = rte_zmalloc_socket(NULL, sizeof(sw->xstats[0]) * count, 0, + sw->data->socket_id); + if (sw->xstats == NULL) + return -ENOMEM; + +#define sname sw->xstats[stat].name.name + for (i = 0; i < RTE_DIM(dev_stats); i++, stat++) { + sw->xstats[stat] = (struct sw_xstats_entry){ + .fn = get_dev_stat, + .stat = dev_types[i], + .mode = RTE_EVENT_DEV_XSTATS_DEVICE, + .reset_allowed 
= 1, + }; + snprintf(sname, sizeof(sname), "dev_%s", dev_stats[i]); + } + sw->xstats_count_mode_dev = stat; + + for (port = 0; port < sw->port_count; port++) { + sw->xstats_offset_for_port[port] = stat; + + uint32_t count_offset = stat; + + for (i = 0; i < RTE_DIM(port_stats); i++, stat++) { + sw->xstats[stat] = (struct sw_xstats_entry){ + .fn = get_port_stat, + .obj_idx = port, + .stat = port_types[i], + .mode = RTE_EVENT_DEV_XSTATS_PORT, + .reset_allowed = port_reset_allowed[i], + }; + snprintf(sname, sizeof(sname), "port_%u_%s", + port, port_stats[i]); + } + + for (bkt = 0; bkt < (sw->ports[port].cq_worker_ring->size >> + SW_DEQ_STAT_BUCKET_SHIFT) + 1; bkt++) { + for (i = 0; i < RTE_DIM(port_bucket_stats); i++) { + sw->xstats[stat] = (struct sw_xstats_entry){ + .fn = get_port_bucket_stat, + .obj_idx = port, + .stat = port_bucket_types[i], + .mode = RTE_EVENT_DEV_XSTATS_PORT, + .extra_arg = bkt, + .reset_allowed = 1, + }; + snprintf(sname, sizeof(sname), + "port_%u_%s_%u-%u", + port, port_bucket_stats[i], + (bkt << SW_DEQ_STAT_BUCKET_SHIFT) + 1, + (bkt + 1) << SW_DEQ_STAT_BUCKET_SHIFT); + stat++; + } + } + + sw->xstats_count_per_port[port] = stat - count_offset; + } + + sw->xstats_count_mode_port = stat - sw->xstats_count_mode_dev; + + for (qid = 0; qid < sw->qid_count; qid++) { + uint32_t count_offset = stat; + sw->xstats_offset_for_qid[qid] = stat; + + for (i = 0; i < RTE_DIM(qid_stats); i++, stat++) { + sw->xstats[stat] = (struct sw_xstats_entry){ + .fn = get_qid_stat, + .obj_idx = qid, + .stat = qid_types[i], + .mode = RTE_EVENT_DEV_XSTATS_QUEUE, + .reset_allowed = qid_reset_allowed[i], + }; + snprintf(sname, sizeof(sname), "qid_%u_%s", + qid, qid_stats[i]); + } + for (iq = 0; iq < SW_IQS_MAX; iq++) + for (i = 0; i < RTE_DIM(qid_iq_stats); i++, stat++) { + sw->xstats[stat] = (struct sw_xstats_entry){ + .fn = get_qid_iq_stat, + .obj_idx = qid, + .stat = qid_iq_types[i], + .mode = RTE_EVENT_DEV_XSTATS_QUEUE, + .extra_arg = iq, + .reset_allowed = 0, + }; + 
snprintf(sname, sizeof(sname), + "qid_%u_iq_%u_%s", + qid, iq, + qid_iq_stats[i]); + } + + for (port = 0; port < sw->port_count; port++) + for (i = 0; i < RTE_DIM(qid_port_stats); i++, stat++) { + sw->xstats[stat] = (struct sw_xstats_entry){ + .fn = get_qid_port_stat, + .obj_idx = qid, + .stat = qid_port_types[i], + .mode = RTE_EVENT_DEV_XSTATS_QUEUE, + .extra_arg = port, + .reset_allowed = 0, + }; + snprintf(sname, sizeof(sname), + "qid_%u_port_%u_%s", + qid, port, + qid_port_stats[i]); + } + + sw->xstats_count_per_qid[qid] = stat - count_offset; + } + + sw->xstats_count_mode_queue = stat - + (sw->xstats_count_mode_dev + sw->xstats_count_mode_port); +#undef sname + + sw->xstats_count = stat; + + return stat; +} + +int +sw_xstats_uninit(struct sw_evdev *sw) +{ + rte_free(sw->xstats); + sw->xstats_count = 0; + return 0; +} + +int +sw_xstats_get_names(const struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + struct rte_event_dev_xstats_name *xstats_names, + unsigned int *ids, unsigned int size) +{ + const struct sw_evdev *sw = sw_pmd_priv_const(dev); + unsigned int i; + unsigned int xidx = 0; + RTE_SET_USED(mode); + RTE_SET_USED(queue_port_id); + + uint32_t xstats_mode_count = 0; + uint32_t start_offset = 0; + + switch (mode) { + case RTE_EVENT_DEV_XSTATS_DEVICE: + xstats_mode_count = sw->xstats_count_mode_dev; + break; + case RTE_EVENT_DEV_XSTATS_PORT: + if (queue_port_id >= (signed int)sw->port_count) + break; + xstats_mode_count = sw->xstats_count_per_port[queue_port_id]; + start_offset = sw->xstats_offset_for_port[queue_port_id]; + break; + case RTE_EVENT_DEV_XSTATS_QUEUE: + if (queue_port_id >= (signed int)sw->qid_count) + break; + xstats_mode_count = sw->xstats_count_per_qid[queue_port_id]; + start_offset = sw->xstats_offset_for_qid[queue_port_id]; + break; + default: + SW_LOG_ERR("Invalid mode received in sw_xstats_get_names()\n"); + return -EINVAL; + }; + + if (xstats_mode_count > size || !ids || !xstats_names) + return 
xstats_mode_count; + + for (i = 0; i < sw->xstats_count && xidx < size; i++) { + if (sw->xstats[i].mode != mode) + continue; + + if (mode != RTE_EVENT_DEV_XSTATS_DEVICE && + queue_port_id != sw->xstats[i].obj_idx) + continue; + + xstats_names[xidx] = sw->xstats[i].name; + if (ids) + ids[xidx] = start_offset + xidx; + xidx++; + } + return xidx; +} + +static int +sw_xstats_update(struct sw_evdev *sw, enum rte_event_dev_xstats_mode mode, + uint8_t queue_port_id, const unsigned int ids[], + uint64_t values[], unsigned int n, const uint32_t reset, + const uint32_t ret_if_n_lt_nstats) +{ + unsigned int i; + unsigned int xidx = 0; + RTE_SET_USED(mode); + RTE_SET_USED(queue_port_id); + + uint32_t xstats_mode_count = 0; + + switch (mode) { + case RTE_EVENT_DEV_XSTATS_DEVICE: + xstats_mode_count = sw->xstats_count_mode_dev; + break; + case RTE_EVENT_DEV_XSTATS_PORT: + if (queue_port_id >= (signed int)sw->port_count) + goto invalid_value; + xstats_mode_count = sw->xstats_count_per_port[queue_port_id]; + break; + case RTE_EVENT_DEV_XSTATS_QUEUE: + if (queue_port_id >= (signed int)sw->qid_count) + goto invalid_value; + xstats_mode_count = sw->xstats_count_per_qid[queue_port_id]; + break; + default: + SW_LOG_ERR("Invalid mode received in sw_xstats_get()\n"); + goto invalid_value; + }; + + /* this function can check num stats and return them (xstats_get() style + * behaviour) or ignore n for reset() of a single stat style behaviour. 
+ */ + if (ret_if_n_lt_nstats && xstats_mode_count > n) + return xstats_mode_count; + + for (i = 0; i < n && xidx < xstats_mode_count; i++) { + struct sw_xstats_entry *xs = &sw->xstats[ids[i]]; + if (ids[i] > sw->xstats_count || xs->mode != mode) + continue; + + if (mode != RTE_EVENT_DEV_XSTATS_DEVICE && + queue_port_id != xs->obj_idx) + continue; + + uint64_t val = xs->fn(sw, xs->obj_idx, xs->stat, xs->extra_arg) + - xs->reset_value; + + if (values) + values[xidx] = val; + + if (xs->reset_allowed && reset) + xs->reset_value = val; + + xidx++; + } + + return xidx; +invalid_value: + return -EINVAL; +} + +int +sw_xstats_get(const struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id, + const unsigned int ids[], uint64_t values[], unsigned int n) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + const uint32_t reset = 0; + const uint32_t ret_n_lt_stats = 1; + return sw_xstats_update(sw, mode, queue_port_id, ids, values, n, + reset, ret_n_lt_stats); +} + +uint64_t +sw_xstats_get_by_name(const struct rte_eventdev *dev, + const char *name, unsigned int *id) +{ + const struct sw_evdev *sw = sw_pmd_priv_const(dev); + unsigned int i; + + for (i = 0; i < sw->xstats_count; i++) { + struct sw_xstats_entry *xs = &sw->xstats[i]; + if (strncmp(xs->name.name, name, + RTE_EVENT_DEV_XSTATS_NAME_SIZE) == 0){ + if (id != NULL) + *id = i; + return xs->fn(sw, xs->obj_idx, xs->stat, xs->extra_arg) + - xs->reset_value; + } + } + if (id != NULL) + *id = (uint32_t)-1; + return (uint64_t)-1; +} + +static void +sw_xstats_reset_range(struct sw_evdev *sw, uint32_t start, uint32_t num) +{ + uint32_t i; + for (i = start; i < start + num; i++) { + struct sw_xstats_entry *xs = &sw->xstats[i]; + if (!xs->reset_allowed) + continue; + + uint64_t val = xs->fn(sw, xs->obj_idx, xs->stat, xs->extra_arg) + - xs->reset_value; + xs->reset_value = val; + } +} + +static int +sw_xstats_reset_queue(struct sw_evdev *sw, uint8_t queue_id, + const uint32_t ids[], uint32_t nb_ids) +{ + 
const uint32_t reset = 1; + const uint32_t ret_n_lt_stats = 0; + if (ids) { + uint32_t nb_reset = sw_xstats_update(sw, + RTE_EVENT_DEV_XSTATS_QUEUE, + queue_id, ids, NULL, nb_ids, + reset, ret_n_lt_stats); + return nb_reset == nb_ids ? 0 : -EINVAL; + } + + if (ids == NULL) + sw_xstats_reset_range(sw, sw->xstats_offset_for_qid[queue_id], + sw->xstats_count_per_qid[queue_id]); + + return 0; +} + +static int +sw_xstats_reset_port(struct sw_evdev *sw, uint8_t port_id, + const uint32_t ids[], uint32_t nb_ids) +{ + const uint32_t reset = 1; + const uint32_t ret_n_lt_stats = 0; + int offset = sw->xstats_offset_for_port[port_id]; + int nb_stat = sw->xstats_count_per_port[port_id]; + + if (ids) { + uint32_t nb_reset = sw_xstats_update(sw, + RTE_EVENT_DEV_XSTATS_PORT, port_id, + ids, NULL, nb_ids, + reset, ret_n_lt_stats); + return nb_reset == nb_ids ? 0 : -EINVAL; + } + + sw_xstats_reset_range(sw, offset, nb_stat); + return 0; +} + +static int +sw_xstats_reset_dev(struct sw_evdev *sw, const uint32_t ids[], uint32_t nb_ids) +{ + uint32_t i; + if (ids) { + for (i = 0; i < nb_ids; i++) { + uint32_t id = ids[i]; + if (id >= sw->xstats_count_mode_dev) + return -EINVAL; + sw_xstats_reset_range(sw, id, 1); + } + } else { + for (i = 0; i < sw->xstats_count_mode_dev; i++) + sw_xstats_reset_range(sw, i, 1); + } + + return 0; +} + +int +sw_xstats_reset(struct rte_eventdev *dev, + enum rte_event_dev_xstats_mode mode, + int16_t queue_port_id, + const uint32_t ids[], + uint32_t nb_ids) +{ + struct sw_evdev *sw = sw_pmd_priv(dev); + uint32_t i, err; + + /* handle -1 for queue_port_id here, looping over all ports/queues */ + switch (mode) { + case RTE_EVENT_DEV_XSTATS_DEVICE: + sw_xstats_reset_dev(sw, ids, nb_ids); + break; + case RTE_EVENT_DEV_XSTATS_PORT: + if (queue_port_id == -1) { + for (i = 0; i < sw->port_count; i++) { + err = sw_xstats_reset_port(sw, i, ids, nb_ids); + if (err) + return -EINVAL; + } + } else if (queue_port_id < (int16_t)sw->port_count) + sw_xstats_reset_port(sw, 
queue_port_id, ids, nb_ids); + break; + case RTE_EVENT_DEV_XSTATS_QUEUE: + if (queue_port_id == -1) { + for (i = 0; i < sw->qid_count; i++) { + err = sw_xstats_reset_queue(sw, i, ids, nb_ids); + if (err) + return -EINVAL; + } + } else if (queue_port_id < (int16_t)sw->qid_count) + sw_xstats_reset_queue(sw, queue_port_id, ids, nb_ids); + break; + }; + + return 0; +} -- cgit v1.2.3