diff options
Diffstat (limited to 'src/seastar/dpdk/drivers/net/bonding')
12 files changed, 7164 insertions, 0 deletions
diff --git a/src/seastar/dpdk/drivers/net/bonding/Makefile b/src/seastar/dpdk/drivers/net/bonding/Makefile new file mode 100644 index 00000000..910c932d --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/Makefile @@ -0,0 +1,61 @@ +# BSD LICENSE +# +# Copyright(c) 2010-2014 Intel Corporation. All rights reserved. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pmd_bond.a + +CFLAGS += -O3 +CFLAGS += $(WERROR_FLAGS) + +EXPORT_MAP := rte_eth_bond_version.map + +LIBABIVER := 1 + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_api.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_pmd.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_args.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_8023ad.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_alb.c + +# +# Export include files +# +SYMLINK-y-include += rte_eth_bond.h +SYMLINK-y-include += rte_eth_bond_8023ad.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond.h b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond.h new file mode 100644 index 00000000..8efbf071 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond.h @@ -0,0 +1,377 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ETH_BOND_H_ +#define _RTE_ETH_BOND_H_ + +/** + * @file rte_eth_bond.h + * + * RTE Link Bonding Ethernet Device + * Link Bonding for 1GbE and 10GbE ports to allow the aggregation of multiple + * (slave) NICs into a single logical interface. The bonded device processes + * these interfaces based on the mode of operation specified and supported. + * This implementation supports 4 modes of operation round robin, active backup + * balance and broadcast. Providing redundant links, fault tolerance and/or + * load balancing of network ports + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <rte_ether.h> + +/* Supported modes of operation of link bonding library */ + +#define BONDING_MODE_ROUND_ROBIN (0) +/**< Round Robin (Mode 0). + * In this mode all transmitted packets will be balanced equally across all + * active slaves of the bonded in a round robin fashion. */ +#define BONDING_MODE_ACTIVE_BACKUP (1) +/**< Active Backup (Mode 1). + * In this mode all packets transmitted will be transmitted on the primary + * slave until such point as the primary slave is no longer available and then + * transmitted packets will be sent on the next available slaves. 
The primary + * slave can be defined by the user but defaults to the first active slave + * available if not specified. */ +#define BONDING_MODE_BALANCE (2) +/**< Balance (Mode 2). + * In this mode all packets transmitted will be balanced across the available + * slaves using one of three available transmit policies - l2, l2+3 or l3+4. + * See BALANCE_XMIT_POLICY macro definitions for further details on transmit + * policies. */ +#define BONDING_MODE_BROADCAST (3) +/**< Broadcast (Mode 3). + * In this mode all transmitted packets will be transmitted on all available + * active slaves of the bonded device. */ +#define BONDING_MODE_8023AD (4) +/**< 802.3AD (Mode 4). + * + * This mode provides auto negotiation/configuration + * of peers as well as link status change monitoring using out of band + * LACP (link aggregation control protocol) messages. For further details of + * LACP specification see the IEEE 802.3ad/802.1AX standards. It is also + * described here + * https://www.kernel.org/doc/Documentation/networking/bonding.txt. + * + * Important Usage Notes: + * - for LACP mode to work the rx/tx burst functions must be invoked + * at least once every 100ms, otherwise the out-of-band LACP messages will not + * be handled with the expected latency and this may cause the link status to be + * incorrectly marked as down or failure to correctly negotiate with peers. + * - For optimal performance during initial handshaking the array of mbufs provided + * to rx_burst should be at least 2 times the slave count size. + * + */ +#define BONDING_MODE_TLB (5) +/**< Adaptive TLB (Mode 5) + * This mode provides an adaptive transmit load balancing. It dynamically + * changes the transmitting slave, according to the computed load. Statistics + * are collected in 100ms intervals and scheduled every 10ms */ +#define BONDING_MODE_ALB (6) +/**< Adaptive Load Balancing (Mode 6) + * This mode includes adaptive TLB and receive load balancing (RLB). 
In RLB the + * bonding driver intercepts ARP replies sent by the local system and overwrites its + * source MAC address, so that different peers send data to the server on + * different slave interfaces. When the local system sends an ARP request, it saves IP + * information from it. When an ARP reply from that peer is received, its MAC is + * stored, one of the slave MACs is assigned and an ARP reply is sent to that peer. + */ + +/* Balance Mode Transmit Policies */ +#define BALANCE_XMIT_POLICY_LAYER2 (0) +/**< Layer 2 (Ethernet MAC) */ +#define BALANCE_XMIT_POLICY_LAYER23 (1) +/**< Layer 2+3 (Ethernet MAC + IP Addresses) transmit load balancing */ +#define BALANCE_XMIT_POLICY_LAYER34 (2) +/**< Layer 3+4 (IP Addresses + UDP Ports) transmit load balancing */ + +/** + * Create a bonded rte_eth_dev device + * + * @param name Name of new link bonding device. + * @param mode Mode to initialize bonding device in. + * @param socket_id Socket Id on which to allocate eth_dev resources. + * + * @return + * Port Id of created rte_eth_dev on success, negative value otherwise + */ +int +rte_eth_bond_create(const char *name, uint8_t mode, uint8_t socket_id); + +/** + * Free a bonded rte_eth_dev device + * + * @param name Name of the link bonding device. + * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_free(const char *name); + +/** + * Add a rte_eth_dev device as a slave to the bonded device + * + * @param bonded_port_id Port ID of bonded device. + * @param slave_port_id Port ID of slave device. + * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_slave_add(uint8_t bonded_port_id, uint8_t slave_port_id); + +/** + * Remove a slave rte_eth_dev device from the bonded device + * + * @param bonded_port_id Port ID of bonded device. + * @param slave_port_id Port ID of slave device. 
+ * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_slave_remove(uint8_t bonded_port_id, uint8_t slave_port_id); + +/** + * Set link bonding mode of bonded device + * + * @param bonded_port_id Port ID of bonded device. + * @param mode Bonding mode to set + * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_mode_set(uint8_t bonded_port_id, uint8_t mode); + +/** + * Get link bonding mode of bonded device + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * link bonding mode on success, negative value otherwise + */ +int +rte_eth_bond_mode_get(uint8_t bonded_port_id); + +/** + * Set slave rte_eth_dev as primary slave of bonded device + * + * @param bonded_port_id Port ID of bonded device. + * @param slave_port_id Port ID of slave device. + * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_primary_set(uint8_t bonded_port_id, uint8_t slave_port_id); + +/** + * Get primary slave of bonded device + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * Port Id of primary slave on success, -1 on failure + */ +int +rte_eth_bond_primary_get(uint8_t bonded_port_id); + +/** + * Populate an array with list of the slaves port id's of the bonded device + * + * @param bonded_port_id Port ID of bonded eth_dev to interrogate + * @param slaves Array to be populated with the current active slaves + * @param len Length of slaves array + * + * @return + * Number of slaves associated with bonded device on success, + * negative value otherwise + */ +int +rte_eth_bond_slaves_get(uint8_t bonded_port_id, uint8_t slaves[], uint8_t len); + +/** + * Populate an array with list of the active slaves port id's of the bonded + * device. 
+ * + * @param bonded_port_id Port ID of bonded eth_dev to interrogate + * @param slaves Array to be populated with the current active slaves + * @param len Length of slaves array + * + * @return + * Number of active slaves associated with bonded device on success, + * negative value otherwise + */ +int +rte_eth_bond_active_slaves_get(uint8_t bonded_port_id, uint8_t slaves[], + uint8_t len); + +/** + * Set explicit MAC address to use on bonded device and its slaves. + * + * @param bonded_port_id Port ID of bonded device. + * @param mac_addr MAC Address to use on bonded device overriding + * slaves' MAC addresses + * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_mac_address_set(uint8_t bonded_port_id, + struct ether_addr *mac_addr); + +/** + * Reset bonded device to use MAC from primary slave on bonded device and its + * slaves. + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * 0 on success, negative value otherwise + */ +int +rte_eth_bond_mac_address_reset(uint8_t bonded_port_id); + +/** + * Set the transmit policy for bonded device to use when it is operating in + * balance mode; this parameter is ignored in other modes of + * operation. + * + * @param bonded_port_id Port ID of bonded device. + * @param policy Balance mode transmission policy. + * + * @return + * 0 on success, negative value otherwise. + */ +int +rte_eth_bond_xmit_policy_set(uint8_t bonded_port_id, uint8_t policy); + +/** + * Get the transmit policy set on bonded device for balance mode operation + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * Balance transmit policy on success, negative value otherwise. + */ +int +rte_eth_bond_xmit_policy_get(uint8_t bonded_port_id); + +/** + * Set the link monitoring frequency (in ms) for monitoring the link status of + * slave devices + * + * @param bonded_port_id Port ID of bonded device. 
+ * @param internal_ms Monitoring interval in milliseconds + * + * @return + * 0 on success, negative value otherwise. + */ + +int +rte_eth_bond_link_monitoring_set(uint8_t bonded_port_id, uint32_t internal_ms); + +/** + * Get the current link monitoring frequency (in ms) for monitoring of the link + * status of slave devices + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * Monitoring interval on success, negative value otherwise. + */ +int +rte_eth_bond_link_monitoring_get(uint8_t bonded_port_id); + + +/** + * Set the period in milliseconds for delaying the disabling of a bonded link + * when the link down status has been detected + * + * @param bonded_port_id Port ID of bonded device. + * @param delay_ms Delay period in milliseconds. + * + * @return + * 0 on success, negative value otherwise. + */ +int +rte_eth_bond_link_down_prop_delay_set(uint8_t bonded_port_id, uint32_t delay_ms); + +/** + * Get the period in milliseconds set for delaying the disabling of a bonded + * link when the link down status has been detected + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * Delay period on success, negative value otherwise. + */ +int +rte_eth_bond_link_down_prop_delay_get(uint8_t bonded_port_id); + +/** + * Set the period in milliseconds for delaying the enabling of a bonded link + * when the link up status has been detected + * + * @param bonded_port_id Port ID of bonded device. + * @param delay_ms Delay period in milliseconds. + * + * @return + * 0 on success, negative value otherwise. + */ +int +rte_eth_bond_link_up_prop_delay_set(uint8_t bonded_port_id, uint32_t delay_ms); + +/** + * Get the period in milliseconds set for delaying the enabling of a bonded + * link when the link up status has been detected + * + * @param bonded_port_id Port ID of bonded device. + * + * @return + * Delay period on success, negative value otherwise. 
+ */ +int +rte_eth_bond_link_up_prop_delay_get(uint8_t bonded_port_id); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad.c b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad.c new file mode 100644 index 00000000..7b863d6e --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad.c @@ -0,0 +1,1506 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stddef.h> +#include <string.h> +#include <stdbool.h> + +#include <rte_alarm.h> +#include <rte_malloc.h> +#include <rte_errno.h> +#include <rte_cycles.h> +#include <rte_compat.h> + +#include "rte_eth_bond_private.h" + +static void bond_mode_8023ad_ext_periodic_cb(void *arg); + +#ifdef RTE_LIBRTE_BOND_DEBUG_8023AD +#define MODE4_DEBUG(fmt, ...) RTE_LOG(DEBUG, PMD, "%6u [Port %u: %s] " fmt, \ + bond_dbg_get_time_diff_ms(), slave_id, \ + __func__, ##__VA_ARGS__) + +static uint64_t start_time; + +static unsigned +bond_dbg_get_time_diff_ms(void) +{ + uint64_t now; + + now = rte_rdtsc(); + if (start_time == 0) + start_time = now; + + return ((now - start_time) * 1000) / rte_get_tsc_hz(); +} + +static void +bond_print_lacp(struct lacpdu *l) +{ + char a_address[18]; + char p_address[18]; + char a_state[256] = { 0 }; + char p_state[256] = { 0 }; + + static const char * const state_labels[] = { + "ACT", "TIMEOUT", "AGG", "SYNC", "COL", "DIST", "DEF", "EXP" + }; + + int a_len = 0; + int p_len = 0; + uint8_t i; + uint8_t *addr; + + addr = l->actor.port_params.system.addr_bytes; + snprintf(a_address, sizeof(a_address), "%02X:%02X:%02X:%02X:%02X:%02X", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + + addr = l->partner.port_params.system.addr_bytes; + snprintf(p_address, sizeof(p_address), "%02X:%02X:%02X:%02X:%02X:%02X", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + + for (i = 0; i < 8; i++) { + if 
((l->actor.state >> i) & 1) { + a_len += snprintf(&a_state[a_len], RTE_DIM(a_state) - a_len, "%s ", + state_labels[i]); + } + + if ((l->partner.state >> i) & 1) { + p_len += snprintf(&p_state[p_len], RTE_DIM(p_state) - p_len, "%s ", + state_labels[i]); + } + } + + if (a_len && a_state[a_len-1] == ' ') + a_state[a_len-1] = '\0'; + + if (p_len && p_state[p_len-1] == ' ') + p_state[p_len-1] = '\0'; + + RTE_LOG(DEBUG, PMD, "LACP: {\n"\ + " subtype= %02X\n"\ + " ver_num=%02X\n"\ + " actor={ tlv=%02X, len=%02X\n"\ + " pri=%04X, system=%s, key=%04X, p_pri=%04X p_num=%04X\n"\ + " state={ %s }\n"\ + " }\n"\ + " partner={ tlv=%02X, len=%02X\n"\ + " pri=%04X, system=%s, key=%04X, p_pri=%04X p_num=%04X\n"\ + " state={ %s }\n"\ + " }\n"\ + " collector={info=%02X, length=%02X, max_delay=%04X\n, " \ + "type_term=%02X, terminator_length = %02X}\n",\ + l->subtype,\ + l->version_number,\ + l->actor.tlv_type_info,\ + l->actor.info_length,\ + l->actor.port_params.system_priority,\ + a_address,\ + l->actor.port_params.key,\ + l->actor.port_params.port_priority,\ + l->actor.port_params.port_number,\ + a_state,\ + l->partner.tlv_type_info,\ + l->partner.info_length,\ + l->partner.port_params.system_priority,\ + p_address,\ + l->partner.port_params.key,\ + l->partner.port_params.port_priority,\ + l->partner.port_params.port_number,\ + p_state,\ + l->tlv_type_collector_info,\ + l->collector_info_length,\ + l->collector_max_delay,\ + l->tlv_type_terminator,\ + l->terminator_length); + +} +#define BOND_PRINT_LACP(lacpdu) bond_print_lacp(lacpdu) +#else +#define BOND_PRINT_LACP(lacpdu) do { } while (0) +#define MODE4_DEBUG(fmt, ...) 
do { } while (0) +#endif + +static const struct ether_addr lacp_mac_addr = { + .addr_bytes = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 } +}; + +struct port mode_8023ad_ports[RTE_MAX_ETHPORTS]; + +static void +timer_cancel(uint64_t *timer) +{ + *timer = 0; +} + +static void +timer_set(uint64_t *timer, uint64_t timeout) +{ + *timer = rte_rdtsc() + timeout; +} + +/* Forces given timer to be in expired state. */ +static void +timer_force_expired(uint64_t *timer) +{ + *timer = rte_rdtsc(); +} + +static bool +timer_is_stopped(uint64_t *timer) +{ + return *timer == 0; +} + +static bool +timer_is_expired(uint64_t *timer) +{ + return *timer < rte_rdtsc(); +} + +/* Timer is in running state if it is not stopped nor expired */ +static bool +timer_is_running(uint64_t *timer) +{ + return !timer_is_stopped(timer) && !timer_is_expired(timer); +} + +static void +set_warning_flags(struct port *port, uint16_t flags) +{ + int retval; + uint16_t old; + uint16_t new_flag = 0; + + do { + old = port->warnings_to_show; + new_flag = old | flags; + retval = rte_atomic16_cmpset(&port->warnings_to_show, old, new_flag); + } while (unlikely(retval == 0)); +} + +static void +show_warnings(uint8_t slave_id) +{ + struct port *port = &mode_8023ad_ports[slave_id]; + uint8_t warnings; + + do { + warnings = port->warnings_to_show; + } while (rte_atomic16_cmpset(&port->warnings_to_show, warnings, 0) == 0); + + if (!warnings) + return; + + if (!timer_is_expired(&port->warning_timer)) + return; + + + timer_set(&port->warning_timer, BOND_8023AD_WARNINGS_PERIOD_MS * + rte_get_tsc_hz() / 1000); + + if (warnings & WRN_RX_QUEUE_FULL) { + RTE_LOG(DEBUG, PMD, + "Slave %u: failed to enqueue LACP packet into RX ring.\n" + "Receive and transmit functions must be invoked on bonded\n" + "interface at least 10 times per second or LACP will not\n" + "work correctly\n", slave_id); + } + + if (warnings & WRN_TX_QUEUE_FULL) { + RTE_LOG(DEBUG, PMD, + "Slave %u: failed to enqueue LACP packet into TX ring.\n" + "Receive and 
transmit functions must be invoked on bonded\n" + "interface at least 10 times per second or LACP will not\n" + "work correctly\n", slave_id); + } + + if (warnings & WRN_RX_MARKER_TO_FAST) + RTE_LOG(INFO, PMD, "Slave %u: marker to early - ignoring.\n", slave_id); + + if (warnings & WRN_UNKNOWN_SLOW_TYPE) { + RTE_LOG(INFO, PMD, + "Slave %u: ignoring unknown slow protocol frame type", slave_id); + } + + if (warnings & WRN_UNKNOWN_MARKER_TYPE) + RTE_LOG(INFO, PMD, "Slave %u: ignoring unknown marker type", slave_id); + + if (warnings & WRN_NOT_LACP_CAPABLE) + MODE4_DEBUG("Port %u is not LACP capable!\n", slave_id); +} + +static void +record_default(struct port *port) +{ + /* Record default parameters for partner. Partner admin parameters + * are not implemented so set them to arbitrary default (last known) and + * mark actor that parner is in defaulted state. */ + port->partner_state = STATE_LACP_ACTIVE; + ACTOR_STATE_SET(port, DEFAULTED); +} + +/** Function handles rx state machine. + * + * This function implements Receive State Machine from point 5.4.12 in + * 802.1AX documentation. It should be called periodically. + * + * @param lacpdu LACPDU received. + * @param port Port on which LACPDU was received. 
+ */ +static void +rx_machine(struct bond_dev_private *internals, uint8_t slave_id, + struct lacpdu *lacp) +{ + struct port *agg, *port = &mode_8023ad_ports[slave_id]; + uint64_t timeout; + + if (SM_FLAG(port, BEGIN)) { + /* Initialize stuff */ + MODE4_DEBUG("-> INITIALIZE\n"); + SM_FLAG_CLR(port, MOVED); + port->selected = UNSELECTED; + + record_default(port); + + ACTOR_STATE_CLR(port, EXPIRED); + timer_cancel(&port->current_while_timer); + + /* DISABLED: On initialization partner is out of sync */ + PARTNER_STATE_CLR(port, SYNCHRONIZATION); + + /* LACP DISABLED stuff if LACP not enabled on this port */ + if (!SM_FLAG(port, LACP_ENABLED)) + PARTNER_STATE_CLR(port, AGGREGATION); + else + PARTNER_STATE_SET(port, AGGREGATION); + } + + if (!SM_FLAG(port, LACP_ENABLED)) { + /* Update parameters only if state changed */ + if (!timer_is_stopped(&port->current_while_timer)) { + port->selected = UNSELECTED; + record_default(port); + PARTNER_STATE_CLR(port, AGGREGATION); + ACTOR_STATE_CLR(port, EXPIRED); + timer_cancel(&port->current_while_timer); + } + return; + } + + if (lacp) { + MODE4_DEBUG("LACP -> CURRENT\n"); + BOND_PRINT_LACP(lacp); + /* Update selected flag. If partner parameters are defaulted assume they + * are match. If not defaulted compare LACP actor with ports parner + * params. 
*/ + if (!ACTOR_STATE(port, DEFAULTED) && + (ACTOR_STATE(port, AGGREGATION) != PARTNER_STATE(port, AGGREGATION) + || memcmp(&port->partner, &lacp->actor.port_params, + sizeof(port->partner)) != 0)) { + MODE4_DEBUG("selected <- UNSELECTED\n"); + port->selected = UNSELECTED; + } + + /* Record this PDU actor params as partner params */ + memcpy(&port->partner, &lacp->actor.port_params, + sizeof(struct port_params)); + port->partner_state = lacp->actor.state; + + /* Partner parameters are not defaulted any more */ + ACTOR_STATE_CLR(port, DEFAULTED); + + /* If LACP partner params match this port actor params */ + agg = &mode_8023ad_ports[port->aggregator_port_id]; + bool match = port->actor.system_priority == + lacp->partner.port_params.system_priority && + is_same_ether_addr(&agg->actor.system, + &lacp->partner.port_params.system) && + port->actor.port_priority == + lacp->partner.port_params.port_priority && + port->actor.port_number == + lacp->partner.port_params.port_number; + + /* Update NTT if partners information are outdated (xored and masked + * bits are set)*/ + uint8_t state_mask = STATE_LACP_ACTIVE | STATE_LACP_SHORT_TIMEOUT | + STATE_SYNCHRONIZATION | STATE_AGGREGATION; + + if (((port->actor_state ^ lacp->partner.state) & state_mask) || + match == false) { + SM_FLAG_SET(port, NTT); + } + + /* If LACP partner params match this port actor params */ + if (match == true && ACTOR_STATE(port, AGGREGATION) == + PARTNER_STATE(port, AGGREGATION)) + PARTNER_STATE_SET(port, SYNCHRONIZATION); + else if (!PARTNER_STATE(port, AGGREGATION) && ACTOR_STATE(port, + AGGREGATION)) + PARTNER_STATE_SET(port, SYNCHRONIZATION); + else + PARTNER_STATE_CLR(port, SYNCHRONIZATION); + + if (ACTOR_STATE(port, LACP_SHORT_TIMEOUT)) + timeout = internals->mode4.short_timeout; + else + timeout = internals->mode4.long_timeout; + + timer_set(&port->current_while_timer, timeout); + ACTOR_STATE_CLR(port, EXPIRED); + return; /* No state change */ + } + + /* If CURRENT state timer is not running 
(stopped or expired) + * transit to EXPIRED state from DISABLED or CURRENT */ + if (!timer_is_running(&port->current_while_timer)) { + ACTOR_STATE_SET(port, EXPIRED); + PARTNER_STATE_CLR(port, SYNCHRONIZATION); + PARTNER_STATE_SET(port, LACP_SHORT_TIMEOUT); + timer_set(&port->current_while_timer, internals->mode4.short_timeout); + } +} + +/** + * Function handles periodic tx state machine. + * + * Function implements Periodic Transmission state machine from point 5.4.13 + * in 802.1AX documentation. It should be called periodically. + * + * @param port Port to handle state machine. + */ +static void +periodic_machine(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *port = &mode_8023ad_ports[slave_id]; + /* Calculate if either site is LACP enabled */ + uint64_t timeout; + uint8_t active = ACTOR_STATE(port, LACP_ACTIVE) || + PARTNER_STATE(port, LACP_ACTIVE); + + uint8_t is_partner_fast, was_partner_fast; + /* No periodic is on BEGIN, LACP DISABLE or when both sides are pasive */ + if (SM_FLAG(port, BEGIN) || !SM_FLAG(port, LACP_ENABLED) || !active) { + timer_cancel(&port->periodic_timer); + timer_force_expired(&port->tx_machine_timer); + SM_FLAG_CLR(port, PARTNER_SHORT_TIMEOUT); + + MODE4_DEBUG("-> NO_PERIODIC ( %s%s%s)\n", + SM_FLAG(port, BEGIN) ? "begind " : "", + SM_FLAG(port, LACP_ENABLED) ? "" : "LACP disabled ", + active ? "LACP active " : "LACP pasive "); + return; + } + + is_partner_fast = PARTNER_STATE(port, LACP_SHORT_TIMEOUT); + was_partner_fast = SM_FLAG(port, PARTNER_SHORT_TIMEOUT); + + /* If periodic timer is not started, transit from NO PERIODIC to FAST/SLOW. + * Other case: check if timer expire or partners settings changed. */ + if (!timer_is_stopped(&port->periodic_timer)) { + if (timer_is_expired(&port->periodic_timer)) { + SM_FLAG_SET(port, NTT); + } else if (is_partner_fast != was_partner_fast) { + /* Partners timeout was slow and now it is fast -> send LACP. 
+ * In other case (was fast and now it is slow) just switch + * timeout to slow without forcing send of LACP (because standard + * say so)*/ + if (!is_partner_fast) + SM_FLAG_SET(port, NTT); + } else + return; /* Nothing changed */ + } + + /* Handle state transition to FAST/SLOW LACP timeout */ + if (is_partner_fast) { + timeout = internals->mode4.fast_periodic_timeout; + SM_FLAG_SET(port, PARTNER_SHORT_TIMEOUT); + } else { + timeout = internals->mode4.slow_periodic_timeout; + SM_FLAG_CLR(port, PARTNER_SHORT_TIMEOUT); + } + + timer_set(&port->periodic_timer, timeout); +} + +/** + * Function handles mux state machine. + * + * Function implements Mux Machine from point 5.4.15 in 802.1AX documentation. + * It should be called periodically. + * + * @param port Port to handle state machine. + */ +static void +mux_machine(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *port = &mode_8023ad_ports[slave_id]; + + /* Save current state for later use */ + const uint8_t state_mask = STATE_SYNCHRONIZATION | STATE_DISTRIBUTING | + STATE_COLLECTING; + + /* Enter DETACHED state on BEGIN condition or from any other state if + * port was unselected */ + if (SM_FLAG(port, BEGIN) || + port->selected == UNSELECTED || (port->selected == STANDBY && + (port->actor_state & state_mask) != 0)) { + /* detach mux from aggregator */ + port->actor_state &= ~state_mask; + /* Set ntt to true if BEGIN condition or transition from any other state + * which is indicated that wait_while_timer was started */ + if (SM_FLAG(port, BEGIN) || + !timer_is_stopped(&port->wait_while_timer)) { + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("-> DETACHED\n"); + } + timer_cancel(&port->wait_while_timer); + } + + if (timer_is_stopped(&port->wait_while_timer)) { + if (port->selected == SELECTED || port->selected == STANDBY) { + timer_set(&port->wait_while_timer, + internals->mode4.aggregate_wait_timeout); + + MODE4_DEBUG("DETACHED -> WAITING\n"); + } + /* Waiting state entered */ + return; + } + + /* 
Transit next state if port is ready */ + if (!timer_is_expired(&port->wait_while_timer)) + return; + + if ((ACTOR_STATE(port, DISTRIBUTING) || ACTOR_STATE(port, COLLECTING)) && + !PARTNER_STATE(port, SYNCHRONIZATION)) { + /* If in COLLECTING or DISTRIBUTING state and partner becomes out of + * sync transit to ATACHED state. */ + ACTOR_STATE_CLR(port, DISTRIBUTING); + ACTOR_STATE_CLR(port, COLLECTING); + /* Clear actor sync to activate transit ATACHED in condition bellow */ + ACTOR_STATE_CLR(port, SYNCHRONIZATION); + MODE4_DEBUG("Out of sync -> ATTACHED\n"); + } + + if (!ACTOR_STATE(port, SYNCHRONIZATION)) { + /* attach mux to aggregator */ + RTE_ASSERT((port->actor_state & (STATE_COLLECTING | + STATE_DISTRIBUTING)) == 0); + + ACTOR_STATE_SET(port, SYNCHRONIZATION); + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("ATTACHED Entered\n"); + } else if (!ACTOR_STATE(port, COLLECTING)) { + /* Start collecting if in sync */ + if (PARTNER_STATE(port, SYNCHRONIZATION)) { + MODE4_DEBUG("ATTACHED -> COLLECTING\n"); + ACTOR_STATE_SET(port, COLLECTING); + SM_FLAG_SET(port, NTT); + } + } else if (ACTOR_STATE(port, COLLECTING)) { + /* Check if partner is in COLLECTING state. If so this port can + * distribute frames to it */ + if (!ACTOR_STATE(port, DISTRIBUTING)) { + if (PARTNER_STATE(port, COLLECTING)) { + /* Enable DISTRIBUTING if partner is collecting */ + ACTOR_STATE_SET(port, DISTRIBUTING); + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("COLLECTING -> DISTRIBUTING\n"); + RTE_LOG(INFO, PMD, + "Bond %u: slave id %u distributing started.\n", + internals->port_id, slave_id); + } + } else { + if (!PARTNER_STATE(port, COLLECTING)) { + /* Disable DISTRIBUTING (enter COLLECTING state) if partner + * is not collecting */ + ACTOR_STATE_CLR(port, DISTRIBUTING); + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("DISTRIBUTING -> COLLECTING\n"); + RTE_LOG(INFO, PMD, + "Bond %u: slave id %u distributing stopped.\n", + internals->port_id, slave_id); + } + } + } +} + +/** + * Function handles transmit state machine. 
+ * + * Function implements Transmit Machine from point 5.4.16 in 802.1AX + * documentation. + * + * @param port + */ +static void +tx_machine(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *agg, *port = &mode_8023ad_ports[slave_id]; + + struct rte_mbuf *lacp_pkt = NULL; + struct lacpdu_header *hdr; + struct lacpdu *lacpdu; + + /* If periodic timer is not running periodic machine is in NO PERIODIC and + * according to 802.3ax standard tx machine should not transmit any frames + * and set ntt to false. */ + if (timer_is_stopped(&port->periodic_timer)) + SM_FLAG_CLR(port, NTT); + + if (!SM_FLAG(port, NTT)) + return; + + if (!timer_is_expired(&port->tx_machine_timer)) + return; + + lacp_pkt = rte_pktmbuf_alloc(port->mbuf_pool); + if (lacp_pkt == NULL) { + RTE_LOG(ERR, PMD, "Failed to allocate LACP packet from pool\n"); + return; + } + + lacp_pkt->data_len = sizeof(*hdr); + lacp_pkt->pkt_len = sizeof(*hdr); + + hdr = rte_pktmbuf_mtod(lacp_pkt, struct lacpdu_header *); + + /* Source and destination MAC */ + ether_addr_copy(&lacp_mac_addr, &hdr->eth_hdr.d_addr); + rte_eth_macaddr_get(slave_id, &hdr->eth_hdr.s_addr); + hdr->eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_SLOW); + + lacpdu = &hdr->lacpdu; + memset(lacpdu, 0, sizeof(*lacpdu)); + + /* Initialize LACP part */ + lacpdu->subtype = SLOW_SUBTYPE_LACP; + lacpdu->version_number = 1; + + /* ACTOR */ + lacpdu->actor.tlv_type_info = TLV_TYPE_ACTOR_INFORMATION; + lacpdu->actor.info_length = sizeof(struct lacpdu_actor_partner_params); + memcpy(&hdr->lacpdu.actor.port_params, &port->actor, + sizeof(port->actor)); + agg = &mode_8023ad_ports[port->aggregator_port_id]; + ether_addr_copy(&agg->actor.system, &hdr->lacpdu.actor.port_params.system); + lacpdu->actor.state = port->actor_state; + + /* PARTNER */ + lacpdu->partner.tlv_type_info = TLV_TYPE_PARTNER_INFORMATION; + lacpdu->partner.info_length = sizeof(struct lacpdu_actor_partner_params); + memcpy(&lacpdu->partner.port_params, &port->partner, + 
sizeof(struct port_params)); + lacpdu->partner.state = port->partner_state; + + /* Other fields */ + lacpdu->tlv_type_collector_info = TLV_TYPE_COLLECTOR_INFORMATION; + lacpdu->collector_info_length = 0x10; + lacpdu->collector_max_delay = 0; + + lacpdu->tlv_type_terminator = TLV_TYPE_TERMINATOR_INFORMATION; + lacpdu->terminator_length = 0; + + if (rte_ring_enqueue(port->tx_ring, lacp_pkt) == -ENOBUFS) { + /* If TX ring full, drop packet and free message. Retransmission + * will happen in next function call. */ + rte_pktmbuf_free(lacp_pkt); + set_warning_flags(port, WRN_TX_QUEUE_FULL); + return; + } + + MODE4_DEBUG("sending LACP frame\n"); + BOND_PRINT_LACP(lacpdu); + + timer_set(&port->tx_machine_timer, internals->mode4.tx_period_timeout); + SM_FLAG_CLR(port, NTT); +} + +/** + * Function assigns port to aggregator. + * + * @param bond_dev_private Pointer to bond_dev_private structure. + * @param port_pos Port to assign. + */ +static void +selection_logic(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *agg, *port; + uint8_t slaves_count, new_agg_id, i; + uint8_t *slaves; + + slaves = internals->active_slaves; + slaves_count = internals->active_slave_count; + port = &mode_8023ad_ports[slave_id]; + + /* Search for aggregator suitable for this port */ + for (i = 0; i < slaves_count; ++i) { + agg = &mode_8023ad_ports[slaves[i]]; + /* Skip ports that are not aggreagators */ + if (agg->aggregator_port_id != slaves[i]) + continue; + + /* Actors system ID is not checked since all slave device have the same + * ID (MAC address). 
*/ + if ((agg->actor.key == port->actor.key && + agg->partner.system_priority == port->partner.system_priority && + is_same_ether_addr(&agg->partner.system, &port->partner.system) == 1 + && (agg->partner.key == port->partner.key)) && + is_zero_ether_addr(&port->partner.system) != 1 && + (agg->actor.key & + rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY)) != 0) { + + break; + } + } + + /* By default, port uses it self as agregator */ + if (i == slaves_count) + new_agg_id = slave_id; + else + new_agg_id = slaves[i]; + + if (new_agg_id != port->aggregator_port_id) { + port->aggregator_port_id = new_agg_id; + + MODE4_DEBUG("-> SELECTED: ID=%3u\n" + "\t%s aggregator ID=%3u\n", + port->aggregator_port_id, + port->aggregator_port_id == slave_id ? + "aggregator not found, using default" : "aggregator found", + port->aggregator_port_id); + } + + port->selected = SELECTED; +} + +/* Function maps DPDK speed to bonding speed stored in key field */ +static uint16_t +link_speed_key(uint16_t speed) { + uint16_t key_speed; + + switch (speed) { + case ETH_SPEED_NUM_NONE: + key_speed = 0x00; + break; + case ETH_SPEED_NUM_10M: + key_speed = BOND_LINK_SPEED_KEY_10M; + break; + case ETH_SPEED_NUM_100M: + key_speed = BOND_LINK_SPEED_KEY_100M; + break; + case ETH_SPEED_NUM_1G: + key_speed = BOND_LINK_SPEED_KEY_1000M; + break; + case ETH_SPEED_NUM_10G: + key_speed = BOND_LINK_SPEED_KEY_10G; + break; + case ETH_SPEED_NUM_20G: + key_speed = BOND_LINK_SPEED_KEY_20G; + break; + case ETH_SPEED_NUM_40G: + key_speed = BOND_LINK_SPEED_KEY_40G; + break; + default: + /* Unknown speed*/ + key_speed = 0xFFFF; + } + + return key_speed; +} + +static void +bond_mode_8023ad_periodic_cb(void *arg) +{ + struct rte_eth_dev *bond_dev = arg; + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct port *port; + struct rte_eth_link link_info; + struct ether_addr slave_addr; + + void *pkt = NULL; + uint8_t i, slave_id; + + + /* Update link status on each port */ + for (i = 0; i < 
internals->active_slave_count; i++) { + uint16_t key; + + slave_id = internals->active_slaves[i]; + rte_eth_link_get(slave_id, &link_info); + rte_eth_macaddr_get(slave_id, &slave_addr); + + if (link_info.link_status != 0) { + key = link_speed_key(link_info.link_speed) << 1; + if (link_info.link_duplex == ETH_LINK_FULL_DUPLEX) + key |= BOND_LINK_FULL_DUPLEX_KEY; + } else + key = 0; + + port = &mode_8023ad_ports[slave_id]; + + key = rte_cpu_to_be_16(key); + if (key != port->actor.key) { + if (!(key & rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY))) + set_warning_flags(port, WRN_NOT_LACP_CAPABLE); + + port->actor.key = key; + SM_FLAG_SET(port, NTT); + } + + if (!is_same_ether_addr(&port->actor.system, &slave_addr)) { + ether_addr_copy(&slave_addr, &port->actor.system); + if (port->aggregator_port_id == slave_id) + SM_FLAG_SET(port, NTT); + } + } + + for (i = 0; i < internals->active_slave_count; i++) { + slave_id = internals->active_slaves[i]; + port = &mode_8023ad_ports[slave_id]; + + if ((port->actor.key & + rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY)) == 0) { + + SM_FLAG_SET(port, BEGIN); + + /* LACP is disabled on half duples or link is down */ + if (SM_FLAG(port, LACP_ENABLED)) { + /* If port was enabled set it to BEGIN state */ + SM_FLAG_CLR(port, LACP_ENABLED); + ACTOR_STATE_CLR(port, DISTRIBUTING); + ACTOR_STATE_CLR(port, COLLECTING); + } + + /* Skip this port processing */ + continue; + } + + SM_FLAG_SET(port, LACP_ENABLED); + + /* Find LACP packet to this port. 
Do not check subtype, it is done in + * function that queued packet */ + if (rte_ring_dequeue(port->rx_ring, &pkt) == 0) { + struct rte_mbuf *lacp_pkt = pkt; + struct lacpdu_header *lacp; + + lacp = rte_pktmbuf_mtod(lacp_pkt, struct lacpdu_header *); + RTE_ASSERT(lacp->lacpdu.subtype == SLOW_SUBTYPE_LACP); + + /* This is LACP frame so pass it to rx_machine */ + rx_machine(internals, slave_id, &lacp->lacpdu); + rte_pktmbuf_free(lacp_pkt); + } else + rx_machine(internals, slave_id, NULL); + + periodic_machine(internals, slave_id); + mux_machine(internals, slave_id); + tx_machine(internals, slave_id); + selection_logic(internals, slave_id); + + SM_FLAG_CLR(port, BEGIN); + show_warnings(slave_id); + } + + rte_eal_alarm_set(internals->mode4.update_timeout_us, + bond_mode_8023ad_periodic_cb, arg); +} + +void +bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev, uint8_t slave_id) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + + struct port *port = &mode_8023ad_ports[slave_id]; + struct port_params initial = { + .system = { { 0 } }, + .system_priority = rte_cpu_to_be_16(0xFFFF), + .key = rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY), + .port_priority = rte_cpu_to_be_16(0x00FF), + .port_number = 0, + }; + + char mem_name[RTE_ETH_NAME_MAX_LEN]; + int socket_id; + unsigned element_size; + uint32_t total_tx_desc; + struct bond_tx_queue *bd_tx_q; + uint16_t q_id; + + /* Given slave mus not be in active list */ + RTE_ASSERT(find_slave_by_id(internals->active_slaves, + internals->active_slave_count, slave_id) == internals->active_slave_count); + RTE_SET_USED(internals); /* used only for assert when enabled */ + + memcpy(&port->actor, &initial, sizeof(struct port_params)); + /* Standard requires that port ID must be grater than 0. 
+ * Add 1 do get corresponding port_number */ + port->actor.port_number = rte_cpu_to_be_16((uint16_t)slave_id + 1); + + memcpy(&port->partner, &initial, sizeof(struct port_params)); + + /* default states */ + port->actor_state = STATE_AGGREGATION | STATE_LACP_ACTIVE | STATE_DEFAULTED; + port->partner_state = STATE_LACP_ACTIVE; + port->sm_flags = SM_FLAGS_BEGIN; + + /* use this port as agregator */ + port->aggregator_port_id = slave_id; + rte_eth_promiscuous_enable(slave_id); + + timer_cancel(&port->warning_timer); + + if (port->mbuf_pool != NULL) + return; + + RTE_ASSERT(port->rx_ring == NULL); + RTE_ASSERT(port->tx_ring == NULL); + socket_id = rte_eth_devices[slave_id].data->numa_node; + + element_size = sizeof(struct slow_protocol_frame) + + RTE_PKTMBUF_HEADROOM; + + /* The size of the mempool should be at least: + * the sum of the TX descriptors + BOND_MODE_8023AX_SLAVE_TX_PKTS */ + total_tx_desc = BOND_MODE_8023AX_SLAVE_TX_PKTS; + for (q_id = 0; q_id < bond_dev->data->nb_tx_queues; q_id++) { + bd_tx_q = (struct bond_tx_queue*)bond_dev->data->tx_queues[q_id]; + total_tx_desc += bd_tx_q->nb_tx_desc; + } + + snprintf(mem_name, RTE_DIM(mem_name), "slave_port%u_pool", slave_id); + port->mbuf_pool = rte_pktmbuf_pool_create(mem_name, total_tx_desc, + RTE_MEMPOOL_CACHE_MAX_SIZE >= 32 ? + 32 : RTE_MEMPOOL_CACHE_MAX_SIZE, + 0, element_size, socket_id); + + /* Any memory allocation failure in initalization is critical because + * resources can't be free, so reinitialization is impossible. 
*/ + if (port->mbuf_pool == NULL) { + rte_panic("Slave %u: Failed to create memory pool '%s': %s\n", + slave_id, mem_name, rte_strerror(rte_errno)); + } + + snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id); + port->rx_ring = rte_ring_create(mem_name, + rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0); + + if (port->rx_ring == NULL) { + rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id, + mem_name, rte_strerror(rte_errno)); + } + + /* TX ring is at least one pkt longer to make room for marker packet. */ + snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_tx", slave_id); + port->tx_ring = rte_ring_create(mem_name, + rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0); + + if (port->tx_ring == NULL) { + rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id, + mem_name, rte_strerror(rte_errno)); + } +} + +int +bond_mode_8023ad_deactivate_slave(struct rte_eth_dev *bond_dev, + uint8_t slave_id) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + void *pkt = NULL; + struct port *port; + uint8_t i; + + /* Given slave must be in active list */ + RTE_ASSERT(find_slave_by_id(internals->active_slaves, + internals->active_slave_count, slave_id) < internals->active_slave_count); + + /* Exclude slave from transmit policy. If this slave is an aggregator + * make all aggregated slaves unselected to force selection logic + * to select suitable aggregator for this port. 
*/ + for (i = 0; i < internals->active_slave_count; i++) { + port = &mode_8023ad_ports[internals->active_slaves[i]]; + if (port->aggregator_port_id != slave_id) + continue; + + port->selected = UNSELECTED; + + /* Use default aggregator */ + port->aggregator_port_id = internals->active_slaves[i]; + } + + port = &mode_8023ad_ports[slave_id]; + port->selected = UNSELECTED; + port->actor_state &= ~(STATE_SYNCHRONIZATION | STATE_DISTRIBUTING | + STATE_COLLECTING); + + while (rte_ring_dequeue(port->rx_ring, &pkt) == 0) + rte_pktmbuf_free((struct rte_mbuf *)pkt); + + while (rte_ring_dequeue(port->tx_ring, &pkt) == 0) + rte_pktmbuf_free((struct rte_mbuf *)pkt); + return 0; +} + +void +bond_mode_8023ad_mac_address_update(struct rte_eth_dev *bond_dev) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct ether_addr slave_addr; + struct port *slave, *agg_slave; + uint8_t slave_id, i, j; + + bond_mode_8023ad_stop(bond_dev); + + for (i = 0; i < internals->active_slave_count; i++) { + slave_id = internals->active_slaves[i]; + slave = &mode_8023ad_ports[slave_id]; + rte_eth_macaddr_get(slave_id, &slave_addr); + + if (is_same_ether_addr(&slave_addr, &slave->actor.system)) + continue; + + ether_addr_copy(&slave_addr, &slave->actor.system); + /* Do nothing if this port is not an aggregator. In other case + * Set NTT flag on every port that use this aggregator. 
*/ + if (slave->aggregator_port_id != slave_id) + continue; + + for (j = 0; j < internals->active_slave_count; j++) { + agg_slave = &mode_8023ad_ports[internals->active_slaves[j]]; + if (agg_slave->aggregator_port_id == slave_id) + SM_FLAG_SET(agg_slave, NTT); + } + } + + if (bond_dev->data->dev_started) + bond_mode_8023ad_start(bond_dev); +} + +static void +bond_mode_8023ad_conf_get(struct rte_eth_dev *dev, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct bond_dev_private *internals = dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + uint64_t ms_ticks = rte_get_tsc_hz() / 1000; + + conf->fast_periodic_ms = mode4->fast_periodic_timeout / ms_ticks; + conf->slow_periodic_ms = mode4->slow_periodic_timeout / ms_ticks; + conf->short_timeout_ms = mode4->short_timeout / ms_ticks; + conf->long_timeout_ms = mode4->long_timeout / ms_ticks; + conf->aggregate_wait_timeout_ms = mode4->aggregate_wait_timeout / ms_ticks; + conf->tx_period_ms = mode4->tx_period_timeout / ms_ticks; + conf->update_timeout_ms = mode4->update_timeout_us / 1000; + conf->rx_marker_period_ms = mode4->rx_marker_timeout / ms_ticks; +} + +static void +bond_mode_8023ad_conf_get_v1607(struct rte_eth_dev *dev, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct bond_dev_private *internals = dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + + bond_mode_8023ad_conf_get(dev, conf); + conf->slowrx_cb = mode4->slowrx_cb; +} + +static void +bond_mode_8023ad_conf_get_default(struct rte_eth_bond_8023ad_conf *conf) +{ + conf->fast_periodic_ms = BOND_8023AD_FAST_PERIODIC_MS; + conf->slow_periodic_ms = BOND_8023AD_SLOW_PERIODIC_MS; + conf->short_timeout_ms = BOND_8023AD_SHORT_TIMEOUT_MS; + conf->long_timeout_ms = BOND_8023AD_LONG_TIMEOUT_MS; + conf->aggregate_wait_timeout_ms = BOND_8023AD_AGGREGATE_WAIT_TIMEOUT_MS; + conf->tx_period_ms = BOND_8023AD_TX_MACHINE_PERIOD_MS; + conf->rx_marker_period_ms = BOND_8023AD_RX_MARKER_PERIOD_MS; + 
conf->update_timeout_ms = BOND_MODE_8023AX_UPDATE_TIMEOUT_MS; + conf->slowrx_cb = NULL; +} + +static void +bond_mode_8023ad_conf_assign(struct mode8023ad_private *mode4, + struct rte_eth_bond_8023ad_conf *conf) +{ + uint64_t ms_ticks = rte_get_tsc_hz() / 1000; + + mode4->fast_periodic_timeout = conf->fast_periodic_ms * ms_ticks; + mode4->slow_periodic_timeout = conf->slow_periodic_ms * ms_ticks; + mode4->short_timeout = conf->short_timeout_ms * ms_ticks; + mode4->long_timeout = conf->long_timeout_ms * ms_ticks; + mode4->aggregate_wait_timeout = conf->aggregate_wait_timeout_ms * ms_ticks; + mode4->tx_period_timeout = conf->tx_period_ms * ms_ticks; + mode4->rx_marker_timeout = conf->rx_marker_period_ms * ms_ticks; + mode4->update_timeout_us = conf->update_timeout_ms * 1000; +} + +static void +bond_mode_8023ad_setup_v20(struct rte_eth_dev *dev, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct rte_eth_bond_8023ad_conf def_conf; + struct bond_dev_private *internals = dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + + if (conf == NULL) { + conf = &def_conf; + bond_mode_8023ad_conf_get_default(conf); + } + + bond_mode_8023ad_stop(dev); + bond_mode_8023ad_conf_assign(mode4, conf); + + if (dev->data->dev_started) + bond_mode_8023ad_start(dev); +} + + +void +bond_mode_8023ad_setup(struct rte_eth_dev *dev, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct rte_eth_bond_8023ad_conf def_conf; + struct bond_dev_private *internals = dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + + if (conf == NULL) { + conf = &def_conf; + bond_mode_8023ad_conf_get_default(conf); + } + + bond_mode_8023ad_stop(dev); + bond_mode_8023ad_conf_assign(mode4, conf); + mode4->slowrx_cb = conf->slowrx_cb; + + if (dev->data->dev_started) + bond_mode_8023ad_start(dev); +} + +int +bond_mode_8023ad_enable(struct rte_eth_dev *bond_dev) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + uint8_t i; + + for (i = 0; i < 
internals->active_slave_count; i++) + bond_mode_8023ad_activate_slave(bond_dev, internals->active_slaves[i]); /* pass the slave port id, not its index in active_slaves[] */ + + return 0; +} + +int +bond_mode_8023ad_start(struct rte_eth_dev *bond_dev) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + static const uint64_t us = BOND_MODE_8023AX_UPDATE_TIMEOUT_MS * 1000; + + if (mode4->slowrx_cb) + return rte_eal_alarm_set(us, &bond_mode_8023ad_ext_periodic_cb, + bond_dev); + + return rte_eal_alarm_set(us, &bond_mode_8023ad_periodic_cb, bond_dev); +} + +void +bond_mode_8023ad_stop(struct rte_eth_dev *bond_dev) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + + if (mode4->slowrx_cb) { + rte_eal_alarm_cancel(&bond_mode_8023ad_ext_periodic_cb, + bond_dev); + return; + } + rte_eal_alarm_cancel(&bond_mode_8023ad_periodic_cb, bond_dev); +} + +void +bond_mode_8023ad_handle_slow_pkt(struct bond_dev_private *internals, + uint8_t slave_id, struct rte_mbuf *pkt) +{ + struct mode8023ad_private *mode4 = &internals->mode4; + struct port *port = &mode_8023ad_ports[slave_id]; + struct marker_header *m_hdr; + uint64_t marker_timer, old_marker_timer; + int retval; + uint8_t wrn, subtype; + /* If packet is a marker, we send response now by reusing given packet + * and update only source MAC, destination MAC is multicast so don't + * update it. Other frames will be handled later by state machines */ + subtype = rte_pktmbuf_mtod(pkt, + struct slow_protocol_frame *)->slow_protocol.subtype; + + if (subtype == SLOW_SUBTYPE_MARKER) { + m_hdr = rte_pktmbuf_mtod(pkt, struct marker_header *); + + if (unlikely(m_hdr->marker.tlv_type_marker != MARKER_TLV_TYPE_INFO)) { + wrn = WRN_UNKNOWN_MARKER_TYPE; + goto free_out; + } + + /* Setup marker timer. Do it in loop in case concurrent access.
*/ + do { + old_marker_timer = port->rx_marker_timer; + if (!timer_is_expired(&old_marker_timer)) { + wrn = WRN_RX_MARKER_TO_FAST; + goto free_out; + } + + timer_set(&marker_timer, mode4->rx_marker_timeout); + retval = rte_atomic64_cmpset(&port->rx_marker_timer, + old_marker_timer, marker_timer); + } while (unlikely(retval == 0)); + + m_hdr->marker.tlv_type_marker = MARKER_TLV_TYPE_RESP; + rte_eth_macaddr_get(slave_id, &m_hdr->eth_hdr.s_addr); + + if (unlikely(rte_ring_enqueue(port->tx_ring, pkt) == -ENOBUFS)) { + /* reset timer */ + port->rx_marker_timer = 0; + wrn = WRN_TX_QUEUE_FULL; + goto free_out; + } + } else if (likely(subtype == SLOW_SUBTYPE_LACP)) { + if (unlikely(rte_ring_enqueue(port->rx_ring, pkt) == -ENOBUFS)) { + /* If RX fing full free lacpdu message and drop packet */ + wrn = WRN_RX_QUEUE_FULL; + goto free_out; + } + } else { + wrn = WRN_UNKNOWN_SLOW_TYPE; + goto free_out; + } + + return; + +free_out: + set_warning_flags(port, wrn); + rte_pktmbuf_free(pkt); +} + +int +rte_eth_bond_8023ad_conf_get_v20(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct rte_eth_dev *bond_dev; + + if (valid_bonded_port_id(port_id) != 0) + return -EINVAL; + + if (conf == NULL) + return -EINVAL; + + bond_dev = &rte_eth_devices[port_id]; + bond_mode_8023ad_conf_get(bond_dev, conf); + return 0; +} +VERSION_SYMBOL(rte_eth_bond_8023ad_conf_get, _v20, 2.0); + +int +rte_eth_bond_8023ad_conf_get_v1607(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct rte_eth_dev *bond_dev; + + if (valid_bonded_port_id(port_id) != 0) + return -EINVAL; + + if (conf == NULL) + return -EINVAL; + + bond_dev = &rte_eth_devices[port_id]; + bond_mode_8023ad_conf_get_v1607(bond_dev, conf); + return 0; +} +BIND_DEFAULT_SYMBOL(rte_eth_bond_8023ad_conf_get, _v1607, 16.07); +MAP_STATIC_SYMBOL(int rte_eth_bond_8023ad_conf_get(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf), + rte_eth_bond_8023ad_conf_get_v1607); + +static int 
+bond_8023ad_setup_validate(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf) +{ + if (valid_bonded_port_id(port_id) != 0) + return -EINVAL; + + if (conf != NULL) { + /* Basic sanity check */ + if (conf->slow_periodic_ms == 0 || + conf->fast_periodic_ms >= conf->slow_periodic_ms || + conf->long_timeout_ms == 0 || + conf->short_timeout_ms >= conf->long_timeout_ms || + conf->aggregate_wait_timeout_ms == 0 || + conf->tx_period_ms == 0 || + conf->rx_marker_period_ms == 0 || + conf->update_timeout_ms == 0) { + RTE_LOG(ERR, PMD, "given mode 4 configuration is invalid\n"); + return -EINVAL; + } + } + + return 0; +} + +int +rte_eth_bond_8023ad_setup_v20(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct rte_eth_dev *bond_dev; + int err; + + err = bond_8023ad_setup_validate(port_id, conf); + if (err != 0) + return err; + + bond_dev = &rte_eth_devices[port_id]; + bond_mode_8023ad_setup_v20(bond_dev, conf); + + return 0; +} +VERSION_SYMBOL(rte_eth_bond_8023ad_setup, _v20, 2.0); + +int +rte_eth_bond_8023ad_setup_v1607(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf) +{ + struct rte_eth_dev *bond_dev; + int err; + + err = bond_8023ad_setup_validate(port_id, conf); + if (err != 0) + return err; + + bond_dev = &rte_eth_devices[port_id]; + bond_mode_8023ad_setup(bond_dev, conf); + + return 0; +} +BIND_DEFAULT_SYMBOL(rte_eth_bond_8023ad_setup, _v1607, 16.07); +MAP_STATIC_SYMBOL(int rte_eth_bond_8023ad_setup(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf), + rte_eth_bond_8023ad_setup_v1607); + +int +rte_eth_bond_8023ad_slave_info(uint8_t port_id, uint8_t slave_id, + struct rte_eth_bond_8023ad_slave_info *info) +{ + struct rte_eth_dev *bond_dev; + struct bond_dev_private *internals; + struct port *port; + + if (info == NULL || valid_bonded_port_id(port_id) != 0 || + rte_eth_bond_mode_get(port_id) != BONDING_MODE_8023AD) + return -EINVAL; + + bond_dev = &rte_eth_devices[port_id]; + + internals = bond_dev->data->dev_private; + if 
(find_slave_by_id(internals->active_slaves, + internals->active_slave_count, slave_id) == + internals->active_slave_count) + return -EINVAL; + + port = &mode_8023ad_ports[slave_id]; + info->selected = port->selected; + + info->actor_state = port->actor_state; + rte_memcpy(&info->actor, &port->actor, sizeof(port->actor)); + + info->partner_state = port->partner_state; + rte_memcpy(&info->partner, &port->partner, sizeof(port->partner)); + + info->agg_port_id = port->aggregator_port_id; + return 0; +} + +static int +bond_8023ad_ext_validate(uint8_t port_id, uint8_t slave_id) +{ + struct rte_eth_dev *bond_dev; + struct bond_dev_private *internals; + struct mode8023ad_private *mode4; + + if (rte_eth_bond_mode_get(port_id) != BONDING_MODE_8023AD) + return -EINVAL; + + bond_dev = &rte_eth_devices[port_id]; + + if (!bond_dev->data->dev_started) + return -EINVAL; + + internals = bond_dev->data->dev_private; + if (find_slave_by_id(internals->active_slaves, + internals->active_slave_count, slave_id) == + internals->active_slave_count) + return -EINVAL; + + mode4 = &internals->mode4; + if (mode4->slowrx_cb == NULL) + return -EINVAL; + + return 0; +} + +int +rte_eth_bond_8023ad_ext_collect(uint8_t port_id, uint8_t slave_id, int enabled) +{ + struct port *port; + int res; + + res = bond_8023ad_ext_validate(port_id, slave_id); + if (res != 0) + return res; + + port = &mode_8023ad_ports[slave_id]; + + if (enabled) + ACTOR_STATE_SET(port, COLLECTING); + else + ACTOR_STATE_CLR(port, COLLECTING); + + return 0; +} + +int +rte_eth_bond_8023ad_ext_distrib(uint8_t port_id, uint8_t slave_id, int enabled) +{ + struct port *port; + int res; + + res = bond_8023ad_ext_validate(port_id, slave_id); + if (res != 0) + return res; + + port = &mode_8023ad_ports[slave_id]; + + if (enabled) + ACTOR_STATE_SET(port, DISTRIBUTING); + else + ACTOR_STATE_CLR(port, DISTRIBUTING); + + return 0; +} + +int +rte_eth_bond_8023ad_ext_distrib_get(uint8_t port_id, uint8_t slave_id) +{ + struct port *port; + int 
err; + + err = bond_8023ad_ext_validate(port_id, slave_id); + if (err != 0) + return err; + + port = &mode_8023ad_ports[slave_id]; + return ACTOR_STATE(port, DISTRIBUTING); +} + +int +rte_eth_bond_8023ad_ext_collect_get(uint8_t port_id, uint8_t slave_id) +{ + struct port *port; + int err; + + err = bond_8023ad_ext_validate(port_id, slave_id); + if (err != 0) + return err; + + port = &mode_8023ad_ports[slave_id]; + return ACTOR_STATE(port, COLLECTING); +} + +int +rte_eth_bond_8023ad_ext_slowtx(uint8_t port_id, uint8_t slave_id, + struct rte_mbuf *lacp_pkt) +{ + struct port *port; + int res; + + res = bond_8023ad_ext_validate(port_id, slave_id); + if (res != 0) + return res; + + port = &mode_8023ad_ports[slave_id]; + + if (rte_pktmbuf_pkt_len(lacp_pkt) < sizeof(struct lacpdu_header)) + return -EINVAL; + + struct lacpdu_header *lacp; + + /* only enqueue LACPDUs */ + lacp = rte_pktmbuf_mtod(lacp_pkt, struct lacpdu_header *); + if (lacp->lacpdu.subtype != SLOW_SUBTYPE_LACP) + return -EINVAL; + + MODE4_DEBUG("sending LACP frame\n"); + + return rte_ring_enqueue(port->tx_ring, lacp_pkt); +} + +static void +bond_mode_8023ad_ext_periodic_cb(void *arg) +{ + struct rte_eth_dev *bond_dev = arg; + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct mode8023ad_private *mode4 = &internals->mode4; + struct port *port; + void *pkt = NULL; + uint16_t i, slave_id; + + for (i = 0; i < internals->active_slave_count; i++) { + slave_id = internals->active_slaves[i]; + port = &mode_8023ad_ports[slave_id]; + + if (rte_ring_dequeue(port->rx_ring, &pkt) == 0) { + struct rte_mbuf *lacp_pkt = pkt; + struct lacpdu_header *lacp; + + lacp = rte_pktmbuf_mtod(lacp_pkt, + struct lacpdu_header *); + RTE_VERIFY(lacp->lacpdu.subtype == SLOW_SUBTYPE_LACP); + + /* This is LACP frame so pass it to rx callback. + * Callback is responsible for freeing mbuf. 
+ */ + mode4->slowrx_cb(slave_id, lacp_pkt); + } + } + + rte_eal_alarm_set(internals->mode4.update_timeout_us, + bond_mode_8023ad_ext_periodic_cb, arg); +} diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad.h b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad.h new file mode 100644 index 00000000..6b8ff575 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad.h @@ -0,0 +1,305 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RTE_ETH_BOND_8023AD_H_ +#define RTE_ETH_BOND_8023AD_H_ + +#include <rte_ether.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Actor/partner states + */ +#define STATE_LACP_ACTIVE 0x01 +#define STATE_LACP_SHORT_TIMEOUT 0x02 +#define STATE_AGGREGATION 0x04 +#define STATE_SYNCHRONIZATION 0x08 +#define STATE_COLLECTING 0x10 +#define STATE_DISTRIBUTING 0x20 +/** Partners parameters are defaulted */ +#define STATE_DEFAULTED 0x40 +#define STATE_EXPIRED 0x80 + +#define TLV_TYPE_ACTOR_INFORMATION 0x01 +#define TLV_TYPE_PARTNER_INFORMATION 0x02 +#define TLV_TYPE_COLLECTOR_INFORMATION 0x03 +#define TLV_TYPE_TERMINATOR_INFORMATION 0x00 + +#define SLOW_SUBTYPE_LACP 0x01 +#define SLOW_SUBTYPE_MARKER 0x02 + +#define MARKER_TLV_TYPE_INFO 0x01 +#define MARKER_TLV_TYPE_RESP 0x02 + +typedef void (*rte_eth_bond_8023ad_ext_slowrx_fn)(uint8_t slave_id, + struct rte_mbuf *lacp_pkt); + +enum rte_bond_8023ad_selection { + UNSELECTED, + STANDBY, + SELECTED +}; + +/** Generic slow protocol structure */ +struct slow_protocol { + uint8_t subtype; + uint8_t reserved_119[119]; +} __attribute__((__packed__)); + +/** Generic slow protocol frame type structure */ +struct slow_protocol_frame { + struct ether_hdr eth_hdr; + struct slow_protocol slow_protocol; +} __attribute__((__packed__)); + +struct port_params { + uint16_t system_priority; + /**< System priority (unused in current implementation) */ + struct ether_addr system; + /**< System 
ID - Slave MAC address, same as bonding MAC address */ + uint16_t key; + /**< Speed information (implementation dependednt) and duplex. */ + uint16_t port_priority; + /**< Priority of this (unused in current implementation) */ + uint16_t port_number; + /**< Port number. It corresponds to slave port id. */ +} __attribute__((__packed__)); + +struct lacpdu_actor_partner_params { + uint8_t tlv_type_info; + uint8_t info_length; + struct port_params port_params; + uint8_t state; + uint8_t reserved_3[3]; +} __attribute__((__packed__)); + +/** LACPDU structure (5.4.2 in 802.1AX documentation). */ +struct lacpdu { + uint8_t subtype; + uint8_t version_number; + + struct lacpdu_actor_partner_params actor; + struct lacpdu_actor_partner_params partner; + + uint8_t tlv_type_collector_info; + uint8_t collector_info_length; + uint16_t collector_max_delay; + uint8_t reserved_12[12]; + + uint8_t tlv_type_terminator; + uint8_t terminator_length; + uint8_t reserved_50[50]; +} __attribute__((__packed__)); + +/** LACPDU frame: Contains ethernet header and LACPDU. 
*/ +struct lacpdu_header { + struct ether_hdr eth_hdr; + struct lacpdu lacpdu; +} __attribute__((__packed__)); + +struct marker { + uint8_t subtype; + uint8_t version_number; + + uint8_t tlv_type_marker; + uint8_t info_length; + uint16_t requester_port; + struct ether_addr requester_system; + uint32_t requester_transaction_id; + uint8_t reserved_2[2]; + + uint8_t tlv_type_terminator; + uint8_t terminator_length; + uint8_t reserved_90[90]; +} __attribute__((__packed__)); + +struct marker_header { + struct ether_hdr eth_hdr; + struct marker marker; +} __attribute__((__packed__)); + +struct rte_eth_bond_8023ad_conf { + uint32_t fast_periodic_ms; + uint32_t slow_periodic_ms; + uint32_t short_timeout_ms; + uint32_t long_timeout_ms; + uint32_t aggregate_wait_timeout_ms; + uint32_t tx_period_ms; + uint32_t rx_marker_period_ms; + uint32_t update_timeout_ms; + rte_eth_bond_8023ad_ext_slowrx_fn slowrx_cb; +}; + +struct rte_eth_bond_8023ad_slave_info { + enum rte_bond_8023ad_selection selected; + uint8_t actor_state; + struct port_params actor; + uint8_t partner_state; + struct port_params partner; + uint8_t agg_port_id; +}; + +/** + * @internal + * + * Function returns current configuration of 802.3AX mode. + * + * @param port_id Bonding device id + * @param conf Pointer to timeout structure. + * + * @return + * 0 - if ok + * -EINVAL if conf is NULL + */ +int +rte_eth_bond_8023ad_conf_get(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf); +int +rte_eth_bond_8023ad_conf_get_v20(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf); +int +rte_eth_bond_8023ad_conf_get_v1607(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf); + +/** + * @internal + * + * Function set new configuration of 802.3AX mode. + * + * @param port_id Bonding device id + * @param conf Configuration, if NULL set default configuration. + * @return + * 0 - if ok + * -EINVAL if configuration is invalid. 
+ */ +int +rte_eth_bond_8023ad_setup(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf); +int +rte_eth_bond_8023ad_setup_v20(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf); +int +rte_eth_bond_8023ad_setup_v1607(uint8_t port_id, + struct rte_eth_bond_8023ad_conf *conf); + +/** + * @internal + * + * Function returns current state of given slave device. + * + * @param port_id Bonding device id + * @param slave_id Port id of valid slave. + * @param conf buffer for configuration + * @return + * 0 - if ok + * -EINVAL if conf is NULL or slave id is invalid (not a slave of given + * bonded device or is not inactive). + */ +int +rte_eth_bond_8023ad_slave_info(uint8_t port_id, uint8_t slave_id, + struct rte_eth_bond_8023ad_slave_info *conf); + +/** + * Configure a slave port to start collecting. + * + * @param port_id Bonding device id + * @param slave_id Port id of valid slave. + * @param enabled Non-zero when collection enabled. + * @return + * 0 - if ok + * -EINVAL if slave is not valid. + */ +int +rte_eth_bond_8023ad_ext_collect(uint8_t port_id, uint8_t slave_id, int enabled); + +/** + * Get COLLECTING flag from slave port actor state. + * + * @param port_id Bonding device id + * @param slave_id Port id of valid slave. + * @return + * 0 - if not set + * 1 - if set + * -EINVAL if slave is not valid. + */ +int +rte_eth_bond_8023ad_ext_collect_get(uint8_t port_id, uint8_t slave_id); + +/** + * Configure a slave port to start distributing. + * + * @param port_id Bonding device id + * @param slave_id Port id of valid slave. + * @param enabled Non-zero when distribution enabled. + * @return + * 0 - if ok + * -EINVAL if slave is not valid. + */ +int +rte_eth_bond_8023ad_ext_distrib(uint8_t port_id, uint8_t slave_id, int enabled); + +/** + * Get DISTRIBUTING flag from slave port actor state. + * + * @param port_id Bonding device id + * @param slave_id Port id of valid slave. + * @return + * 0 - if not set + * 1 - if set + * -EINVAL if slave is not valid. 
+ */ +int +rte_eth_bond_8023ad_ext_distrib_get(uint8_t port_id, uint8_t slave_id); + +/** + * LACPDU transmit path for external 802.3ad state machine. Caller retains + * ownership of the packet on failure. + * + * @param port_id Bonding device id + * @param slave_id Port ID of valid slave device. + * @param lacp_pkt mbuf containing LACPDU. + * + * @return + * 0 on success, negative value otherwise. + */ +int +rte_eth_bond_8023ad_ext_slowtx(uint8_t port_id, uint8_t slave_id, + struct rte_mbuf *lacp_pkt); + +/* + * The C++ linkage block is closed here, after the last prototype, so that + * the rte_eth_bond_8023ad_ext_* functions above are also declared with C + * linkage when this header is included from C++ code. + */ +#ifdef __cplusplus +} +#endif + +#endif /* RTE_ETH_BOND_8023AD_H_ */ diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad_private.h b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad_private.h new file mode 100644 index 00000000..ca8858be --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_8023ad_private.h @@ -0,0 +1,298 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RTE_ETH_BOND_8023AD_PRIVATE_H_ +#define RTE_ETH_BOND_8023AD_PRIVATE_H_ + +#include <stdint.h> + +#include <rte_ether.h> +#include <rte_byteorder.h> +#include <rte_atomic.h> + +#include "rte_eth_bond_8023ad.h" + +#define BOND_MODE_8023AX_UPDATE_TIMEOUT_MS 100 +/** Maximum number of packets to one slave queued in TX ring. NOTE(review): the macro name says RX but this text says TX ring - confirm which ring this bounds. */ +#define BOND_MODE_8023AX_SLAVE_RX_PKTS 3 +/** Maximum number of LACP packets from one slave queued in TX ring. */ +#define BOND_MODE_8023AX_SLAVE_TX_PKTS 1 +/** + * Timeout definitions (5.4.4 in 802.1AX documentation). + */ +#define BOND_8023AD_FAST_PERIODIC_MS 900 +#define BOND_8023AD_SLOW_PERIODIC_MS 29000 +#define BOND_8023AD_SHORT_TIMEOUT_MS 3000 +#define BOND_8023AD_LONG_TIMEOUT_MS 90000 +#define BOND_8023AD_CHURN_DETECTION_TIMEOUT_MS 60000 +#define BOND_8023AD_AGGREGATE_WAIT_TIMEOUT_MS 2000 +#define BOND_8023AD_TX_MACHINE_PERIOD_MS 500 +#define BOND_8023AD_RX_MARKER_PERIOD_MS 2000 + +/** + * Interval of showing warning message from state machines. All messages will + * be held (and gathered together) to prevent flooding. + * This is not part of the 802.1AX standard. 
+ */ +#define BOND_8023AD_WARNINGS_PERIOD_MS 1000 + + + +/** + * State machine flags + */ +#define SM_FLAGS_BEGIN 0x0001 +#define SM_FLAGS_LACP_ENABLED 0x0002 +#define SM_FLAGS_ACTOR_CHURN 0x0004 +#define SM_FLAGS_PARTNER_CHURN 0x0008 +#define SM_FLAGS_MOVED 0x0100 +#define SM_FLAGS_PARTNER_SHORT_TIMEOUT 0x0200 +#define SM_FLAGS_NTT 0x0400 + +#define BOND_LINK_FULL_DUPLEX_KEY 0x01 +#define BOND_LINK_SPEED_KEY_10M 0x02 +#define BOND_LINK_SPEED_KEY_100M 0x04 +#define BOND_LINK_SPEED_KEY_1000M 0x08 +#define BOND_LINK_SPEED_KEY_10G 0x10 +#define BOND_LINK_SPEED_KEY_20G 0x11 +#define BOND_LINK_SPEED_KEY_40G 0x12 + +#define WRN_RX_MARKER_TO_FAST 0x01 +#define WRN_UNKNOWN_SLOW_TYPE 0x02 +#define WRN_UNKNOWN_MARKER_TYPE 0x04 +#define WRN_NOT_LACP_CAPABLE 0x08 +#define WRN_RX_QUEUE_FULL 0x10 +#define WRN_TX_QUEUE_FULL 0x20 + +#define CHECK_FLAGS(_variable, _f) ((_variable) & (_f)) +#define SET_FLAGS(_variable, _f) ((_variable) |= (_f)) +#define CLEAR_FLAGS(_variable, _f) ((_variable) &= ~(_f)) + +#define SM_FLAG(_p, _f) (!!CHECK_FLAGS((_p)->sm_flags, SM_FLAGS_ ## _f)) +#define SM_FLAG_SET(_p, _f) SET_FLAGS((_p)->sm_flags, SM_FLAGS_ ## _f) +#define SM_FLAG_CLR(_p, _f) CLEAR_FLAGS((_p)->sm_flags, SM_FLAGS_ ## _f) + +#define ACTOR_STATE(_p, _f) (!!CHECK_FLAGS((_p)->actor_state, STATE_ ## _f)) +#define ACTOR_STATE_SET(_p, _f) SET_FLAGS((_p)->actor_state, STATE_ ## _f) +#define ACTOR_STATE_CLR(_p, _f) CLEAR_FLAGS((_p)->actor_state, STATE_ ## _f) + +#define PARTNER_STATE(_p, _f) (!!CHECK_FLAGS((_p)->partner_state, STATE_ ## _f)) +#define PARTNER_STATE_SET(_p, _f) SET_FLAGS((_p)->partner_state, STATE_ ## _f) +#define PARTNER_STATE_CLR(_p, _f) CLEAR_FLAGS((_p)->partner_state, STATE_ ## _f) + +/** Variables associated with each port (5.4.7 in 802.1AX documentation). */ +struct port { + /** + * The operational values of the Actor's state parameters. Bitmask + * of port states. 
+ */ + uint8_t actor_state; + + /** The operational Actor's port parameters */ + struct port_params actor; + + /** + * The operational value of the Actor's view of the current values of + * the Partner's state parameters. The Actor sets this variable either + * to the value received from the Partner in an LACPDU, or to the value + * of Partner_Admin_Port_State. Bitmask of port states. + */ + uint8_t partner_state; + + /** The operational Partner's port parameters */ + struct port_params partner; + + /* Additional port parameters not listed in documentation */ + /** State machine flags */ + uint16_t sm_flags; + enum rte_bond_8023ad_selection selected; + + uint64_t current_while_timer; + uint64_t periodic_timer; + uint64_t wait_while_timer; + uint64_t tx_machine_timer; + uint64_t tx_marker_timer; + /* Aggregator parameters */ + /** Used aggregator port ID */ + uint16_t aggregator_port_id; + + /** Memory pool used to allocate rings */ + struct rte_mempool *mbuf_pool; + + /** Ring of LACP packets from RX burst function */ + struct rte_ring *rx_ring; + + /** Ring of slow protocol packets (LACP and MARKERS) to TX burst function */ + struct rte_ring *tx_ring; + + /** Timer which is also used as mutex. If it is 0 (not running) an RX marker + * packet may be answered; otherwise the packet shall be dropped. It is zeroed + * in the mode 4 callback function after it expires. */ + volatile uint64_t rx_marker_timer; + + uint64_t warning_timer; + volatile uint16_t warnings_to_show; +}; + +struct mode8023ad_private { + uint64_t fast_periodic_timeout; + uint64_t slow_periodic_timeout; + uint64_t short_timeout; + uint64_t long_timeout; + uint64_t aggregate_wait_timeout; + uint64_t tx_period_timeout; + uint64_t rx_marker_timeout; + uint64_t update_timeout_us; + rte_eth_bond_8023ad_ext_slowrx_fn slowrx_cb; + uint8_t external_sm; +}; + +/** + * @internal + * The pool of *port* structures. The size of the pool + * is configured at compile-time in the <rte_eth_bond_8023ad.c> file. 
+ */ +extern struct port mode_8023ad_ports[]; + +/* Forward declaration */ +struct bond_dev_private; + + +/** + * @internal + * + * Set mode 4 configuration of bonded interface. + * + * @pre Bonded interface must be stopped. + * + * @param dev Bonded interface + * @param conf new configuration. If NULL set default configuration. + */ +void +bond_mode_8023ad_setup(struct rte_eth_dev *dev, + struct rte_eth_bond_8023ad_conf *conf); + +/** + * @internal + * + * Enables 802.1AX mode and all active slaves on bonded interface. + * + * @param dev Bonded interface + * @return + * 0 on success, negative value otherwise. + */ +int +bond_mode_8023ad_enable(struct rte_eth_dev *dev); + +/** + * @internal + * + * Disables 802.1AX mode of the bonded interface and slaves. + * + * @param dev Bonded interface + * @return + * 0 on success, negative value otherwise. + */ +int bond_mode_8023ad_disable(struct rte_eth_dev *dev); + +/** + * @internal + * + * Starts 802.3AX state machines management logic. + * @param dev Bonded interface + * @return + * 0 if machines were started, 1 if machines were already running, + * negative value otherwise. + */ +int +bond_mode_8023ad_start(struct rte_eth_dev *dev); + +/** + * @internal + * + * Stops 802.3AX state machines management logic. + * @param dev Bonded interface + */ +void +bond_mode_8023ad_stop(struct rte_eth_dev *dev); + +/** + * @internal + * + * Passes given slow packet to state machines management logic. + * @param internals Bonded device private data. + * @param slave_id Slave port id. + * @param pkt Slow packet. + */ +void +bond_mode_8023ad_handle_slow_pkt(struct bond_dev_private *internals, + uint8_t slave_id, struct rte_mbuf *pkt); + +/** + * @internal + * + * Adds given slave to 802.1AX mode. + * + * @param dev Bonded interface. + * @param port_id Slave port ID to be added 
+ */ +void +bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint8_t port_id); + +/** + * @internal + * + * Deinitializes and removes given slave from 802.1AX mode. + * + * @param dev Bonded interface. + * @param slave_pos Position of slave in active_slaves array + * + * @return + * 0 on success, negative value otherwise. + */ +int +bond_mode_8023ad_deactivate_slave(struct rte_eth_dev *dev, uint8_t slave_pos); + +/** + * Updates state when MAC was changed on bonded device or one of its slaves. + * @param bond_dev Bonded device + */ +void +bond_mode_8023ad_mac_address_update(struct rte_eth_dev *bond_dev); + +#endif /* RTE_ETH_BOND_8023AD_PRIVATE_H_ */ diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_alb.c b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_alb.c new file mode 100644 index 00000000..38f5c4d4 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_alb.c @@ -0,0 +1,287 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rte_eth_bond_private.h" +#include "rte_eth_bond_alb.h" + +static inline uint8_t +simple_hash(uint8_t *hash_start, int hash_size) +{ + int i; + uint8_t hash; + + hash = 0; + for (i = 0; i < hash_size; ++i) + hash ^= hash_start[i]; + + return hash; +} + +static uint8_t +calculate_slave(struct bond_dev_private *internals) +{ + uint8_t idx; + + idx = (internals->mode6.last_slave + 1) % internals->active_slave_count; + internals->mode6.last_slave = idx; + return internals->active_slaves[idx]; +} + +int +bond_mode_alb_enable(struct rte_eth_dev *bond_dev) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct client_data *hash_table = internals->mode6.client_table; + + uint16_t data_size; + char mem_name[RTE_ETH_NAME_MAX_LEN]; + int socket_id = bond_dev->data->numa_node; + + /* Fill hash table with initial values */ + memset(hash_table, 0, sizeof(struct client_data) * ALB_HASH_TABLE_SIZE); + rte_spinlock_init(&internals->mode6.lock); + internals->mode6.last_slave = ALB_NULL_INDEX; + internals->mode6.ntt = 0; + + /* Initialize memory pool for ARP packets to send */ + if (internals->mode6.mempool == NULL) { + /* + * 256 is size of ETH header, 
ARP header and nested VLAN headers. + * The value is chosen to be cache aligned. + */ + data_size = 256 + RTE_PKTMBUF_HEADROOM; + snprintf(mem_name, sizeof(mem_name), "%s_MODE6", bond_dev->data->name); + internals->mode6.mempool = rte_pktmbuf_pool_create(mem_name, + 512 * RTE_MAX_ETHPORTS, + RTE_MEMPOOL_CACHE_MAX_SIZE >= 32 ? + 32 : RTE_MEMPOOL_CACHE_MAX_SIZE, + 0, data_size, socket_id); + + if (internals->mode6.mempool == NULL) { + RTE_LOG(ERR, PMD, "%s: Failed to initialize ALB mempool.\n", + bond_dev->data->name); + goto mempool_alloc_error; + } + } + + return 0; + +mempool_alloc_error: + return -ENOMEM; +} + +void bond_mode_alb_arp_recv(struct ether_hdr *eth_h, uint16_t offset, + struct bond_dev_private *internals) { + struct arp_hdr *arp; + + struct client_data *hash_table = internals->mode6.client_table; + struct client_data *client_info; + + uint8_t hash_index; + + arp = (struct arp_hdr *) ((char *) (eth_h + 1) + offset); + + /* ARP Requests are forwarded to the application with no changes */ + if (arp->arp_op != rte_cpu_to_be_16(ARP_OP_REPLY)) + return; + + /* From now on, we analyze only ARP Reply packets */ + hash_index = simple_hash((uint8_t *) &arp->arp_data.arp_sip, + sizeof(arp->arp_data.arp_sip)); + client_info = &hash_table[hash_index]; + + /* + * We got reply for ARP Request send by the application. We need to + * update client table when received data differ from what is stored + * in ALB table and issue sending update packet to that slave. 
+ */ + rte_spinlock_lock(&internals->mode6.lock); + if (client_info->in_use == 0 || + client_info->app_ip != arp->arp_data.arp_tip || + client_info->cli_ip != arp->arp_data.arp_sip || + !is_same_ether_addr(&client_info->cli_mac, &arp->arp_data.arp_sha) || + client_info->vlan_count != offset / sizeof(struct vlan_hdr) || + memcmp(client_info->vlan, eth_h + 1, offset) != 0 + ) { + client_info->in_use = 1; + client_info->app_ip = arp->arp_data.arp_tip; + client_info->cli_ip = arp->arp_data.arp_sip; + ether_addr_copy(&arp->arp_data.arp_sha, &client_info->cli_mac); + client_info->slave_idx = calculate_slave(internals); + rte_eth_macaddr_get(client_info->slave_idx, &client_info->app_mac); + ether_addr_copy(&client_info->app_mac, &arp->arp_data.arp_tha); + memcpy(client_info->vlan, eth_h + 1, offset); + client_info->vlan_count = offset / sizeof(struct vlan_hdr); + } + internals->mode6.ntt = 1; + rte_spinlock_unlock(&internals->mode6.lock); +} + +uint8_t +bond_mode_alb_arp_xmit(struct ether_hdr *eth_h, uint16_t offset, + struct bond_dev_private *internals) +{ + struct arp_hdr *arp; + + struct client_data *hash_table = internals->mode6.client_table; + struct client_data *client_info; + + uint8_t hash_index; + + struct ether_addr bonding_mac; + + arp = (struct arp_hdr *)((char *)(eth_h + 1) + offset); + + /* + * Traffic with src MAC other than bonding should be sent on + * current primary port. 
+ */ + rte_eth_macaddr_get(internals->port_id, &bonding_mac); + if (!is_same_ether_addr(&bonding_mac, &arp->arp_data.arp_sha)) { + rte_eth_macaddr_get(internals->current_primary_port, + &arp->arp_data.arp_sha); + return internals->current_primary_port; + } + + hash_index = simple_hash((uint8_t *)&arp->arp_data.arp_tip, + sizeof(uint32_t)); + client_info = &hash_table[hash_index]; + + rte_spinlock_lock(&internals->mode6.lock); + if (arp->arp_op == rte_cpu_to_be_16(ARP_OP_REPLY)) { + if (client_info->in_use) { + if (client_info->app_ip == arp->arp_data.arp_sip && + client_info->cli_ip == arp->arp_data.arp_tip) { + /* Entry is already assigned to this client */ + if (!is_broadcast_ether_addr(&arp->arp_data.arp_tha)) { + ether_addr_copy(&arp->arp_data.arp_tha, + &client_info->cli_mac); + } + rte_eth_macaddr_get(client_info->slave_idx, + &client_info->app_mac); + ether_addr_copy(&client_info->app_mac, &arp->arp_data.arp_sha); + memcpy(client_info->vlan, eth_h + 1, offset); + client_info->vlan_count = offset / sizeof(struct vlan_hdr); + rte_spinlock_unlock(&internals->mode6.lock); + return client_info->slave_idx; + } + } + + /* Assign new slave to this client and update src mac in ARP */ + client_info->in_use = 1; + client_info->ntt = 0; + client_info->app_ip = arp->arp_data.arp_sip; + ether_addr_copy(&arp->arp_data.arp_tha, &client_info->cli_mac); + client_info->cli_ip = arp->arp_data.arp_tip; + client_info->slave_idx = calculate_slave(internals); + rte_eth_macaddr_get(client_info->slave_idx, &client_info->app_mac); + ether_addr_copy(&client_info->app_mac, &arp->arp_data.arp_sha); + memcpy(client_info->vlan, eth_h + 1, offset); + client_info->vlan_count = offset / sizeof(struct vlan_hdr); + rte_spinlock_unlock(&internals->mode6.lock); + return client_info->slave_idx; + } + + /* If packet is not ARP Reply, send it on current primary port. 
*/ + rte_spinlock_unlock(&internals->mode6.lock); + rte_eth_macaddr_get(internals->current_primary_port, + &arp->arp_data.arp_sha); + return internals->current_primary_port; +} + +/* + * Build an ARP reply in pkt from the addresses and VLAN headers cached in + * client_info, under mode6.lock, and return the slave recorded for that client. + */ +uint8_t +bond_mode_alb_arp_upd(struct client_data *client_info, + struct rte_mbuf *pkt, struct bond_dev_private *internals) +{ + struct ether_hdr *eth_h; + struct arp_hdr *arp_h; + uint8_t slave_idx; + + rte_spinlock_lock(&internals->mode6.lock); + eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *); + + ether_addr_copy(&client_info->app_mac, &eth_h->s_addr); + ether_addr_copy(&client_info->cli_mac, &eth_h->d_addr); + if (client_info->vlan_count > 0) + eth_h->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN); + else + eth_h->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP); + + arp_h = (struct arp_hdr *)((char *)eth_h + sizeof(struct ether_hdr) + + client_info->vlan_count * sizeof(struct vlan_hdr)); + + memcpy(eth_h + 1, client_info->vlan, + client_info->vlan_count * sizeof(struct vlan_hdr)); + + ether_addr_copy(&client_info->app_mac, &arp_h->arp_data.arp_sha); + arp_h->arp_data.arp_sip = client_info->app_ip; + ether_addr_copy(&client_info->cli_mac, &arp_h->arp_data.arp_tha); + arp_h->arp_data.arp_tip = client_info->cli_ip; + + arp_h->arp_hrd = rte_cpu_to_be_16(ARP_HRD_ETHER); + arp_h->arp_pro = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + arp_h->arp_hln = ETHER_ADDR_LEN; + arp_h->arp_pln = sizeof(uint32_t); + arp_h->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY); + + slave_idx = client_info->slave_idx; + rte_spinlock_unlock(&internals->mode6.lock); + + return slave_idx; +} + +void +bond_mode_alb_client_list_upd(struct rte_eth_dev *bond_dev) +{ + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct client_data *client_info; + + int i; + + /* If active slave count is 0, it's pointless to refresh alb table */ + if (internals->active_slave_count <= 0) + return; + + rte_spinlock_lock(&internals->mode6.lock); + internals->mode6.last_slave = ALB_NULL_INDEX; + + for (i = 0; i < ALB_HASH_TABLE_SIZE; 
i++) { + client_info = &internals->mode6.client_table[i]; + if (client_info->in_use) { + client_info->slave_idx = calculate_slave(internals); + rte_eth_macaddr_get(client_info->slave_idx, &client_info->app_mac); + internals->mode6.ntt = 1; + } + } + rte_spinlock_unlock(&internals->mode6.lock); +} diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_alb.h b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_alb.h new file mode 100644 index 00000000..fd7c3aeb --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_alb.h @@ -0,0 +1,142 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RTE_ETH_BOND_ALB_H_ +#define RTE_ETH_BOND_ALB_H_ + +#include <rte_ether.h> +#include <rte_arp.h> + +#define ALB_HASH_TABLE_SIZE 256 +#define ALB_NULL_INDEX 0xFFFFFFFF + +struct client_data { + /** ARP data of single client */ + struct ether_addr app_mac; + /**< MAC address of application running DPDK */ + uint32_t app_ip; + /**< IP address of application running DPDK */ + struct ether_addr cli_mac; + /**< Client MAC address */ + uint32_t cli_ip; + /**< Client IP address */ + + uint8_t slave_idx; + /**< Index of slave on which we connect with that client */ + uint8_t in_use; + /**< Flag indicating if entry in client table is currently used */ + uint8_t ntt; + /**< Flag indicating if we need to send update to this client on next tx */ + + struct vlan_hdr vlan[2]; + /**< Content of vlan headers */ + uint8_t vlan_count; + /**< Number of nested vlan headers */ +}; + +struct mode_alb_private { + struct client_data client_table[ALB_HASH_TABLE_SIZE]; + /**< Hash table storing ARP data of every client connected */ + struct rte_mempool *mempool; + /**< Mempool for creating ARP update packets */ + uint8_t ntt; + /**< Flag indicating if we need to send update to any client on next tx */ + uint32_t last_slave; + /**< Index of last used slave in client table */ + rte_spinlock_t lock; +}; + +/** + * ALB mode initialization. + * + * @param bond_dev Pointer to bonding device. + * + * @return + * Error code - 0 on success. 
+ */ +int +bond_mode_alb_enable(struct rte_eth_dev *bond_dev); + +/** + * Function handles ARP packet reception. If received ARP request, it is + * forwarded to application without changes. If it is ARP reply, client table + * is updated. + * + * @param eth_h ETH header of received packet. + * @param offset Length in bytes of the VLAN headers following the ETH header. + * @param internals Bonding data. + */ +void +bond_mode_alb_arp_recv(struct ether_hdr *eth_h, uint16_t offset, + struct bond_dev_private *internals); + +/** + * Function handles ARP packet transmission. It also decides on which slave + * to send that packet. If packet is ARP Request, it is sent on primary slave. + * If it is ARP Reply, it is sent on slave stored in client table for that + * connection. On Reply function also updates data in client table. + * + * @param eth_h ETH header of transmitted packet. + * @param offset Length in bytes of the VLAN headers following the ETH header. + * @param internals Bonding data. + * + * @return + * Index of slave on which packet should be sent. + */ +uint8_t +bond_mode_alb_arp_xmit(struct ether_hdr *eth_h, uint16_t offset, + struct bond_dev_private *internals); + +/** + * Function fills packet with ARP data from client_info. + * + * @param client_info Data of client to which packet is sent. + * @param pkt Pointer to packet which is sent. + * @param internals Bonding data. + * + * @return + * Index of slave on which packet should be sent. + */ +uint8_t +bond_mode_alb_arp_upd(struct client_data *client_info, + struct rte_mbuf *pkt, struct bond_dev_private *internals); + +/** + * Function updates slave indexes of active connections. + * + * @param bond_dev Pointer to bonded device struct. 
+ */ +void +bond_mode_alb_client_list_upd(struct rte_eth_dev *bond_dev); + +#endif /* RTE_ETH_BOND_ALB_H_ */ diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_api.c b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_api.c new file mode 100644 index 00000000..36ec65d6 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_api.c @@ -0,0 +1,784 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> + +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ethdev.h> +#include <rte_tcp.h> +#include <rte_vdev.h> +#include <rte_kvargs.h> + +#include "rte_eth_bond.h" +#include "rte_eth_bond_private.h" +#include "rte_eth_bond_8023ad_private.h" + +int +check_for_bonded_ethdev(const struct rte_eth_dev *eth_dev) +{ + /* Check valid pointer */ + if (eth_dev->data->drv_name == NULL) + return -1; + + /* return 0 if driver name matches */ + return eth_dev->data->drv_name != pmd_bond_drv.driver.name; +} + +int +valid_bonded_port_id(uint8_t port_id) +{ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -1); + return check_for_bonded_ethdev(&rte_eth_devices[port_id]); +} + +int +valid_slave_port_id(uint8_t port_id) +{ + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -1); + + /* Verify that port_id refers to a non bonded port */ + if (check_for_bonded_ethdev(&rte_eth_devices[port_id]) == 0) + return -1; + + return 0; +} + +void +activate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id) +{ + struct bond_dev_private *internals = eth_dev->data->dev_private; + uint8_t active_count = internals->active_slave_count; + + if (internals->mode == BONDING_MODE_8023AD) + bond_mode_8023ad_activate_slave(eth_dev, port_id); + + if (internals->mode == BONDING_MODE_TLB + || internals->mode == BONDING_MODE_ALB) { + + internals->tlb_slaves_order[active_count] = port_id; + } + + RTE_ASSERT(internals->active_slave_count < + 
(RTE_DIM(internals->active_slaves) - 1)); + + internals->active_slaves[internals->active_slave_count] = port_id; + internals->active_slave_count++; + + if (internals->mode == BONDING_MODE_TLB) + bond_tlb_activate_slave(internals); + if (internals->mode == BONDING_MODE_ALB) + bond_mode_alb_client_list_upd(eth_dev); +} + +void +deactivate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id) +{ + uint8_t slave_pos; + struct bond_dev_private *internals = eth_dev->data->dev_private; + uint8_t active_count = internals->active_slave_count; + + if (internals->mode == BONDING_MODE_8023AD) { + bond_mode_8023ad_stop(eth_dev); + bond_mode_8023ad_deactivate_slave(eth_dev, port_id); + } else if (internals->mode == BONDING_MODE_TLB + || internals->mode == BONDING_MODE_ALB) + bond_tlb_disable(internals); + + slave_pos = find_slave_by_id(internals->active_slaves, active_count, + port_id); + + /* If slave was not at the end of the list + * shift active slaves up active array list */ + if (slave_pos < active_count) { + active_count--; + memmove(internals->active_slaves + slave_pos, + internals->active_slaves + slave_pos + 1, + (active_count - slave_pos) * + sizeof(internals->active_slaves[0])); + } + + RTE_ASSERT(active_count < RTE_DIM(internals->active_slaves)); + internals->active_slave_count = active_count; + + if (eth_dev->data->dev_started) { + if (internals->mode == BONDING_MODE_8023AD) { + bond_mode_8023ad_start(eth_dev); + } else if (internals->mode == BONDING_MODE_TLB) { + bond_tlb_enable(internals); + } else if (internals->mode == BONDING_MODE_ALB) { + bond_tlb_enable(internals); + bond_mode_alb_client_list_upd(eth_dev); + } + } +} + +uint8_t +number_of_sockets(void) +{ + int sockets = 0; + int i; + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + + for (i = 0; ((i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL)); i++) { + if (sockets < ms[i].socket_id) + sockets = ms[i].socket_id; + } + + /* Number of sockets = maximum socket_id + 1 */ + return ++sockets; +} + +int 
+rte_eth_bond_create(const char *name, uint8_t mode, uint8_t socket_id) +{ + struct bond_dev_private *internals; + char devargs[52]; + uint8_t port_id; + int ret; + + if (name == NULL) { + RTE_BOND_LOG(ERR, "Invalid name specified"); + return -EINVAL; + } + + ret = snprintf(devargs, sizeof(devargs), + "driver=net_bonding,mode=%d,socket_id=%d", mode, socket_id); + if (ret < 0 || ret >= (int)sizeof(devargs)) + return -ENOMEM; + + ret = rte_vdev_init(name, devargs); + if (ret) + return -ENOMEM; + + ret = rte_eth_dev_get_port_by_name(name, &port_id); + RTE_ASSERT(!ret); + + /* + * To make bond_ethdev_configure() happy we need to free the + * internals->kvlist here. + * + * Also see comment in bond_ethdev_configure(). + */ + internals = rte_eth_devices[port_id].data->dev_private; + rte_kvargs_free(internals->kvlist); + internals->kvlist = NULL; + + return port_id; +} + +int +rte_eth_bond_free(const char *name) +{ + return rte_vdev_uninit(name); +}
+
+/*
+ * Apply every VLAN id recorded in the bonded device's vlan_filter_bmp to
+ * the given slave port.
+ *
+ * Returns 0 when hardware VLAN filtering is disabled on the bonded device
+ * or the bitmap holds no entry, otherwise the result of the last
+ * rte_eth_dev_vlan_filter() call.
+ */
+static int
+slave_vlan_filter_set(uint8_t bonded_port_id, uint8_t slave_port_id)
+{
+	struct rte_eth_dev *bonded_eth_dev;
+	struct bond_dev_private *internals;
+	int found;
+	int res = 0;
+	uint64_t slab = 0;
+	uint32_t pos = 0;
+	uint16_t first;
+
+	bonded_eth_dev = &rte_eth_devices[bonded_port_id];
+	if (bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter == 0)
+		return 0;
+
+	internals = bonded_eth_dev->data->dev_private;
+	found = rte_bitmap_scan(internals->vlan_filter_bmp, &pos, &slab);
+	/* remember where the scan started so a wrap-around ends the loop */
+	first = pos;
+
+	if (!found)
+		return 0;
+
+	do {
+		uint32_t i;
+		uint64_t mask;
+
+		for (i = 0, mask = 1;
+		     i < RTE_BITMAP_SLAB_BIT_SIZE;
+		     i ++, mask <<= 1) {
+			if (unlikely(slab & mask)) {
+				/* pos is the position of the slab's first
+				 * bit, so bit i of the slab is VLAN id
+				 * pos + i, not pos itself.
+				 * NOTE(review): relies on rte_bitmap_scan()
+				 * returning a slab-aligned pos - confirm
+				 * against rte_bitmap.h */
+				uint16_t vlan_id = (uint16_t)(pos + i);
+
+				res = rte_eth_dev_vlan_filter(slave_port_id,
+							      vlan_id, 1);
+			}
+		}
+		found = rte_bitmap_scan(internals->vlan_filter_bmp,
+					&pos, &slab);
+	} while (found && first != pos && res == 0);
+
+	return res;
+}
+
+static int +__eth_bond_slave_add_lock_free(uint8_t bonded_port_id, uint8_t slave_port_id) +{ + struct rte_eth_dev *bonded_eth_dev, 
*slave_eth_dev; + struct bond_dev_private *internals; + struct rte_eth_link link_props; + struct rte_eth_dev_info dev_info; + + if (valid_slave_port_id(slave_port_id) != 0) + return -1; + + bonded_eth_dev = &rte_eth_devices[bonded_port_id]; + internals = bonded_eth_dev->data->dev_private; + + slave_eth_dev = &rte_eth_devices[slave_port_id]; + if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_BONDED_SLAVE) { + RTE_BOND_LOG(ERR, "Slave device is already a slave of a bonded device"); + return -1; + } + + /* Add slave details to bonded device */ + slave_eth_dev->data->dev_flags |= RTE_ETH_DEV_BONDED_SLAVE; + + rte_eth_dev_info_get(slave_port_id, &dev_info); + if (dev_info.max_rx_pktlen < internals->max_rx_pktlen) { + RTE_BOND_LOG(ERR, "Slave (port %u) max_rx_pktlen too small", + slave_port_id); + return -1; + } + + slave_add(internals, slave_eth_dev); + + /* We need to store slaves reta_size to be able to synchronize RETA for all + * slave devices even if its sizes are different. + */ + internals->slaves[internals->slave_count].reta_size = dev_info.reta_size; + + if (internals->slave_count < 1) { + /* if MAC is not user defined then use MAC of first slave add to + * bonded device */ + if (!internals->user_defined_mac) + mac_address_set(bonded_eth_dev, slave_eth_dev->data->mac_addrs); + + /* Inherit eth dev link properties from first slave */ + link_properties_set(bonded_eth_dev, + &(slave_eth_dev->data->dev_link)); + + /* Make primary slave */ + internals->primary_port = slave_port_id; + internals->current_primary_port = slave_port_id; + + /* Inherit queues settings from first slave */ + internals->nb_rx_queues = slave_eth_dev->data->nb_rx_queues; + internals->nb_tx_queues = slave_eth_dev->data->nb_tx_queues; + + internals->reta_size = dev_info.reta_size; + + /* Take the first dev's offload capabilities */ + internals->rx_offload_capa = dev_info.rx_offload_capa; + internals->tx_offload_capa = dev_info.tx_offload_capa; + internals->flow_type_rss_offloads = 
dev_info.flow_type_rss_offloads; + + /* Inherit first slave's max rx packet size */ + internals->candidate_max_rx_pktlen = dev_info.max_rx_pktlen; + + } else { + internals->rx_offload_capa &= dev_info.rx_offload_capa; + internals->tx_offload_capa &= dev_info.tx_offload_capa; + internals->flow_type_rss_offloads &= dev_info.flow_type_rss_offloads; + + /* RETA size is GCD of all slaves RETA sizes, so, if all sizes will be + * the power of 2, the lower one is GCD + */ + if (internals->reta_size > dev_info.reta_size) + internals->reta_size = dev_info.reta_size; + + if (!internals->max_rx_pktlen && + dev_info.max_rx_pktlen < internals->candidate_max_rx_pktlen) + internals->candidate_max_rx_pktlen = dev_info.max_rx_pktlen; + } + + bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf &= + internals->flow_type_rss_offloads; + + internals->slave_count++; + + /* Update all slave devices MACs*/ + mac_address_slaves_update(bonded_eth_dev); + + if (bonded_eth_dev->data->dev_started) { + if (slave_configure(bonded_eth_dev, slave_eth_dev) != 0) { + slave_eth_dev->data->dev_flags &= (~RTE_ETH_DEV_BONDED_SLAVE); + RTE_BOND_LOG(ERR, "rte_bond_slaves_configure: port=%d", + slave_port_id); + return -1; + } + } + + /* Register link status change callback with bonded device pointer as + * argument*/ + rte_eth_dev_callback_register(slave_port_id, RTE_ETH_EVENT_INTR_LSC, + bond_ethdev_lsc_event_callback, &bonded_eth_dev->data->port_id); + + /* If bonded device is started then we can add the slave to our active + * slave array */ + if (bonded_eth_dev->data->dev_started) { + rte_eth_link_get_nowait(slave_port_id, &link_props); + + if (link_props.link_status == ETH_LINK_UP) { + if (internals->active_slave_count == 0 && + !internals->user_defined_primary_port) + bond_ethdev_primary_set(internals, + slave_port_id); + + if (find_slave_by_id(internals->active_slaves, + internals->active_slave_count, + slave_port_id) == internals->active_slave_count) + activate_slave(bonded_eth_dev, 
slave_port_id); + } + } + + slave_vlan_filter_set(bonded_port_id, slave_port_id); + + return 0; + +} + +int +rte_eth_bond_slave_add(uint8_t bonded_port_id, uint8_t slave_port_id) +{ + struct rte_eth_dev *bonded_eth_dev; + struct bond_dev_private *internals; + + int retval; + + /* Verify that port id's are valid bonded and slave ports */ + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + bonded_eth_dev = &rte_eth_devices[bonded_port_id]; + internals = bonded_eth_dev->data->dev_private; + + rte_spinlock_lock(&internals->lock); + + retval = __eth_bond_slave_add_lock_free(bonded_port_id, slave_port_id); + + rte_spinlock_unlock(&internals->lock); + + return retval; +} + +static int +__eth_bond_slave_remove_lock_free(uint8_t bonded_port_id, uint8_t slave_port_id) +{ + struct rte_eth_dev *bonded_eth_dev; + struct bond_dev_private *internals; + struct rte_eth_dev *slave_eth_dev; + int i, slave_idx; + + if (valid_slave_port_id(slave_port_id) != 0) + return -1; + + bonded_eth_dev = &rte_eth_devices[bonded_port_id]; + internals = bonded_eth_dev->data->dev_private; + + /* first remove from active slave list */ + slave_idx = find_slave_by_id(internals->active_slaves, + internals->active_slave_count, slave_port_id); + + if (slave_idx < internals->active_slave_count) + deactivate_slave(bonded_eth_dev, slave_port_id); + + slave_idx = -1; + /* now find in slave list */ + for (i = 0; i < internals->slave_count; i++) + if (internals->slaves[i].port_id == slave_port_id) { + slave_idx = i; + break; + } + + if (slave_idx < 0) { + RTE_BOND_LOG(ERR, "Couldn't find slave in port list, slave count %d", + internals->slave_count); + return -1; + } + + /* Un-register link status change callback with bonded device pointer as + * argument*/ + rte_eth_dev_callback_unregister(slave_port_id, RTE_ETH_EVENT_INTR_LSC, + bond_ethdev_lsc_event_callback, + &rte_eth_devices[bonded_port_id].data->port_id); + + /* Restore original MAC address of slave device */ + 
mac_address_set(&rte_eth_devices[slave_port_id], + &(internals->slaves[slave_idx].persisted_mac_addr)); + + slave_eth_dev = &rte_eth_devices[slave_port_id]; + slave_remove(internals, slave_eth_dev); + slave_eth_dev->data->dev_flags &= (~RTE_ETH_DEV_BONDED_SLAVE); + + /* first slave in the active list will be the primary by default, + * otherwise use first device in list */ + if (internals->current_primary_port == slave_port_id) { + if (internals->active_slave_count > 0) + internals->current_primary_port = internals->active_slaves[0]; + else if (internals->slave_count > 0) + internals->current_primary_port = internals->slaves[0].port_id; + else + internals->primary_port = 0; + } + + if (internals->active_slave_count < 1) { + /* reset device link properties as no slaves are active */ + link_properties_reset(&rte_eth_devices[bonded_port_id]); + + /* if no slaves are any longer attached to bonded device and MAC is not + * user defined then clear MAC of bonded device as it will be reset + * when a new slave is added */ + if (internals->slave_count < 1 && !internals->user_defined_mac) + memset(rte_eth_devices[bonded_port_id].data->mac_addrs, 0, + sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs))); + } + if (internals->slave_count == 0) { + internals->rx_offload_capa = 0; + internals->tx_offload_capa = 0; + internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK; + internals->reta_size = 0; + internals->candidate_max_rx_pktlen = 0; + internals->max_rx_pktlen = 0; + } + return 0; +} + +int +rte_eth_bond_slave_remove(uint8_t bonded_port_id, uint8_t slave_port_id) +{ + struct rte_eth_dev *bonded_eth_dev; + struct bond_dev_private *internals; + int retval; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + bonded_eth_dev = &rte_eth_devices[bonded_port_id]; + internals = bonded_eth_dev->data->dev_private; + + rte_spinlock_lock(&internals->lock); + + retval = __eth_bond_slave_remove_lock_free(bonded_port_id, slave_port_id); + + 
rte_spinlock_unlock(&internals->lock); + + return retval; +} + +int +rte_eth_bond_mode_set(uint8_t bonded_port_id, uint8_t mode) +{ + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + return bond_ethdev_mode_set(&rte_eth_devices[bonded_port_id], mode); +} + +int +rte_eth_bond_mode_get(uint8_t bonded_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + return internals->mode; +} + +int +rte_eth_bond_primary_set(uint8_t bonded_port_id, uint8_t slave_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + if (valid_slave_port_id(slave_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + internals->user_defined_primary_port = 1; + internals->primary_port = slave_port_id; + + bond_ethdev_primary_set(internals, slave_port_id); + + return 0; +} + +int +rte_eth_bond_primary_get(uint8_t bonded_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + if (internals->slave_count < 1) + return -1; + + return internals->current_primary_port; +} + +int +rte_eth_bond_slaves_get(uint8_t bonded_port_id, uint8_t slaves[], uint8_t len) +{ + struct bond_dev_private *internals; + uint8_t i; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + if (slaves == NULL) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + if (internals->slave_count > len) + return -1; + + for (i = 0; i < internals->slave_count; i++) + slaves[i] = internals->slaves[i].port_id; + + return internals->slave_count; +} + +int +rte_eth_bond_active_slaves_get(uint8_t bonded_port_id, uint8_t slaves[], + uint8_t len) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) 
!= 0) + return -1; + + if (slaves == NULL) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + if (internals->active_slave_count > len) + return -1; + + memcpy(slaves, internals->active_slaves, internals->active_slave_count); + + return internals->active_slave_count; +} + +int +rte_eth_bond_mac_address_set(uint8_t bonded_port_id, + struct ether_addr *mac_addr) +{ + struct rte_eth_dev *bonded_eth_dev; + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + bonded_eth_dev = &rte_eth_devices[bonded_port_id]; + internals = bonded_eth_dev->data->dev_private; + + /* Set MAC Address of Bonded Device */ + if (mac_address_set(bonded_eth_dev, mac_addr)) + return -1; + + internals->user_defined_mac = 1; + + /* Update all slave devices MACs*/ + if (internals->slave_count > 0) + return mac_address_slaves_update(bonded_eth_dev); + + return 0; +}
+
+/*
+ * Clear the user-defined MAC flag of the bonded device and, if slaves are
+ * attached, set the bonded device's MAC to the primary slave's persisted
+ * MAC address and update all slave MAC addresses.
+ *
+ * Returns 0 when no slaves are attached, -1 when the port is not a bonded
+ * port, the primary slave cannot be found or the MAC cannot be set,
+ * otherwise the result of mac_address_slaves_update().
+ */
+int
+rte_eth_bond_mac_address_reset(uint8_t bonded_port_id)
+{
+	struct rte_eth_dev *bonded_eth_dev;
+	struct bond_dev_private *internals;
+	int i, primary_idx = -1;
+
+	if (valid_bonded_port_id(bonded_port_id) != 0)
+		return -1;
+
+	bonded_eth_dev = &rte_eth_devices[bonded_port_id];
+	internals = bonded_eth_dev->data->dev_private;
+
+	internals->user_defined_mac = 0;
+
+	if (internals->slave_count > 0) {
+		/* primary_port holds a port id whereas slaves[] is indexed by
+		 * slave position, so locate the primary slave by its port id
+		 * instead of using the port id as an array index */
+		for (i = 0; i < internals->slave_count; i++)
+			if (internals->slaves[i].port_id ==
+					internals->primary_port) {
+				primary_idx = i;
+				break;
+			}
+
+		if (primary_idx < 0) {
+			RTE_BOND_LOG(ERR, "Couldn't find primary slave in port list");
+			return -1;
+		}
+
+		/* Set MAC Address of Bonded Device */
+		if (mac_address_set(bonded_eth_dev,
+				&internals->slaves[primary_idx].persisted_mac_addr)
+				!= 0) {
+			RTE_BOND_LOG(ERR, "Failed to set MAC address on bonded device");
+			return -1;
+		}
+		/* Update all slave devices MAC addresses */
+		return mac_address_slaves_update(bonded_eth_dev);
+	}
+	/* No need to update anything as no slaves present */
+	return 0;
+}
+
+int +rte_eth_bond_xmit_policy_set(uint8_t bonded_port_id, uint8_t policy) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + switch (policy) { + 
case BALANCE_XMIT_POLICY_LAYER2: + internals->balance_xmit_policy = policy; + internals->xmit_hash = xmit_l2_hash; + break; + case BALANCE_XMIT_POLICY_LAYER23: + internals->balance_xmit_policy = policy; + internals->xmit_hash = xmit_l23_hash; + break; + case BALANCE_XMIT_POLICY_LAYER34: + internals->balance_xmit_policy = policy; + internals->xmit_hash = xmit_l34_hash; + break; + + default: + return -1; + } + return 0; +} + +int +rte_eth_bond_xmit_policy_get(uint8_t bonded_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + return internals->balance_xmit_policy; +} + +int +rte_eth_bond_link_monitoring_set(uint8_t bonded_port_id, uint32_t internal_ms) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + internals->link_status_polling_interval_ms = internal_ms; + + return 0; +} + +int +rte_eth_bond_link_monitoring_get(uint8_t bonded_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + return internals->link_status_polling_interval_ms; +} + +int +rte_eth_bond_link_down_prop_delay_set(uint8_t bonded_port_id, uint32_t delay_ms) + +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + internals->link_down_delay_ms = delay_ms; + + return 0; +} + +int +rte_eth_bond_link_down_prop_delay_get(uint8_t bonded_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + return internals->link_down_delay_ms; +} + +int +rte_eth_bond_link_up_prop_delay_set(uint8_t 
bonded_port_id, uint32_t delay_ms) + +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + internals->link_up_delay_ms = delay_ms; + + return 0; +} + +int +rte_eth_bond_link_up_prop_delay_get(uint8_t bonded_port_id) +{ + struct bond_dev_private *internals; + + if (valid_bonded_port_id(bonded_port_id) != 0) + return -1; + + internals = rte_eth_devices[bonded_port_id].data->dev_private; + + return internals->link_up_delay_ms; +} diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_args.c b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_args.c new file mode 100644 index 00000000..e3bdad9d --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_args.c @@ -0,0 +1,286 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rte_devargs.h> +#include <rte_kvargs.h> + +#include <cmdline_parse.h> +#include <cmdline_parse_etheraddr.h> + +#include "rte_eth_bond.h" +#include "rte_eth_bond_private.h" + +const char *pmd_bond_init_valid_arguments[] = { + PMD_BOND_SLAVE_PORT_KVARG, + PMD_BOND_PRIMARY_SLAVE_KVARG, + PMD_BOND_MODE_KVARG, + PMD_BOND_XMIT_POLICY_KVARG, + PMD_BOND_SOCKET_ID_KVARG, + PMD_BOND_MAC_ADDR_KVARG, + "driver", + NULL +}; + +static inline int +find_port_id_by_pci_addr(const struct rte_pci_addr *pci_addr) +{ + struct rte_pci_device *pci_dev; + struct rte_pci_addr *eth_pci_addr; + unsigned i; + + for (i = 0; i < rte_eth_dev_count(); i++) { + + /* Currently populated by rte_eth_copy_pci_info(). + * + * TODO: Once the PCI bus has arrived we should have a better + * way to test for being a PCI device or not. 
+ */ + if (rte_eth_devices[i].data->kdrv == RTE_KDRV_UNKNOWN || + rte_eth_devices[i].data->kdrv == RTE_KDRV_NONE) + continue; + + pci_dev = RTE_DEV_TO_PCI(rte_eth_devices[i].device); + eth_pci_addr = &pci_dev->addr; + + if (pci_addr->bus == eth_pci_addr->bus && + pci_addr->devid == eth_pci_addr->devid && + pci_addr->domain == eth_pci_addr->domain && + pci_addr->function == eth_pci_addr->function) + return i; + } + return -1; +} + +static inline int +find_port_id_by_dev_name(const char *name) +{ + unsigned i; + + for (i = 0; i < rte_eth_dev_count(); i++) { + if (rte_eth_devices[i].data == NULL) + continue; + + if (strcmp(rte_eth_devices[i].data->name, name) == 0) + return i; + } + return -1; +}
+
+/**
+ * Parses a port identifier string to a port id by pci address, then by name,
+ * and finally port id.
+ *
+ * Returns the port id, which is always in the range
+ * 0 .. RTE_MAX_ETHPORTS - 1, or -1 if the string cannot be resolved to a
+ * port id in that range.
+ */
+static inline int
+parse_port_id(const char *port_str)
+{
+	struct rte_pci_addr dev_addr;
+	int port_id;
+
+	/* try parsing as pci address, physical devices */
+	if (eal_parse_pci_DomBDF(port_str, &dev_addr) == 0) {
+		port_id = find_port_id_by_pci_addr(&dev_addr);
+		if (port_id < 0)
+			return -1;
+	} else {
+		/* try parsing as device name, virtual devices */
+		port_id = find_port_id_by_dev_name(port_str);
+		if (port_id < 0) {
+			char *end;
+			long val;
+
+			/* try parsing as port id */
+			errno = 0;
+			val = strtol(port_str, &end, 10);
+			/* reject empty input (which strtol reports as 0),
+			 * trailing characters and overflow */
+			if (end == port_str || *end != 0 || errno != 0)
+				return -1;
+
+			/* range-check the long before narrowing to int so a
+			 * huge value cannot wrap into the valid range; -1 is
+			 * caught and logged by the check below */
+			if (val < 0 || val >= RTE_MAX_ETHPORTS)
+				port_id = -1;
+			else
+				port_id = (int)val;
+		}
+	}
+
+	/* RTE_MAX_ETHPORTS itself is one past the last valid port id */
+	if (port_id < 0 || port_id >= RTE_MAX_ETHPORTS) {
+		RTE_BOND_LOG(ERR, "Slave port specified (%s) outside expected range",
+				port_str);
+		return -1;
+	}
+	return port_id;
+}
+
+int +bond_ethdev_parse_slave_port_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + struct bond_ethdev_slave_ports *slave_ports; + + if (value == NULL || extra_args == NULL) + return -1; + + slave_ports = extra_args; + + if (strcmp(key, PMD_BOND_SLAVE_PORT_KVARG) == 0) { + int port_id = parse_port_id(value); + if (port_id < 0) { + RTE_BOND_LOG(ERR, "Invalid slave port value (%s) 
specified", value); + return -1; + } else + slave_ports->slaves[slave_ports->slave_count++] = + (uint8_t)port_id; + } + return 0; +} + +int +bond_ethdev_parse_slave_mode_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + uint8_t *mode; + char *endptr; + + if (value == NULL || extra_args == NULL) + return -1; + + mode = extra_args; + + errno = 0; + *mode = strtol(value, &endptr, 10); + if (*endptr != 0 || errno != 0) + return -1; + + /* validate mode value */ + switch (*mode) { + case BONDING_MODE_ROUND_ROBIN: + case BONDING_MODE_ACTIVE_BACKUP: + case BONDING_MODE_BALANCE: + case BONDING_MODE_BROADCAST: + case BONDING_MODE_8023AD: + case BONDING_MODE_TLB: + case BONDING_MODE_ALB: + return 0; + default: + RTE_BOND_LOG(ERR, "Invalid slave mode value (%s) specified", value); + return -1; + } +} + +int +bond_ethdev_parse_socket_id_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + int socket_id; + char *endptr; + + if (value == NULL || extra_args == NULL) + return -1; + + errno = 0; + socket_id = (uint8_t)strtol(value, &endptr, 10); + if (*endptr != 0 || errno != 0) + return -1; + + /* validate mode value */ + if (socket_id >= 0 && socket_id < number_of_sockets()) { + *(uint8_t *)extra_args = (uint8_t)socket_id; + return 0; + } + return -1; +} + +int +bond_ethdev_parse_primary_slave_port_id_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + int primary_slave_port_id; + + if (value == NULL || extra_args == NULL) + return -1; + + primary_slave_port_id = parse_port_id(value); + if (primary_slave_port_id < 0) + return -1; + + *(uint8_t *)extra_args = (uint8_t)primary_slave_port_id; + + return 0; +} + +int +bond_ethdev_parse_balance_xmit_policy_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + uint8_t *xmit_policy; + + if (value == NULL || extra_args == NULL) + return -1; + + xmit_policy = extra_args; + + if (strcmp(PMD_BOND_XMIT_POLICY_LAYER2_KVARG, value) == 
0) + *xmit_policy = BALANCE_XMIT_POLICY_LAYER2; + else if (strcmp(PMD_BOND_XMIT_POLICY_LAYER23_KVARG, value) == 0) + *xmit_policy = BALANCE_XMIT_POLICY_LAYER23; + else if (strcmp(PMD_BOND_XMIT_POLICY_LAYER34_KVARG, value) == 0) + *xmit_policy = BALANCE_XMIT_POLICY_LAYER34; + else + return -1; + + return 0; +} + +int +bond_ethdev_parse_bond_mac_addr_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + if (value == NULL || extra_args == NULL) + return -1; + + /* Parse MAC */ + return cmdline_parse_etheraddr(NULL, value, extra_args, + sizeof(struct ether_addr)); +} + +int +bond_ethdev_parse_time_ms_kvarg(const char *key __rte_unused, + const char *value, void *extra_args) +{ + uint32_t time_ms; + char *endptr; + + if (value == NULL || extra_args == NULL) + return -1; + + errno = 0; + time_ms = (uint32_t)strtol(value, &endptr, 10); + if (*endptr != 0 || errno != 0) + return -1; + + *(uint32_t *)extra_args = time_ms; + + return 0; +} diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_pmd.c b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_pmd.c new file mode 100644 index 00000000..82959abc --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_pmd.c @@ -0,0 +1,2760 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <stdlib.h> +#include <netinet/in.h> + +#include <rte_mbuf.h> +#include <rte_malloc.h> +#include <rte_ethdev.h> +#include <rte_ethdev_vdev.h> +#include <rte_tcp.h> +#include <rte_udp.h> +#include <rte_ip.h> +#include <rte_ip_frag.h> +#include <rte_devargs.h> +#include <rte_kvargs.h> +#include <rte_vdev.h> +#include <rte_alarm.h> +#include <rte_cycles.h> + +#include "rte_eth_bond.h" +#include "rte_eth_bond_private.h" +#include "rte_eth_bond_8023ad_private.h" + +#define REORDER_PERIOD_MS 10 +#define DEFAULT_POLLING_INTERVAL_10_MS (10) + +#define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port) + +/* Table for statistics in mode 5 TLB */ +static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS]; + +static inline size_t +get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto) +{ + size_t vlan_offset = 0; + + if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + + vlan_offset = sizeof(struct vlan_hdr); + *proto = vlan_hdr->eth_proto; + + if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) { + vlan_hdr = vlan_hdr + 1; + *proto = vlan_hdr->eth_proto; + vlan_offset += sizeof(struct vlan_hdr); + } + } + return vlan_offset; +} + +static uint16_t +bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + + uint16_t num_rx_slave = 0; + uint16_t num_rx_total = 0; + + int i; + + /* Cast to structure, containing bonded device's port id and queue id */ + struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue; + + internals = bd_rx_q->dev_private; + + + for (i = 0; i < internals->active_slave_count && nb_pkts; i++) { + /* Offset of pointer to *bufs increases as packets are received + * from other slaves */ + num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i], + bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts); + if (num_rx_slave) { + num_rx_total += num_rx_slave; + nb_pkts -= num_rx_slave; + } + } + + return num_rx_total; +} + +static 
uint16_t +bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + + /* Cast to structure, containing bonded device's port id and queue id */ + struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue; + + internals = bd_rx_q->dev_private; + + return rte_eth_rx_burst(internals->current_primary_port, + bd_rx_q->queue_id, bufs, nb_pkts); +} + +static inline uint8_t +is_lacp_packets(uint16_t ethertype, uint8_t subtype, uint16_t vlan_tci) +{ + const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW); + + return !vlan_tci && (ethertype == ether_type_slow_be && + (subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP)); +} + +static uint16_t +bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + /* Cast to structure, containing bonded device's port id and queue id */ + struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue; + struct bond_dev_private *internals = bd_rx_q->dev_private; + struct ether_addr bond_mac; + + struct ether_hdr *hdr; + + const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW); + uint16_t num_rx_total = 0; /* Total number of received packets */ + uint8_t slaves[RTE_MAX_ETHPORTS]; + uint8_t slave_count, idx; + + uint8_t collecting; /* current slave collecting status */ + const uint8_t promisc = internals->promiscuous_en; + uint8_t i, j, k; + uint8_t subtype; + + rte_eth_macaddr_get(internals->port_id, &bond_mac); + /* Copy slave list to protect against slave up/down changes during tx + * bursting */ + slave_count = internals->active_slave_count; + memcpy(slaves, internals->active_slaves, + sizeof(internals->active_slaves[0]) * slave_count); + + idx = internals->active_slave; + if (idx >= slave_count) { + internals->active_slave = 0; + idx = 0; + } + for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) { + j = num_rx_total; + collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]], + 
COLLECTING); + + /* Read packets from this slave */ + num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id, + &bufs[num_rx_total], nb_pkts - num_rx_total); + + for (k = j; k < 2 && k < num_rx_total; k++) + rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *)); + + /* Handle slow protocol packets. */ + while (j < num_rx_total) { + if (j + 3 < num_rx_total) + rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *)); + + hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *); + subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype; + + /* Remove packet from array if it is slow packet or slave is not + * in collecting state or bondign interface is not in promiscus + * mode and packet address does not match. */ + if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]->vlan_tci) || + !collecting || (!promisc && + !is_multicast_ether_addr(&hdr->d_addr) && + !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) { + + if (hdr->ether_type == ether_type_slow_be) { + bond_mode_8023ad_handle_slow_pkt( + internals, slaves[idx], bufs[j]); + } else + rte_pktmbuf_free(bufs[j]); + + /* Packet is managed by mode 4 or dropped, shift the array */ + num_rx_total--; + if (j < num_rx_total) { + memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) * + (num_rx_total - j)); + } + } else + j++; + } + if (unlikely(++idx == slave_count)) + idx = 0; + } + + internals->active_slave = idx; + return num_rx_total; +} + +#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1) +uint32_t burstnumberRX; +uint32_t burstnumberTX; + +#ifdef RTE_LIBRTE_BOND_DEBUG_ALB + +static void +arp_op_name(uint16_t arp_op, char *buf) +{ + switch (arp_op) { + case ARP_OP_REQUEST: + snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request"); + return; + case ARP_OP_REPLY: + snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply"); + return; + case ARP_OP_REVREQUEST: + snprintf(buf, sizeof("Reverse ARP Request"), "%s", + "Reverse ARP Request"); + return; + case ARP_OP_REVREPLY: + 
snprintf(buf, sizeof("Reverse ARP Reply"), "%s", + "Reverse ARP Reply"); + return; + case ARP_OP_INVREQUEST: + snprintf(buf, sizeof("Peer Identify Request"), "%s", + "Peer Identify Request"); + return; + case ARP_OP_INVREPLY: + snprintf(buf, sizeof("Peer Identify Reply"), "%s", + "Peer Identify Reply"); + return; + default: + break; + } + snprintf(buf, sizeof("Unknown"), "%s", "Unknown"); + return; +} +#endif +#define MaxIPv4String 16 +static void +ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size) +{ + uint32_t ipv4_addr; + + ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr); + snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF, + (ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF, + ipv4_addr & 0xFF); +} + +#define MAX_CLIENTS_NUMBER 128 +uint8_t active_clients; +struct client_stats_t { + uint8_t port; + uint32_t ipv4_addr; + uint32_t ipv4_rx_packets; + uint32_t ipv4_tx_packets; +}; +struct client_stats_t client_stats[MAX_CLIENTS_NUMBER]; + +static void +update_client_stats(uint32_t addr, uint8_t port, uint32_t *TXorRXindicator) +{ + int i = 0; + + for (; i < MAX_CLIENTS_NUMBER; i++) { + if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port)) { + /* Just update RX packets number for this client */ + if (TXorRXindicator == &burstnumberRX) + client_stats[i].ipv4_rx_packets++; + else + client_stats[i].ipv4_tx_packets++; + return; + } + } + /* We have a new client. 
Insert him to the table, and increment stats */ + if (TXorRXindicator == &burstnumberRX) + client_stats[active_clients].ipv4_rx_packets++; + else + client_stats[active_clients].ipv4_tx_packets++; + client_stats[active_clients].ipv4_addr = addr; + client_stats[active_clients].port = port; + active_clients++; + +} + +#ifdef RTE_LIBRTE_BOND_DEBUG_ALB +#define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber) \ + RTE_LOG(DEBUG, PMD, \ + "%s " \ + "port:%d " \ + "SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \ + "SrcIP:%s " \ + "DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \ + "DstIP:%s " \ + "%s " \ + "%d\n", \ + info, \ + port, \ + eth_h->s_addr.addr_bytes[0], \ + eth_h->s_addr.addr_bytes[1], \ + eth_h->s_addr.addr_bytes[2], \ + eth_h->s_addr.addr_bytes[3], \ + eth_h->s_addr.addr_bytes[4], \ + eth_h->s_addr.addr_bytes[5], \ + src_ip, \ + eth_h->d_addr.addr_bytes[0], \ + eth_h->d_addr.addr_bytes[1], \ + eth_h->d_addr.addr_bytes[2], \ + eth_h->d_addr.addr_bytes[3], \ + eth_h->d_addr.addr_bytes[4], \ + eth_h->d_addr.addr_bytes[5], \ + dst_ip, \ + arp_op, \ + ++burstnumber) +#endif + +static void +mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h, + uint8_t port, uint32_t __attribute__((unused)) *burstnumber) +{ + struct ipv4_hdr *ipv4_h; +#ifdef RTE_LIBRTE_BOND_DEBUG_ALB + struct arp_hdr *arp_h; + char dst_ip[16]; + char ArpOp[24]; + char buf[16]; +#endif + char src_ip[16]; + + uint16_t ether_type = eth_h->ether_type; + uint16_t offset = get_vlan_offset(eth_h, ðer_type); + +#ifdef RTE_LIBRTE_BOND_DEBUG_ALB + snprintf(buf, 16, "%s", info); +#endif + + if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { + ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset); + ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String); +#ifdef RTE_LIBRTE_BOND_DEBUG_ALB + ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String); + MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber); +#endif + update_client_stats(ipv4_h->src_addr, port, 
burstnumber); + } +#ifdef RTE_LIBRTE_BOND_DEBUG_ALB + else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) { + arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset); + ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String); + ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String); + arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp); + MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber); + } +#endif +} +#endif + +static uint16_t +bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue; + struct bond_dev_private *internals = bd_tx_q->dev_private; + struct ether_hdr *eth_h; + uint16_t ether_type, offset; + uint16_t nb_recv_pkts; + int i; + + nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts); + + for (i = 0; i < nb_recv_pkts; i++) { + eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *); + ether_type = eth_h->ether_type; + offset = get_vlan_offset(eth_h, ðer_type); + + if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) { +#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1) + mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX); +#endif + bond_mode_alb_arp_recv(eth_h, offset, internals); + } +#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1) + else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) + mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX); +#endif + } + + return nb_recv_pkts; +} + +static uint16_t +bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + struct bond_tx_queue *bd_tx_q; + + struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts]; + uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 }; + + uint8_t num_of_slaves; + uint8_t slaves[RTE_MAX_ETHPORTS]; + + uint16_t num_tx_total = 0, num_tx_slave; + + static int slave_idx = 0; + int i, cslave_idx = 0, 
tx_fail_total = 0; + + bd_tx_q = (struct bond_tx_queue *)queue; + internals = bd_tx_q->dev_private; + + /* Copy slave list to protect against slave up/down changes during tx + * bursting */ + num_of_slaves = internals->active_slave_count; + memcpy(slaves, internals->active_slaves, + sizeof(internals->active_slaves[0]) * num_of_slaves); + + if (num_of_slaves < 1) + return num_tx_total; + + /* Populate slaves mbuf with which packets are to be sent on it */ + for (i = 0; i < nb_pkts; i++) { + cslave_idx = (slave_idx + i) % num_of_slaves; + slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i]; + } + + /* increment current slave index so the next call to tx burst starts on the + * next slave */ + slave_idx = ++cslave_idx; + + /* Send packet burst on each slave device */ + for (i = 0; i < num_of_slaves; i++) { + if (slave_nb_pkts[i] > 0) { + num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id, + slave_bufs[i], slave_nb_pkts[i]); + + /* if tx burst fails move packets to end of bufs */ + if (unlikely(num_tx_slave < slave_nb_pkts[i])) { + int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave; + + tx_fail_total += tx_fail_slave; + + memcpy(&bufs[nb_pkts - tx_fail_total], + &slave_bufs[i][num_tx_slave], + tx_fail_slave * sizeof(bufs[0])); + } + num_tx_total += num_tx_slave; + } + } + + return num_tx_total; +} + +static uint16_t +bond_ethdev_tx_burst_active_backup(void *queue, + struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + struct bond_tx_queue *bd_tx_q; + + bd_tx_q = (struct bond_tx_queue *)queue; + internals = bd_tx_q->dev_private; + + if (internals->active_slave_count < 1) + return 0; + + return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id, + bufs, nb_pkts); +} + +static inline uint16_t +ether_hash(struct ether_hdr *eth_hdr) +{ + unaligned_uint16_t *word_src_addr = + (unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes; + unaligned_uint16_t *word_dst_addr = + (unaligned_uint16_t 
*)eth_hdr->d_addr.addr_bytes; + + return (word_src_addr[0] ^ word_dst_addr[0]) ^ + (word_src_addr[1] ^ word_dst_addr[1]) ^ + (word_src_addr[2] ^ word_dst_addr[2]); +} + +static inline uint32_t +ipv4_hash(struct ipv4_hdr *ipv4_hdr) +{ + return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr; +} + +static inline uint32_t +ipv6_hash(struct ipv6_hdr *ipv6_hdr) +{ + unaligned_uint32_t *word_src_addr = + (unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]); + unaligned_uint32_t *word_dst_addr = + (unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]); + + return (word_src_addr[0] ^ word_dst_addr[0]) ^ + (word_src_addr[1] ^ word_dst_addr[1]) ^ + (word_src_addr[2] ^ word_dst_addr[2]) ^ + (word_src_addr[3] ^ word_dst_addr[3]); +} + +uint16_t +xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count) +{ + struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *); + + uint32_t hash = ether_hash(eth_hdr); + + return (hash ^= hash >> 8) % slave_count; +} + +uint16_t +xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count) +{ + struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *); + uint16_t proto = eth_hdr->ether_type; + size_t vlan_offset = get_vlan_offset(eth_hdr, &proto); + uint32_t hash, l3hash = 0; + + hash = ether_hash(eth_hdr); + + if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) { + struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *) + ((char *)(eth_hdr + 1) + vlan_offset); + l3hash = ipv4_hash(ipv4_hdr); + + } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) { + struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *) + ((char *)(eth_hdr + 1) + vlan_offset); + l3hash = ipv6_hash(ipv6_hdr); + } + + hash = hash ^ l3hash; + hash ^= hash >> 16; + hash ^= hash >> 8; + + return hash % slave_count; +} + +uint16_t +xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count) +{ + struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *); + uint16_t proto = eth_hdr->ether_type; + size_t vlan_offset = get_vlan_offset(eth_hdr, &proto); + + struct udp_hdr 
*udp_hdr = NULL; + struct tcp_hdr *tcp_hdr = NULL; + uint32_t hash, l3hash = 0, l4hash = 0; + + if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) { + struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *) + ((char *)(eth_hdr + 1) + vlan_offset); + size_t ip_hdr_offset; + + l3hash = ipv4_hash(ipv4_hdr); + + /* there is no L4 header in fragmented packet */ + if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) { + ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) * + IPV4_IHL_MULTIPLIER; + + if (ipv4_hdr->next_proto_id == IPPROTO_TCP) { + tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + + ip_hdr_offset); + l4hash = HASH_L4_PORTS(tcp_hdr); + } else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) { + udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr + + ip_hdr_offset); + l4hash = HASH_L4_PORTS(udp_hdr); + } + } + } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) { + struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *) + ((char *)(eth_hdr + 1) + vlan_offset); + l3hash = ipv6_hash(ipv6_hdr); + + if (ipv6_hdr->proto == IPPROTO_TCP) { + tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1); + l4hash = HASH_L4_PORTS(tcp_hdr); + } else if (ipv6_hdr->proto == IPPROTO_UDP) { + udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1); + l4hash = HASH_L4_PORTS(udp_hdr); + } + } + + hash = l3hash ^ l4hash; + hash ^= hash >> 16; + hash ^= hash >> 8; + + return hash % slave_count; +} + +struct bwg_slave { + uint64_t bwg_left_int; + uint64_t bwg_left_remainder; + uint8_t slave; +}; + +void +bond_tlb_activate_slave(struct bond_dev_private *internals) { + int i; + + for (i = 0; i < internals->active_slave_count; i++) { + tlb_last_obytets[internals->active_slaves[i]] = 0; + } +} + +static int +bandwidth_cmp(const void *a, const void *b) +{ + const struct bwg_slave *bwg_a = a; + const struct bwg_slave *bwg_b = b; + int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int; + int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder - + (int64_t)bwg_a->bwg_left_remainder; + if (diff > 0) + return 1; + 
else if (diff < 0) + return -1; + else if (diff2 > 0) + return 1; + else if (diff2 < 0) + return -1; + else + return 0; +} + +static void +bandwidth_left(uint8_t port_id, uint64_t load, uint8_t update_idx, + struct bwg_slave *bwg_slave) +{ + struct rte_eth_link link_status; + + rte_eth_link_get(port_id, &link_status); + uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8; + if (link_bwg == 0) + return; + link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS; + bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg; + bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg; +} + +static void +bond_ethdev_update_tlb_slave_cb(void *arg) +{ + struct bond_dev_private *internals = arg; + struct rte_eth_stats slave_stats; + struct bwg_slave bwg_array[RTE_MAX_ETHPORTS]; + uint8_t slave_count; + uint64_t tx_bytes; + + uint8_t update_stats = 0; + uint8_t i, slave_id; + + internals->slave_update_idx++; + + + if (internals->slave_update_idx >= REORDER_PERIOD_MS) + update_stats = 1; + + for (i = 0; i < internals->active_slave_count; i++) { + slave_id = internals->active_slaves[i]; + rte_eth_stats_get(slave_id, &slave_stats); + tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id]; + bandwidth_left(slave_id, tx_bytes, + internals->slave_update_idx, &bwg_array[i]); + bwg_array[i].slave = slave_id; + + if (update_stats) { + tlb_last_obytets[slave_id] = slave_stats.obytes; + } + } + + if (update_stats == 1) + internals->slave_update_idx = 0; + + slave_count = i; + qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp); + for (i = 0; i < slave_count; i++) + internals->tlb_slaves_order[i] = bwg_array[i].slave; + + rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb, + (struct bond_dev_private *)internals); +} + +static uint16_t +bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue; + struct bond_dev_private *internals = 
bd_tx_q->dev_private; + + struct rte_eth_dev *primary_port = + &rte_eth_devices[internals->primary_port]; + uint16_t num_tx_total = 0; + uint8_t i, j; + + uint8_t num_of_slaves = internals->active_slave_count; + uint8_t slaves[RTE_MAX_ETHPORTS]; + + struct ether_hdr *ether_hdr; + struct ether_addr primary_slave_addr; + struct ether_addr active_slave_addr; + + if (num_of_slaves < 1) + return num_tx_total; + + memcpy(slaves, internals->tlb_slaves_order, + sizeof(internals->tlb_slaves_order[0]) * num_of_slaves); + + + ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr); + + if (nb_pkts > 3) { + for (i = 0; i < 3; i++) + rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*)); + } + + for (i = 0; i < num_of_slaves; i++) { + rte_eth_macaddr_get(slaves[i], &active_slave_addr); + for (j = num_tx_total; j < nb_pkts; j++) { + if (j + 3 < nb_pkts) + rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*)); + + ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *); + if (is_same_ether_addr(ðer_hdr->s_addr, &primary_slave_addr)) + ether_addr_copy(&active_slave_addr, ðer_hdr->s_addr); +#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1) + mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX); +#endif + } + + num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id, + bufs + num_tx_total, nb_pkts - num_tx_total); + + if (num_tx_total == nb_pkts) + break; + } + + return num_tx_total; +} + +void +bond_tlb_disable(struct bond_dev_private *internals) +{ + rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals); +} + +void +bond_tlb_enable(struct bond_dev_private *internals) +{ + bond_ethdev_update_tlb_slave_cb(internals); +} + +static uint16_t +bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ + struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue; + struct bond_dev_private *internals = bd_tx_q->dev_private; + + struct ether_hdr *eth_h; + uint16_t ether_type, offset; + + struct 
client_data *client_info; + + /* + * We create transmit buffers for every slave and one additional to send + * through tlb. In worst case every packet will be send on one port. + */ + struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts]; + uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 }; + + /* + * We create separate transmit buffers for update packets as they wont be + * counted in num_tx_total. + */ + struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE]; + uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 }; + + struct rte_mbuf *upd_pkt; + size_t pkt_size; + + uint16_t num_send, num_not_send = 0; + uint16_t num_tx_total = 0; + uint8_t slave_idx; + + int i, j; + + /* Search tx buffer for ARP packets and forward them to alb */ + for (i = 0; i < nb_pkts; i++) { + eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *); + ether_type = eth_h->ether_type; + offset = get_vlan_offset(eth_h, ðer_type); + + if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) { + slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals); + + /* Change src mac in eth header */ + rte_eth_macaddr_get(slave_idx, ð_h->s_addr); + + /* Add packet to slave tx buffer */ + slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i]; + slave_bufs_pkts[slave_idx]++; + } else { + /* If packet is not ARP, send it with TLB policy */ + slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] = + bufs[i]; + slave_bufs_pkts[RTE_MAX_ETHPORTS]++; + } + } + + /* Update connected client ARP tables */ + if (internals->mode6.ntt) { + for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) { + client_info = &internals->mode6.client_table[i]; + + if (client_info->in_use) { + /* Allocate new packet to send ARP update on current slave */ + upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool); + if (upd_pkt == NULL) { + RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n"); + continue; + } + pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr) + + client_info->vlan_count * 
sizeof(struct vlan_hdr); + upd_pkt->data_len = pkt_size; + upd_pkt->pkt_len = pkt_size; + + slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt, + internals); + + /* Add packet to update tx buffer */ + update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt; + update_bufs_pkts[slave_idx]++; + } + } + internals->mode6.ntt = 0; + } + + /* Send ARP packets on proper slaves */ + for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + if (slave_bufs_pkts[i] > 0) { + num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, + slave_bufs[i], slave_bufs_pkts[i]); + for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) { + bufs[nb_pkts - 1 - num_not_send - j] = + slave_bufs[i][nb_pkts - 1 - j]; + } + + num_tx_total += num_send; + num_not_send += slave_bufs_pkts[i] - num_send; + +#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1) + /* Print TX stats including update packets */ + for (j = 0; j < slave_bufs_pkts[i]; j++) { + eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *); + mode6_debug("TX ARP:", eth_h, i, &burstnumberTX); + } +#endif + } + } + + /* Send update packets on proper slaves */ + for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + if (update_bufs_pkts[i] > 0) { + num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i], + update_bufs_pkts[i]); + for (j = num_send; j < update_bufs_pkts[i]; j++) { + rte_pktmbuf_free(update_bufs[i][j]); + } +#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1) + for (j = 0; j < update_bufs_pkts[i]; j++) { + eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *); + mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX); + } +#endif + } + } + + /* Send non-ARP packets using tlb policy */ + if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) { + num_send = bond_ethdev_tx_burst_tlb(queue, + slave_bufs[RTE_MAX_ETHPORTS], + slave_bufs_pkts[RTE_MAX_ETHPORTS]); + + for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS]; j++) { + bufs[nb_pkts - 1 - num_not_send - j] = + 
slave_bufs[RTE_MAX_ETHPORTS][nb_pkts - 1 - j]; + } + + num_tx_total += num_send; + } + + return num_tx_total; +} + +static uint16_t +bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + struct bond_tx_queue *bd_tx_q; + + uint8_t num_of_slaves; + uint8_t slaves[RTE_MAX_ETHPORTS]; + + uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0; + + int i, op_slave_id; + + struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts]; + uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 }; + + bd_tx_q = (struct bond_tx_queue *)queue; + internals = bd_tx_q->dev_private; + + /* Copy slave list to protect against slave up/down changes during tx + * bursting */ + num_of_slaves = internals->active_slave_count; + memcpy(slaves, internals->active_slaves, + sizeof(internals->active_slaves[0]) * num_of_slaves); + + if (num_of_slaves < 1) + return num_tx_total; + + /* Populate slaves mbuf with the packets which are to be sent on it */ + for (i = 0; i < nb_pkts; i++) { + /* Select output slave using hash based on xmit policy */ + op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves); + + /* Populate slave mbuf arrays with mbufs for that slave */ + slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i]; + } + + /* Send packet burst on each slave device */ + for (i = 0; i < num_of_slaves; i++) { + if (slave_nb_pkts[i] > 0) { + num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id, + slave_bufs[i], slave_nb_pkts[i]); + + /* if tx burst fails move packets to end of bufs */ + if (unlikely(num_tx_slave < slave_nb_pkts[i])) { + int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave; + + tx_fail_total += slave_tx_fail_count; + memcpy(&bufs[nb_pkts - tx_fail_total], + &slave_bufs[i][num_tx_slave], + slave_tx_fail_count * sizeof(bufs[0])); + } + + num_tx_total += num_tx_slave; + } + } + + return num_tx_total; +} + +static uint16_t +bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs, + 
uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + struct bond_tx_queue *bd_tx_q; + + uint8_t num_of_slaves; + uint8_t slaves[RTE_MAX_ETHPORTS]; + /* positions in slaves, not ID */ + uint8_t distributing_offsets[RTE_MAX_ETHPORTS]; + uint8_t distributing_count; + + uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0; + uint16_t i, j, op_slave_idx; + const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1; + + /* Allocate additional packets in case 8023AD mode. */ + struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size]; + void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS] = { NULL }; + + /* Total amount of packets in slave_bufs */ + uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 }; + /* Slow packets placed in each slave */ + uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 }; + + bd_tx_q = (struct bond_tx_queue *)queue; + internals = bd_tx_q->dev_private; + + /* Copy slave list to protect against slave up/down changes during tx + * bursting */ + num_of_slaves = internals->active_slave_count; + if (num_of_slaves < 1) + return num_tx_total; + + memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves); + + distributing_count = 0; + for (i = 0; i < num_of_slaves; i++) { + struct port *port = &mode_8023ad_ports[slaves[i]]; + + slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring, + slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS, + NULL); + slave_nb_pkts[i] = slave_slow_nb_pkts[i]; + + for (j = 0; j < slave_slow_nb_pkts[i]; j++) + slave_bufs[i][j] = slow_pkts[j]; + + if (ACTOR_STATE(port, DISTRIBUTING)) + distributing_offsets[distributing_count++] = i; + } + + if (likely(distributing_count > 0)) { + /* Populate slaves mbuf with the packets which are to be sent on it */ + for (i = 0; i < nb_pkts; i++) { + /* Select output slave using hash based on xmit policy */ + op_slave_idx = internals->xmit_hash(bufs[i], distributing_count); + + /* Populate slave mbuf arrays with mbufs for that slave. 
Use only + * slaves that are currently distributing. */ + uint8_t slave_offset = distributing_offsets[op_slave_idx]; + slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i]; + slave_nb_pkts[slave_offset]++; + } + } + + /* Send packet burst on each slave device */ + for (i = 0; i < num_of_slaves; i++) { + if (slave_nb_pkts[i] == 0) + continue; + + num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id, + slave_bufs[i], slave_nb_pkts[i]); + + /* If tx burst fails drop slow packets */ + for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++) + rte_pktmbuf_free(slave_bufs[i][num_tx_slave]); + + num_tx_total += num_tx_slave - slave_slow_nb_pkts[i]; + num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave; + + /* If tx burst fails move packets to end of bufs */ + if (unlikely(num_tx_slave < slave_nb_pkts[i])) { + uint16_t j = nb_pkts - num_tx_fail_total; + for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++) + bufs[j] = slave_bufs[i][num_tx_slave]; + } + } + + return num_tx_total; +} + +static uint16_t +bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + struct bond_tx_queue *bd_tx_q; + + uint8_t tx_failed_flag = 0, num_of_slaves; + uint8_t slaves[RTE_MAX_ETHPORTS]; + + uint16_t max_nb_of_tx_pkts = 0; + + int slave_tx_total[RTE_MAX_ETHPORTS]; + int i, most_successful_tx_slave = -1; + + bd_tx_q = (struct bond_tx_queue *)queue; + internals = bd_tx_q->dev_private; + + /* Copy slave list to protect against slave up/down changes during tx + * bursting */ + num_of_slaves = internals->active_slave_count; + memcpy(slaves, internals->active_slaves, + sizeof(internals->active_slaves[0]) * num_of_slaves); + + if (num_of_slaves < 1) + return 0; + + /* Increment reference count on mbufs */ + for (i = 0; i < nb_pkts; i++) + rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1); + + /* Transmit burst on each active slave */ + for (i = 0; i < num_of_slaves; i++) { + slave_tx_total[i] 
= rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id, + bufs, nb_pkts); + + if (unlikely(slave_tx_total[i] < nb_pkts)) + tx_failed_flag = 1; + + /* record the value and slave index for the slave which transmits the + * maximum number of packets */ + if (slave_tx_total[i] > max_nb_of_tx_pkts) { + max_nb_of_tx_pkts = slave_tx_total[i]; + most_successful_tx_slave = i; + } + } + + /* if slaves fail to transmit packets from burst, the calling application + * is not expected to know about multiple references to packets so we must + * handle failures of all packets except those of the most successful slave + */ + if (unlikely(tx_failed_flag)) + for (i = 0; i < num_of_slaves; i++) + if (i != most_successful_tx_slave) + while (slave_tx_total[i] < nb_pkts) + rte_pktmbuf_free(bufs[slave_tx_total[i]++]); + + return max_nb_of_tx_pkts; +} + +void +link_properties_set(struct rte_eth_dev *bonded_eth_dev, + struct rte_eth_link *slave_dev_link) +{ + struct rte_eth_link *bonded_dev_link = &bonded_eth_dev->data->dev_link; + struct bond_dev_private *internals = bonded_eth_dev->data->dev_private; + + if (slave_dev_link->link_status && + bonded_eth_dev->data->dev_started) { + bonded_dev_link->link_duplex = slave_dev_link->link_duplex; + bonded_dev_link->link_speed = slave_dev_link->link_speed; + + internals->link_props_set = 1; + } +} + +void +link_properties_reset(struct rte_eth_dev *bonded_eth_dev) +{ + struct bond_dev_private *internals = bonded_eth_dev->data->dev_private; + + memset(&(bonded_eth_dev->data->dev_link), 0, + sizeof(bonded_eth_dev->data->dev_link)); + + internals->link_props_set = 0; +} + +int +link_properties_valid(struct rte_eth_link *bonded_dev_link, + struct rte_eth_link *slave_dev_link) +{ + if (bonded_dev_link->link_duplex != slave_dev_link->link_duplex || + bonded_dev_link->link_speed != slave_dev_link->link_speed) + return -1; + + return 0; +} + +int +mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr) +{ + struct ether_addr *mac_addr; + + if 
(eth_dev == NULL) { + RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__); + return -1; + } + + if (dst_mac_addr == NULL) { + RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__); + return -1; + } + + mac_addr = eth_dev->data->mac_addrs; + + ether_addr_copy(mac_addr, dst_mac_addr); + return 0; +} + +int +mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr) +{ + struct ether_addr *mac_addr; + + if (eth_dev == NULL) { + RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified"); + return -1; + } + + if (new_mac_addr == NULL) { + RTE_BOND_LOG(ERR, "NULL pointer MAC specified"); + return -1; + } + + mac_addr = eth_dev->data->mac_addrs; + + /* If new MAC is different to current MAC then update */ + if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0) + memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr)); + + return 0; +} + +int +mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev) +{ + struct bond_dev_private *internals = bonded_eth_dev->data->dev_private; + int i; + + /* Update slave devices MAC addresses */ + if (internals->slave_count < 1) + return -1; + + switch (internals->mode) { + case BONDING_MODE_ROUND_ROBIN: + case BONDING_MODE_BALANCE: + case BONDING_MODE_BROADCAST: + for (i = 0; i < internals->slave_count; i++) { + if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id], + bonded_eth_dev->data->mac_addrs)) { + RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address", + internals->slaves[i].port_id); + return -1; + } + } + break; + case BONDING_MODE_8023AD: + bond_mode_8023ad_mac_address_update(bonded_eth_dev); + break; + case BONDING_MODE_ACTIVE_BACKUP: + case BONDING_MODE_TLB: + case BONDING_MODE_ALB: + default: + for (i = 0; i < internals->slave_count; i++) { + if (internals->slaves[i].port_id == + internals->current_primary_port) { + if (mac_address_set(&rte_eth_devices[internals->primary_port], + bonded_eth_dev->data->mac_addrs)) { + RTE_BOND_LOG(ERR, "Failed to update port Id %d 
MAC address", + internals->current_primary_port); + return -1; + } + } else { + if (mac_address_set( + &rte_eth_devices[internals->slaves[i].port_id], + &internals->slaves[i].persisted_mac_addr)) { + RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address", + internals->slaves[i].port_id); + return -1; + } + } + } + } + + return 0; +} + +int +bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode) +{ + struct bond_dev_private *internals; + + internals = eth_dev->data->dev_private; + + switch (mode) { + case BONDING_MODE_ROUND_ROBIN: + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin; + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst; + break; + case BONDING_MODE_ACTIVE_BACKUP: + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup; + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup; + break; + case BONDING_MODE_BALANCE: + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance; + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst; + break; + case BONDING_MODE_BROADCAST: + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast; + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst; + break; + case BONDING_MODE_8023AD: + if (bond_mode_8023ad_enable(eth_dev) != 0) + return -1; + + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad; + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad; + RTE_LOG(WARNING, PMD, + "Using mode 4, it is necessary to do TX burst and RX burst " + "at least every 100ms.\n"); + break; + case BONDING_MODE_TLB: + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb; + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup; + break; + case BONDING_MODE_ALB: + if (bond_mode_alb_enable(eth_dev) != 0) + return -1; + + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb; + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb; + break; + default: + return -1; + } + + internals->mode = mode; + + return 0; +} + +int +slave_configure(struct rte_eth_dev *bonded_eth_dev, + struct rte_eth_dev *slave_eth_dev) +{ + struct bond_rx_queue *bd_rx_q; + 
struct bond_tx_queue *bd_tx_q; + + int errval; + uint16_t q_id; + + /* Stop slave */ + rte_eth_dev_stop(slave_eth_dev->data->port_id); + + /* Enable interrupts on slave device if supported */ + if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) + slave_eth_dev->data->dev_conf.intr_conf.lsc = 1; + + /* If RSS is enabled for bonding, try to enable it for slaves */ + if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) { + if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len + != 0) { + slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = + bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len; + slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = + bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key; + } else { + slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL; + } + + slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = + bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; + slave_eth_dev->data->dev_conf.rxmode.mq_mode = + bonded_eth_dev->data->dev_conf.rxmode.mq_mode; + } + + slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter = + bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter; + + /* Configure device */ + errval = rte_eth_dev_configure(slave_eth_dev->data->port_id, + bonded_eth_dev->data->nb_rx_queues, + bonded_eth_dev->data->nb_tx_queues, + &(slave_eth_dev->data->dev_conf)); + if (errval != 0) { + RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u , err (%d)", + slave_eth_dev->data->port_id, errval); + return errval; + } + + /* Setup Rx Queues */ + for (q_id = 0; q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) { + bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id]; + + errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id, + bd_rx_q->nb_rx_desc, + rte_eth_dev_socket_id(slave_eth_dev->data->port_id), + &(bd_rx_q->rx_conf), bd_rx_q->mb_pool); + if (errval != 0) { + RTE_BOND_LOG(ERR, + 
"rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)", + slave_eth_dev->data->port_id, q_id, errval); + return errval; + } + } + + /* Setup Tx Queues */ + for (q_id = 0; q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) { + bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id]; + + errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id, + bd_tx_q->nb_tx_desc, + rte_eth_dev_socket_id(slave_eth_dev->data->port_id), + &bd_tx_q->tx_conf); + if (errval != 0) { + RTE_BOND_LOG(ERR, + "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)", + slave_eth_dev->data->port_id, q_id, errval); + return errval; + } + } + + /* Start device */ + errval = rte_eth_dev_start(slave_eth_dev->data->port_id); + if (errval != 0) { + RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)", + slave_eth_dev->data->port_id, errval); + return -1; + } + + /* If RSS is enabled for bonding, synchronize RETA */ + if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) { + int i; + struct bond_dev_private *internals; + + internals = bonded_eth_dev->data->dev_private; + + for (i = 0; i < internals->slave_count; i++) { + if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) { + errval = rte_eth_dev_rss_reta_update( + slave_eth_dev->data->port_id, + &internals->reta_conf[0], + internals->slaves[i].reta_size); + if (errval != 0) { + RTE_LOG(WARNING, PMD, + "rte_eth_dev_rss_reta_update on slave port %d fails (err %d)." 
+ " RSS Configuration for bonding may be inconsistent.\n", + slave_eth_dev->data->port_id, errval); + } + break; + } + } + } + + /* If lsc interrupt is set, check initial slave's link status */ + if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) { + slave_eth_dev->dev_ops->link_update(slave_eth_dev, 0); + bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id, + RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id); + } + + return 0; +} +/** + * Remove slave_eth_dev from internals->slaves[]. + * + * The entries after the matching one are shifted down with memmove(), + * slave_count is decremented and the slave ethdev is reset so that it is + * reconfigured before being used again. + */ +void +slave_remove(struct bond_dev_private *internals, + struct rte_eth_dev *slave_eth_dev) +{ + uint8_t i; + + for (i = 0; i < internals->slave_count; i++) + if (internals->slaves[i].port_id == + slave_eth_dev->data->port_id) + break; + + if (i < (internals->slave_count - 1)) + memmove(&internals->slaves[i], &internals->slaves[i + 1], + sizeof(internals->slaves[0]) * + (internals->slave_count - i - 1)); + + internals->slave_count--; /* NOTE(review): also runs when the port was not found above - presumably callers verify membership first; confirm */ + + /* force reconfiguration of slave interfaces */ + _rte_eth_dev_reset(slave_eth_dev); +} +/* Forward declaration: the monitor is defined further down in this file. */ +static void +bond_ethdev_slave_link_status_change_monitor(void *cb_arg); +/** + * Fill in the slaves[] entry at index slave_count for slave_eth_dev. + * + * Records the port id, clears the link-status bookkeeping, flags the slave + * for link polling when it lacks LSC interrupt support, and saves a copy of + * the slave's current MAC address. slave_count is not modified here. + */ +void +slave_add(struct bond_dev_private *internals, + struct rte_eth_dev *slave_eth_dev) +{ + struct bond_slave_details *slave_details = + &internals->slaves[internals->slave_count]; + + slave_details->port_id = slave_eth_dev->data->port_id; + slave_details->last_link_status = 0; + + /* Mark slave devices that don't support interrupts so we can + * compensate when we start the bond + */ + if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) { + slave_details->link_status_poll_enabled = 1; + } + + slave_details->link_status_wait_to_complete = 0; + /* save the slave's own MAC address as it is when the port is added */ + memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs, + sizeof(struct ether_addr)); +} +/** + * Make slave_port_id the current primary port. + * + * With no active slaves the port is accepted unconditionally; otherwise it + * is only accepted if it appears in the active slave list. + */ +void +bond_ethdev_primary_set(struct bond_dev_private *internals, + uint8_t slave_port_id) +{ + int i; + + if (internals->active_slave_count < 1) + internals->current_primary_port = 
slave_port_id; + else + /* Search bonded device slave ports for new proposed primary port */ + for (i = 0; i < internals->active_slave_count; i++) { + if (internals->active_slaves[i] == slave_port_id) + internals->current_primary_port = slave_port_id; + } +} + +static void +bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev); + +static int +bond_ethdev_start(struct rte_eth_dev *eth_dev) +{ + struct bond_dev_private *internals; + int i; + + /* slave eth dev will be started by bonded device */ + if (check_for_bonded_ethdev(eth_dev)) { + RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)", + eth_dev->data->port_id); + return -1; + } + + eth_dev->data->dev_link.link_status = ETH_LINK_DOWN; + eth_dev->data->dev_started = 1; + + internals = eth_dev->data->dev_private; + + if (internals->slave_count == 0) { + RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices"); + return -1; + } + + if (internals->user_defined_mac == 0) { + struct ether_addr *new_mac_addr = NULL; + + for (i = 0; i < internals->slave_count; i++) + if (internals->slaves[i].port_id == internals->primary_port) + new_mac_addr = &internals->slaves[i].persisted_mac_addr; + + if (new_mac_addr == NULL) + return -1; + + if (mac_address_set(eth_dev, new_mac_addr) != 0) { + RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address", + eth_dev->data->port_id); + return -1; + } + } + + /* Update all slave devices MACs*/ + if (mac_address_slaves_update(eth_dev) != 0) + return -1; + + /* If bonded device is configure in promiscuous mode then re-apply config */ + if (internals->promiscuous_en) + bond_ethdev_promiscuous_enable(eth_dev); + + /* Reconfigure each slave device if starting bonded device */ + for (i = 0; i < internals->slave_count; i++) { + if (slave_configure(eth_dev, + &(rte_eth_devices[internals->slaves[i].port_id])) != 0) { + RTE_BOND_LOG(ERR, + "bonded port (%d) failed to reconfigure slave device (%d)", + eth_dev->data->port_id, 
internals->slaves[i].port_id); + return -1; + } + /* We will need to poll for link status if any slave doesn't + * support interrupts + */ + if (internals->slaves[i].link_status_poll_enabled) + internals->link_status_polling_enabled = 1; + } + /* start polling if needed */ + if (internals->link_status_polling_enabled) { + rte_eal_alarm_set( + internals->link_status_polling_interval_ms * 1000, + bond_ethdev_slave_link_status_change_monitor, + (void *)&rte_eth_devices[internals->port_id]); + } + + if (internals->user_defined_primary_port) + bond_ethdev_primary_set(internals, internals->primary_port); + + if (internals->mode == BONDING_MODE_8023AD) + bond_mode_8023ad_start(eth_dev); + + if (internals->mode == BONDING_MODE_TLB || + internals->mode == BONDING_MODE_ALB) + bond_tlb_enable(internals); + + return 0; +} + +/** + * Free every Rx and Tx queue structure attached to the device. + * + * For each non-NULL queue array, every entry is freed and set to NULL and + * the matching queue count is reset to 0. + * + * @param dev device whose queues are released + */ +static void +bond_ethdev_free_queues(struct rte_eth_dev *dev) +{ + /* 16-bit index, like the queue ids used elsewhere in this file: + * bond_ethdev_info() advertises up to 512 Tx queues, and an 8-bit + * index would wrap to 0 before ever reaching such a count. + */ + uint16_t i; + + if (dev->data->rx_queues != NULL) { + for (i = 0; i < dev->data->nb_rx_queues; i++) { + rte_free(dev->data->rx_queues[i]); + dev->data->rx_queues[i] = NULL; + } + dev->data->nb_rx_queues = 0; + } + + if (dev->data->tx_queues != NULL) { + for (i = 0; i < dev->data->nb_tx_queues; i++) { + rte_free(dev->data->tx_queues[i]); + dev->data->tx_queues[i] = NULL; + } + dev->data->nb_tx_queues = 0; + } +} + +void +bond_ethdev_stop(struct rte_eth_dev *eth_dev) +{ + struct bond_dev_private *internals = eth_dev->data->dev_private; + uint8_t i; + + if (internals->mode == BONDING_MODE_8023AD) { + struct port *port; + void *pkt = NULL; + + bond_mode_8023ad_stop(eth_dev); + + /* Discard all messages to/from mode 4 state machines */ + for (i = 0; i < internals->active_slave_count; i++) { + port = &mode_8023ad_ports[internals->active_slaves[i]]; + + RTE_ASSERT(port->rx_ring != NULL); + while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT) + rte_pktmbuf_free(pkt); + + RTE_ASSERT(port->tx_ring != NULL); + while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT) + rte_pktmbuf_free(pkt); + } + } 
+ + if (internals->mode == BONDING_MODE_TLB || + internals->mode == BONDING_MODE_ALB) { + bond_tlb_disable(internals); + for (i = 0; i < internals->active_slave_count; i++) + tlb_last_obytets[internals->active_slaves[i]] = 0; + } + + internals->active_slave_count = 0; + internals->link_status_polling_enabled = 0; + for (i = 0; i < internals->slave_count; i++) + internals->slaves[i].last_link_status = 0; + + eth_dev->data->dev_link.link_status = ETH_LINK_DOWN; + eth_dev->data->dev_started = 0; +} + +void +bond_ethdev_close(struct rte_eth_dev *dev) +{ + struct bond_dev_private *internals = dev->data->dev_private; + uint8_t bond_port_id = internals->port_id; + int skipped = 0; + + RTE_LOG(INFO, EAL, "Closing bonded device %s\n", dev->data->name); + while (internals->slave_count != skipped) { + uint8_t port_id = internals->slaves[skipped].port_id; + + rte_eth_dev_stop(port_id); + + if (rte_eth_bond_slave_remove(bond_port_id, port_id) != 0) { + RTE_LOG(ERR, EAL, + "Failed to remove port %d from bonded device " + "%s\n", port_id, dev->data->name); + skipped++; + } + } + bond_ethdev_free_queues(dev); + rte_bitmap_reset(internals->vlan_filter_bmp); +} + +/* forward declaration */ +static int bond_ethdev_configure(struct rte_eth_dev *dev); + +static void +bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) +{ + struct bond_dev_private *internals = dev->data->dev_private; + + dev_info->max_mac_addrs = 1; + + dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen + ? 
internals->candidate_max_rx_pktlen + : ETHER_MAX_JUMBO_FRAME_LEN; + + dev_info->max_rx_queues = (uint16_t)128; + dev_info->max_tx_queues = (uint16_t)512; + + dev_info->min_rx_bufsize = 0; + + dev_info->rx_offload_capa = internals->rx_offload_capa; + dev_info->tx_offload_capa = internals->tx_offload_capa; + dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads; + + dev_info->reta_size = internals->reta_size; +} + +/** + * Record a VLAN filter change on the bonded device and apply it to each slave. + * + * The VLAN id is set or cleared in the bond's vlan_filter_bmp, then + * rte_eth_dev_vlan_filter() is called for every slave, all while holding + * internals->lock. Slave failures are not propagated to the caller; a slave + * that reports -ENOTSUP only produces a warning. + * + * @param dev bonded device + * @param vlan_id VLAN id to add or remove + * @param on non-zero to add the filter, zero to remove it + * @return always 0 + */ +static int +bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on) +{ + int res; + uint8_t i; + struct bond_dev_private *internals = dev->data->dev_private; + + /* don't do this while a slave is being added */ + rte_spinlock_lock(&internals->lock); + + if (on) + rte_bitmap_set(internals->vlan_filter_bmp, vlan_id); + else + rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id); + + for (i = 0; i < internals->slave_count; i++) { + uint8_t port_id = internals->slaves[i].port_id; + + res = rte_eth_dev_vlan_filter(port_id, vlan_id, on); + /* the ethdev API reports errors as negative errno values */ + if (res == -ENOTSUP) + RTE_LOG(WARNING, PMD, + "Setting VLAN filter on slave port %u not supported.\n", + port_id); + } + + rte_spinlock_unlock(&internals->lock); + return 0; +} + +/** + * Allocate and register the bonded device's bookkeeping for one Rx queue. + * + * A zeroed bond_rx_queue is allocated on dev->data->numa_node (socket_id is + * unused) and filled with the queue id, descriptor count, a copy of rx_conf + * and the mempool, then stored in dev->data->rx_queues[rx_queue_id]. + * + * @return 0 on success, -1 if the allocation fails + */ +static int +bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id, + uint16_t nb_rx_desc, unsigned int socket_id __rte_unused, + const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool) +{ + struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *) + rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue), + 0, dev->data->numa_node); + if (bd_rx_q == NULL) + return -1; + + bd_rx_q->queue_id = rx_queue_id; + bd_rx_q->dev_private = dev->data->dev_private; + + bd_rx_q->nb_rx_desc = nb_rx_desc; + + memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf)); + bd_rx_q->mb_pool = mb_pool; + + dev->data->rx_queues[rx_queue_id] = bd_rx_q; + + return 0; +} + +static int +bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id, + uint16_t nb_tx_desc, unsigned int 
socket_id __rte_unused, + const struct rte_eth_txconf *tx_conf) +{ + struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *) + rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue), + 0, dev->data->numa_node); + + if (bd_tx_q == NULL) + return -1; + + bd_tx_q->queue_id = tx_queue_id; + bd_tx_q->dev_private = dev->data->dev_private; + + bd_tx_q->nb_tx_desc = nb_tx_desc; + memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf)); + + dev->data->tx_queues[tx_queue_id] = bd_tx_q; + + return 0; +} + +static void +bond_ethdev_rx_queue_release(void *queue) +{ + if (queue == NULL) + return; + + rte_free(queue); +} + +static void +bond_ethdev_tx_queue_release(void *queue) +{ + if (queue == NULL) + return; + + rte_free(queue); +} + +static void +bond_ethdev_slave_link_status_change_monitor(void *cb_arg) +{ + struct rte_eth_dev *bonded_ethdev, *slave_ethdev; + struct bond_dev_private *internals; + + /* Default value for polling slave found is true as we don't want to + * disable the polling thread if we cannot get the lock */ + int i, polling_slave_found = 1; + + if (cb_arg == NULL) + return; + + bonded_ethdev = (struct rte_eth_dev *)cb_arg; + internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private; + + if (!bonded_ethdev->data->dev_started || + !internals->link_status_polling_enabled) + return; + + /* If device is currently being configured then don't check slaves link + * status, wait until next period */ + if (rte_spinlock_trylock(&internals->lock)) { + if (internals->slave_count > 0) + polling_slave_found = 0; + + for (i = 0; i < internals->slave_count; i++) { + if (!internals->slaves[i].link_status_poll_enabled) + continue; + + slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id]; + polling_slave_found = 1; + + /* Update slave link status */ + (*slave_ethdev->dev_ops->link_update)(slave_ethdev, + internals->slaves[i].link_status_wait_to_complete); + + /* if link status has changed since last checked then call lsc + * event callback */ + if 
(slave_ethdev->data->dev_link.link_status != + internals->slaves[i].last_link_status) { + internals->slaves[i].last_link_status = + slave_ethdev->data->dev_link.link_status; + + bond_ethdev_lsc_event_callback(internals->slaves[i].port_id, + RTE_ETH_EVENT_INTR_LSC, + &bonded_ethdev->data->port_id); + } + } + rte_spinlock_unlock(&internals->lock); + } + + if (polling_slave_found) + /* Set alarm to continue monitoring link status of slave ethdev's */ + rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000, + bond_ethdev_slave_link_status_change_monitor, cb_arg); +} + +static int +bond_ethdev_link_update(struct rte_eth_dev *bonded_eth_dev, + int wait_to_complete) +{ + struct bond_dev_private *internals = bonded_eth_dev->data->dev_private; + + if (!bonded_eth_dev->data->dev_started || + internals->active_slave_count == 0) { + bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN; + return 0; + } else { + struct rte_eth_dev *slave_eth_dev; + int i, link_up = 0; + + for (i = 0; i < internals->active_slave_count; i++) { + slave_eth_dev = &rte_eth_devices[internals->active_slaves[i]]; + + (*slave_eth_dev->dev_ops->link_update)(slave_eth_dev, + wait_to_complete); + if (slave_eth_dev->data->dev_link.link_status == ETH_LINK_UP) { + link_up = 1; + break; + } + } + + bonded_eth_dev->data->dev_link.link_status = link_up; + } + + return 0; +} + +static void +bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) +{ + struct bond_dev_private *internals = dev->data->dev_private; + struct rte_eth_stats slave_stats; + int i, j; + + for (i = 0; i < internals->slave_count; i++) { + rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats); + + stats->ipackets += slave_stats.ipackets; + stats->opackets += slave_stats.opackets; + stats->ibytes += slave_stats.ibytes; + stats->obytes += slave_stats.obytes; + stats->imissed += slave_stats.imissed; + stats->ierrors += slave_stats.ierrors; + stats->oerrors += slave_stats.oerrors; + stats->rx_nombuf 
+= slave_stats.rx_nombuf; + + for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) { + stats->q_ipackets[j] += slave_stats.q_ipackets[j]; + stats->q_opackets[j] += slave_stats.q_opackets[j]; + stats->q_ibytes[j] += slave_stats.q_ibytes[j]; + stats->q_obytes[j] += slave_stats.q_obytes[j]; + stats->q_errors[j] += slave_stats.q_errors[j]; + } + + } +} +/* Reset the statistics of every slave port of the bonded device. */ +static void +bond_ethdev_stats_reset(struct rte_eth_dev *dev) +{ + struct bond_dev_private *internals = dev->data->dev_private; + int i; + + for (i = 0; i < internals->slave_count; i++) + rte_eth_stats_reset(internals->slaves[i].port_id); +} +/* + * Record that promiscuous mode is enabled on the bonded device and apply it + * to the slaves according to the bonding mode (see the cases below). + */ +static void +bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev) +{ + struct bond_dev_private *internals = eth_dev->data->dev_private; + int i; + + internals->promiscuous_en = 1; + + switch (internals->mode) { + /* Promiscuous mode is propagated to all slaves */ + case BONDING_MODE_ROUND_ROBIN: + case BONDING_MODE_BALANCE: + case BONDING_MODE_BROADCAST: + for (i = 0; i < internals->slave_count; i++) + rte_eth_promiscuous_enable(internals->slaves[i].port_id); + break; + /* In mode 4 promiscuous mode is managed when a slave is added/removed */ + case BONDING_MODE_8023AD: + break; + /* Promiscuous mode is propagated only to primary slave */ + case BONDING_MODE_ACTIVE_BACKUP: + case BONDING_MODE_TLB: + case BONDING_MODE_ALB: + default: + rte_eth_promiscuous_enable(internals->current_primary_port); + } +} +/* + * Record that promiscuous mode is disabled on the bonded device and apply it + * to the slaves according to the bonding mode (see the cases below). + */ +static void +bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev) +{ + struct bond_dev_private *internals = dev->data->dev_private; + int i; + + internals->promiscuous_en = 0; + + switch (internals->mode) { + /* Promiscuous mode is propagated to all slaves */ + case BONDING_MODE_ROUND_ROBIN: + case BONDING_MODE_BALANCE: + case BONDING_MODE_BROADCAST: + for (i = 0; i < internals->slave_count; i++) + rte_eth_promiscuous_disable(internals->slaves[i].port_id); + break; + /* In mode 4 promiscuous mode is managed when a slave is added/removed */ + case BONDING_MODE_8023AD: + break; 
+ /* Promiscuous mode is propagated only to primary slave */ + case BONDING_MODE_ACTIVE_BACKUP: + case BONDING_MODE_TLB: + case BONDING_MODE_ALB: + default: + rte_eth_promiscuous_disable(internals->current_primary_port); + } +} + +static void +bond_ethdev_delayed_lsc_propagation(void *arg) +{ + if (arg == NULL) + return; + + _rte_eth_dev_callback_process((struct rte_eth_dev *)arg, + RTE_ETH_EVENT_INTR_LSC, NULL); +} + +void +bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type, + void *param) +{ + struct rte_eth_dev *bonded_eth_dev, *slave_eth_dev; + struct bond_dev_private *internals; + struct rte_eth_link link; + + int i, valid_slave = 0; + uint8_t active_pos; + uint8_t lsc_flag = 0; + + if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL) + return; + + bonded_eth_dev = &rte_eth_devices[*(uint8_t *)param]; + slave_eth_dev = &rte_eth_devices[port_id]; + + if (check_for_bonded_ethdev(bonded_eth_dev)) + return; + + internals = bonded_eth_dev->data->dev_private; + + /* If the device isn't started don't handle interrupts */ + if (!bonded_eth_dev->data->dev_started) + return; + + /* verify that port_id is a valid slave of bonded port */ + for (i = 0; i < internals->slave_count; i++) { + if (internals->slaves[i].port_id == port_id) { + valid_slave = 1; + break; + } + } + + if (!valid_slave) + return; + + /* Search for port in active port list */ + active_pos = find_slave_by_id(internals->active_slaves, + internals->active_slave_count, port_id); + + rte_eth_link_get_nowait(port_id, &link); + if (link.link_status) { + if (active_pos < internals->active_slave_count) + return; + + /* if no active slave ports then set this port to be primary port */ + if (internals->active_slave_count < 1) { + /* If first active slave, then change link status */ + bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP; + internals->current_primary_port = port_id; + lsc_flag = 1; + + mac_address_slaves_update(bonded_eth_dev); + + /* Inherit eth dev link properties 
from first active slave */ + link_properties_set(bonded_eth_dev, + &(slave_eth_dev->data->dev_link)); + } else { + if (link_properties_valid( + &bonded_eth_dev->data->dev_link, &link) != 0) { + slave_eth_dev->data->dev_flags &= + (~RTE_ETH_DEV_BONDED_SLAVE); + RTE_LOG(ERR, PMD, + "port %u invalid speed/duplex\n", + port_id); + return; + } + } + + activate_slave(bonded_eth_dev, port_id); + + /* If user has defined the primary port then default to using it */ + if (internals->user_defined_primary_port && + internals->primary_port == port_id) + bond_ethdev_primary_set(internals, port_id); + } else { + if (active_pos == internals->active_slave_count) + return; + + /* Remove from active slave list */ + deactivate_slave(bonded_eth_dev, port_id); + + /* No active slaves, change link status to down and reset other + * link properties */ + if (internals->active_slave_count < 1) { + lsc_flag = 1; + bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN; + + link_properties_reset(bonded_eth_dev); + } + + /* Update primary id, take first active slave from list or if none + * available set to -1 */ + if (port_id == internals->current_primary_port) { + if (internals->active_slave_count > 0) + bond_ethdev_primary_set(internals, + internals->active_slaves[0]); + else + internals->current_primary_port = internals->primary_port; + } + } + + if (lsc_flag) { + /* Cancel any possible outstanding interrupts if delays are enabled */ + if (internals->link_up_delay_ms > 0 || + internals->link_down_delay_ms > 0) + rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation, + bonded_eth_dev); + + if (bonded_eth_dev->data->dev_link.link_status) { + if (internals->link_up_delay_ms > 0) + rte_eal_alarm_set(internals->link_up_delay_ms * 1000, + bond_ethdev_delayed_lsc_propagation, + (void *)bonded_eth_dev); + else + _rte_eth_dev_callback_process(bonded_eth_dev, + RTE_ETH_EVENT_INTR_LSC, NULL); + + } else { + if (internals->link_down_delay_ms > 0) + 
rte_eal_alarm_set(internals->link_down_delay_ms * 1000, + bond_ethdev_delayed_lsc_propagation, + (void *)bonded_eth_dev); + else + _rte_eth_dev_callback_process(bonded_eth_dev, + RTE_ETH_EVENT_INTR_LSC, NULL); + } + } +} + +static int +bond_ethdev_rss_reta_update(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size) +{ + unsigned i, j; + int result = 0; + int slave_reta_size; + unsigned reta_count; + struct bond_dev_private *internals = dev->data->dev_private; + + if (reta_size != internals->reta_size) + return -EINVAL; + + /* Copy RETA table */ + reta_count = reta_size / RTE_RETA_GROUP_SIZE; + + for (i = 0; i < reta_count; i++) { + internals->reta_conf[i].mask = reta_conf[i].mask; + for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) + if ((reta_conf[i].mask >> j) & 0x01) + internals->reta_conf[i].reta[j] = reta_conf[i].reta[j]; + } + + /* Fill rest of array */ + for (; i < RTE_DIM(internals->reta_conf); i += reta_count) + memcpy(&internals->reta_conf[i], &internals->reta_conf[0], + sizeof(internals->reta_conf[0]) * reta_count); + + /* Propagate RETA over slaves */ + for (i = 0; i < internals->slave_count; i++) { + slave_reta_size = internals->slaves[i].reta_size; + result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id, + &internals->reta_conf[0], slave_reta_size); + if (result < 0) + return result; + } + + return 0; +} + +static int +bond_ethdev_rss_reta_query(struct rte_eth_dev *dev, + struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size) +{ + int i, j; + struct bond_dev_private *internals = dev->data->dev_private; + + if (reta_size != internals->reta_size) + return -EINVAL; + + /* Copy RETA table */ + for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++) + for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) + if ((reta_conf[i].mask >> j) & 0x01) + reta_conf[i].reta[j] = internals->reta_conf[i].reta[j]; + + return 0; +} + +static int +bond_ethdev_rss_hash_update(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf) 
+{ + int i, result = 0; + struct bond_dev_private *internals = dev->data->dev_private; + struct rte_eth_rss_conf bond_rss_conf; + + memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf)); + + bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads; + + if (bond_rss_conf.rss_hf != 0) + dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf; + + if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len < + sizeof(internals->rss_key)) { + if (bond_rss_conf.rss_key_len == 0) + bond_rss_conf.rss_key_len = 40; + internals->rss_key_len = bond_rss_conf.rss_key_len; + memcpy(internals->rss_key, bond_rss_conf.rss_key, + internals->rss_key_len); + } + + for (i = 0; i < internals->slave_count; i++) { + result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id, + &bond_rss_conf); + if (result < 0) + return result; + } + + return 0; +} + +static int +bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev, + struct rte_eth_rss_conf *rss_conf) +{ + struct bond_dev_private *internals = dev->data->dev_private; + + rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf; + rss_conf->rss_key_len = internals->rss_key_len; + if (rss_conf->rss_key) + memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len); + + return 0; +} + +const struct eth_dev_ops default_dev_ops = { + .dev_start = bond_ethdev_start, + .dev_stop = bond_ethdev_stop, + .dev_close = bond_ethdev_close, + .dev_configure = bond_ethdev_configure, + .dev_infos_get = bond_ethdev_info, + .vlan_filter_set = bond_ethdev_vlan_filter_set, + .rx_queue_setup = bond_ethdev_rx_queue_setup, + .tx_queue_setup = bond_ethdev_tx_queue_setup, + .rx_queue_release = bond_ethdev_rx_queue_release, + .tx_queue_release = bond_ethdev_tx_queue_release, + .link_update = bond_ethdev_link_update, + .stats_get = bond_ethdev_stats_get, + .stats_reset = bond_ethdev_stats_reset, + .promiscuous_enable = bond_ethdev_promiscuous_enable, + .promiscuous_disable = bond_ethdev_promiscuous_disable, + 
.reta_update = bond_ethdev_rss_reta_update, + .reta_query = bond_ethdev_rss_reta_query, + .rss_hash_update = bond_ethdev_rss_hash_update, + .rss_hash_conf_get = bond_ethdev_rss_hash_conf_get +}; + +static int +bond_alloc(struct rte_vdev_device *dev, uint8_t mode) +{ + const char *name = rte_vdev_device_name(dev); + uint8_t socket_id = dev->device.numa_node; + struct bond_dev_private *internals = NULL; + struct rte_eth_dev *eth_dev = NULL; + uint32_t vlan_filter_bmp_size; + + /* now do all data allocation - for eth_dev structure, dummy pci driver + * and internal (private) data + */ + + if (socket_id >= number_of_sockets()) { + RTE_BOND_LOG(ERR, + "Invalid socket id specified to create bonded device on."); + goto err; + } + + /* reserve an ethdev entry */ + eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internals)); + if (eth_dev == NULL) { + RTE_BOND_LOG(ERR, "Unable to allocate rte_eth_dev"); + goto err; + } + + internals = eth_dev->data->dev_private; + eth_dev->data->nb_rx_queues = (uint16_t)1; + eth_dev->data->nb_tx_queues = (uint16_t)1; + + eth_dev->data->mac_addrs = rte_zmalloc_socket(name, ETHER_ADDR_LEN, 0, + socket_id); + if (eth_dev->data->mac_addrs == NULL) { + RTE_BOND_LOG(ERR, "Unable to malloc mac_addrs"); + goto err; + } + + eth_dev->dev_ops = &default_dev_ops; + eth_dev->data->dev_flags = RTE_ETH_DEV_INTR_LSC | + RTE_ETH_DEV_DETACHABLE; + + rte_spinlock_init(&internals->lock); + + internals->port_id = eth_dev->data->port_id; + internals->mode = BONDING_MODE_INVALID; + internals->current_primary_port = RTE_MAX_ETHPORTS + 1; + internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2; + internals->xmit_hash = xmit_l2_hash; + internals->user_defined_mac = 0; + internals->link_props_set = 0; + + internals->link_status_polling_enabled = 0; + + internals->link_status_polling_interval_ms = + DEFAULT_POLLING_INTERVAL_10_MS; + internals->link_down_delay_ms = 0; + internals->link_up_delay_ms = 0; + + internals->slave_count = 0; + internals->active_slave_count 
= 0; + internals->rx_offload_capa = 0; + internals->tx_offload_capa = 0; + internals->candidate_max_rx_pktlen = 0; + internals->max_rx_pktlen = 0; + + /* Initially allow to choose any offload type */ + internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK; + + memset(internals->active_slaves, 0, sizeof(internals->active_slaves)); + memset(internals->slaves, 0, sizeof(internals->slaves)); + + /* Set mode 4 default configuration */ + bond_mode_8023ad_setup(eth_dev, NULL); + if (bond_ethdev_mode_set(eth_dev, mode)) { + RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode too %d", + eth_dev->data->port_id, mode); + goto err; + } + + vlan_filter_bmp_size = + rte_bitmap_get_memory_footprint(ETHER_MAX_VLAN_ID + 1); + internals->vlan_filter_bmpmem = rte_malloc(name, vlan_filter_bmp_size, + RTE_CACHE_LINE_SIZE); + if (internals->vlan_filter_bmpmem == NULL) { + RTE_BOND_LOG(ERR, + "Failed to allocate vlan bitmap for bonded device %u\n", + eth_dev->data->port_id); + goto err; + } + + internals->vlan_filter_bmp = rte_bitmap_init(ETHER_MAX_VLAN_ID + 1, + internals->vlan_filter_bmpmem, vlan_filter_bmp_size); + if (internals->vlan_filter_bmp == NULL) { + RTE_BOND_LOG(ERR, + "Failed to init vlan bitmap for bonded device %u\n", + eth_dev->data->port_id); + rte_free(internals->vlan_filter_bmpmem); + goto err; + } + + return eth_dev->data->port_id; + +err: + rte_free(internals); + if (eth_dev != NULL) { + rte_free(eth_dev->data->mac_addrs); + rte_eth_dev_release_port(eth_dev); + } + return -1; +} + +static int +bond_probe(struct rte_vdev_device *dev) +{ + const char *name; + struct bond_dev_private *internals; + struct rte_kvargs *kvlist; + uint8_t bonding_mode, socket_id; + int arg_count, port_id; + + if (!dev) + return -EINVAL; + + name = rte_vdev_device_name(dev); + RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name); + + kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), + pmd_bond_init_valid_arguments); + if (kvlist == NULL) + return -1; + + /* Parse link bonding 
mode */ + if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) { + if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG, + &bond_ethdev_parse_slave_mode_kvarg, + &bonding_mode) != 0) { + RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n", + name); + goto parse_error; + } + } else { + RTE_LOG(ERR, EAL, "Mode must be specified only once for bonded " + "device %s\n", name); + goto parse_error; + } + + /* Parse socket id to create bonding device on */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG); + if (arg_count == 1) { + if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG, + &bond_ethdev_parse_socket_id_kvarg, &socket_id) + != 0) { + RTE_LOG(ERR, EAL, "Invalid socket Id specified for " + "bonded device %s\n", name); + goto parse_error; + } + } else if (arg_count > 1) { + RTE_LOG(ERR, EAL, "Socket Id can be specified only once for " + "bonded device %s\n", name); + goto parse_error; + } else { + socket_id = rte_socket_id(); + } + + dev->device.numa_node = socket_id; + + /* Create link bonding eth device */ + port_id = bond_alloc(dev, bonding_mode); + if (port_id < 0) { + RTE_LOG(ERR, EAL, "Failed to create socket %s in mode %u on " + "socket %u.\n", name, bonding_mode, socket_id); + goto parse_error; + } + internals = rte_eth_devices[port_id].data->dev_private; + internals->kvlist = kvlist; + + RTE_LOG(INFO, EAL, "Create bonded device %s on port %d in mode %u on " + "socket %u.\n", name, port_id, bonding_mode, socket_id); + return 0; + +parse_error: + rte_kvargs_free(kvlist); + + return -1; +} + +static int +bond_remove(struct rte_vdev_device *dev) +{ + struct rte_eth_dev *eth_dev; + struct bond_dev_private *internals; + const char *name; + + if (!dev) + return -EINVAL; + + name = rte_vdev_device_name(dev); + RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name); + + /* now free all data allocation - for eth_dev structure, + * dummy pci driver and internal (private) data + */ + + /* find an ethdev entry */ + eth_dev = 
rte_eth_dev_allocated(name); + if (eth_dev == NULL) + return -ENODEV; + + RTE_ASSERT(eth_dev->device == &dev->device); + + internals = eth_dev->data->dev_private; + if (internals->slave_count != 0) + return -EBUSY; + + if (eth_dev->data->dev_started == 1) { + bond_ethdev_stop(eth_dev); + bond_ethdev_close(eth_dev); + } + + eth_dev->dev_ops = NULL; + eth_dev->rx_pkt_burst = NULL; + eth_dev->tx_pkt_burst = NULL; + /* NOTE(review): internals->kvlist (stored by bond_probe()) is not freed in this function - confirm it is released elsewhere, otherwise it leaks */ + internals = eth_dev->data->dev_private; + rte_bitmap_free(internals->vlan_filter_bmp); + rte_free(internals->vlan_filter_bmpmem); + rte_free(eth_dev->data->dev_private); + rte_free(eth_dev->data->mac_addrs); + + rte_eth_dev_release_port(eth_dev); + + return 0; +} + +/* Configure callback of the bonded device. + * When RSS is enabled it installs the default RSS key and fills the + * redirection table, then copies candidate_max_rx_pktlen to max_rx_pktlen. + * When the device carries a kvlist (stored by bond_probe()) it applies the + * remaining arguments in this order: PMD_BOND_MAC_ADDR_KVARG, + * PMD_BOND_XMIT_POLICY_KVARG, PMD_BOND_SLAVE_PORT_KVARG (at least one is + * required), PMD_BOND_PRIMARY_SLAVE_KVARG, PMD_BOND_LSC_POLL_PERIOD_KVARG, + * PMD_BOND_LINK_UP_PROP_DELAY_KVARG and PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG. + * This part resolves the slave portids after all the other pdev and vdev + * have been allocated. + * Returns 0 on success and -1 when an argument is repeated, cannot be + * parsed or cannot be applied. */ +static int +bond_ethdev_configure(struct rte_eth_dev *dev) +{ + char *name = dev->data->name; + struct bond_dev_private *internals = dev->data->dev_private; + struct rte_kvargs *kvlist = internals->kvlist; + int arg_count; + uint8_t port_id = dev - rte_eth_devices; /* index of dev within rte_eth_devices[] */ + + static const uint8_t default_rss_key[40] = { + 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D, + 0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4, + 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B, + 0xBE, 0xAC, 0x01, 0xFA + }; + + unsigned i, j; + + /* If RSS is enabled, fill table and key with default values */ + if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) { + dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key; + dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0; /* NOTE(review): length is set to 0 although 40 key bytes are copied on the next line - confirm this is intended */ + memcpy(internals->rss_key, default_rss_key, 40); + + for (i = 0; i < RTE_DIM(internals->reta_conf); i++) { + internals->reta_conf[i].mask = ~0LL; + for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) /* entries cycle over the rx queue indexes; NOTE(review): modulo assumes nb_rx_queues is not 0 */ + internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues; + } + } + + /* set the max_rx_pktlen */ + internals->max_rx_pktlen = internals->candidate_max_rx_pktlen; 
+ + /* + * if no kvlist, it means that this bonded device has been created + * through the bonding api. + */ + if (!kvlist) + return 0; + + /* Parse MAC address for bonded device */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG); + if (arg_count == 1) { + struct ether_addr bond_mac; + + if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG, + &bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) { + RTE_LOG(INFO, EAL, "Invalid mac address for bonded device %s\n", + name); + return -1; + } + + /* Set MAC address */ + if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) { + RTE_LOG(ERR, EAL, + "Failed to set mac address on bonded device %s\n", + name); + return -1; + } + } else if (arg_count > 1) { + RTE_LOG(ERR, EAL, + "MAC address can be specified only once for bonded device %s\n", + name); + return -1; + } + + /* Parse/set balance mode transmit policy */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG); + if (arg_count == 1) { + uint8_t xmit_policy; + + if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG, + &bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) != + 0) { + RTE_LOG(INFO, EAL, + "Invalid xmit policy specified for bonded device %s\n", + name); + return -1; + } + + /* Set balance mode transmit policy */ + if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) { + RTE_LOG(ERR, EAL, + "Failed to set balance xmit policy on bonded device %s\n", + name); + return -1; + } + } else if (arg_count > 1) { + RTE_LOG(ERR, EAL, + "Transmit policy can be specified only once for bonded device" + " %s\n", name); + return -1; + } + + /* Parse/add slave ports to bonded device */ + if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) { + struct bond_ethdev_slave_ports slave_ports; + unsigned i; /* NOTE(review): shadows the function-level i */ + + memset(&slave_ports, 0, sizeof(slave_ports)); + + if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG, + &bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) { + RTE_LOG(ERR, EAL, + "Failed to parse 
slave ports for bonded device %s\n", + name); + return -1; + } + + for (i = 0; i < slave_ports.slave_count; i++) { + if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) { /* a failed add is only logged; the remaining slaves are still tried */ + RTE_LOG(ERR, EAL, + "Failed to add port %d as slave to bonded device %s\n", + slave_ports.slaves[i], name); + } + } + + } else { /* a device created from kvargs must name at least one slave */ + RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name); + return -1; + } + + /* Parse/set primary slave port id */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG); + if (arg_count == 1) { + uint8_t primary_slave_port_id; + + if (rte_kvargs_process(kvlist, + PMD_BOND_PRIMARY_SLAVE_KVARG, + &bond_ethdev_parse_primary_slave_port_id_kvarg, + &primary_slave_port_id) < 0) { + RTE_LOG(INFO, EAL, + "Invalid primary slave port id specified for bonded device" + " %s\n", name); + return -1; + } + + /* Set primary slave port */ + if (rte_eth_bond_primary_set(port_id, (uint8_t)primary_slave_port_id) + != 0) { + RTE_LOG(ERR, EAL, + "Failed to set primary slave port %d on bonded device %s\n", + primary_slave_port_id, name); + return -1; + } + } else if (arg_count > 1) { + RTE_LOG(INFO, EAL, + "Primary slave can be specified only once for bonded device" + " %s\n", name); + return -1; + } + + /* Parse link status monitor polling interval */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG); + if (arg_count == 1) { + uint32_t lsc_poll_interval_ms; + + if (rte_kvargs_process(kvlist, + PMD_BOND_LSC_POLL_PERIOD_KVARG, + &bond_ethdev_parse_time_ms_kvarg, + &lsc_poll_interval_ms) < 0) { + RTE_LOG(INFO, EAL, + "Invalid lsc polling interval value specified for bonded" + " device %s\n", name); + return -1; + } + /* Set link status monitor polling interval */ + if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms) + != 0) { + RTE_LOG(ERR, EAL, + "Failed to set lsc monitor polling interval (%u ms) on" + " bonded device %s\n", lsc_poll_interval_ms, name); + return -1; + } + } else if (arg_count > 1) { + RTE_LOG(INFO, EAL, + "LSC polling interval can 
be specified only once for bonded" + " device %s\n", name); + return -1; + } + + /* Parse link up interrupt propagation delay */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG); + if (arg_count == 1) { + uint32_t link_up_delay_ms; + + if (rte_kvargs_process(kvlist, + PMD_BOND_LINK_UP_PROP_DELAY_KVARG, + &bond_ethdev_parse_time_ms_kvarg, + &link_up_delay_ms) < 0) { + RTE_LOG(INFO, EAL, + "Invalid link up propagation delay value specified for" + " bonded device %s\n", name); + return -1; + } + + /* Set link up propagation delay */ + if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms) + != 0) { + RTE_LOG(ERR, EAL, + "Failed to set link up propagation delay (%u ms) on bonded" + " device %s\n", link_up_delay_ms, name); + return -1; + } + } else if (arg_count > 1) { + RTE_LOG(INFO, EAL, + "Link up propagation delay can be specified only once for" + " bonded device %s\n", name); + return -1; + } + + /* Parse link down interrupt propagation delay */ + arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG); + if (arg_count == 1) { + uint32_t link_down_delay_ms; + + if (rte_kvargs_process(kvlist, + PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG, + &bond_ethdev_parse_time_ms_kvarg, + &link_down_delay_ms) < 0) { + RTE_LOG(INFO, EAL, + "Invalid link down propagation delay value specified for" + " bonded device %s\n", name); + return -1; + } + + /* Set link down propagation delay */ + if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms) + != 0) { + RTE_LOG(ERR, EAL, + "Failed to set link down propagation delay (%u ms) on" + " bonded device %s\n", link_down_delay_ms, name); + return -1; + } + } else if (arg_count > 1) { + RTE_LOG(INFO, EAL, + "Link down propagation delay can be specified only once for" + " bonded device %s\n", name); + return -1; + } + + return 0; +} + /* vdev driver hooks of the bonding PMD: probe and remove callbacks */ +struct rte_vdev_driver pmd_bond_drv = { + .probe = bond_probe, + .remove = bond_remove, +}; + +RTE_PMD_REGISTER_VDEV(net_bonding, pmd_bond_drv); 
+RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond); + +RTE_PMD_REGISTER_PARAM_STRING(net_bonding, + "slave=<ifc> " + "primary=<ifc> " + "mode=[0-6] " + "xmit_policy=[l2 | l23 | l34] " + "socket_id=<int> " + "mac=<mac addr> " + "lsc_poll_period_ms=<int> " + "up_delay=<int> " + "down_delay=<int>"); diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_private.h b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_private.h new file mode 100644 index 00000000..c8db0900 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_private.h @@ -0,0 +1,313 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_ETH_BOND_PRIVATE_H_ +#define _RTE_ETH_BOND_PRIVATE_H_ + +#include <rte_ethdev.h> +#include <rte_spinlock.h> +#include <rte_bitmap.h> + +#include "rte_eth_bond.h" +#include "rte_eth_bond_8023ad_private.h" +#include "rte_eth_bond_alb.h" + +#define PMD_BOND_SLAVE_PORT_KVARG ("slave") +#define PMD_BOND_PRIMARY_SLAVE_KVARG ("primary") +#define PMD_BOND_MODE_KVARG ("mode") +#define PMD_BOND_XMIT_POLICY_KVARG ("xmit_policy") +#define PMD_BOND_SOCKET_ID_KVARG ("socket_id") +#define PMD_BOND_MAC_ADDR_KVARG ("mac") +#define PMD_BOND_LSC_POLL_PERIOD_KVARG ("lsc_poll_period_ms") +#define PMD_BOND_LINK_UP_PROP_DELAY_KVARG ("up_delay") +#define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG ("down_delay") + +#define PMD_BOND_XMIT_POLICY_LAYER2_KVARG ("l2") +#define PMD_BOND_XMIT_POLICY_LAYER23_KVARG ("l23") +#define PMD_BOND_XMIT_POLICY_LAYER34_KVARG ("l34") + +#define RTE_BOND_LOG(lvl, msg, ...) 
\ + RTE_LOG(lvl, PMD, "%s(%d) - " msg "\n", __func__, __LINE__, ##__VA_ARGS__) + +#define BONDING_MODE_INVALID 0xFF + +extern const char *pmd_bond_init_valid_arguments[]; + +extern struct rte_vdev_driver pmd_bond_drv; + +/** Port Queue Mapping Structure */ +struct bond_rx_queue { + uint16_t queue_id; + /**< Queue Id */ + struct bond_dev_private *dev_private; + /**< Reference to eth_dev private structure */ + uint16_t nb_rx_desc; + /**< Number of RX descriptors available for the queue */ + struct rte_eth_rxconf rx_conf; + /**< Copy of RX configuration structure for queue */ + struct rte_mempool *mb_pool; + /**< Reference to mbuf pool to use for RX queue */ +}; + /** TX queue data: queue id, owning device private data, descriptor count and a copy of the TX configuration */ +struct bond_tx_queue { + uint16_t queue_id; + /**< Queue Id */ + struct bond_dev_private *dev_private; + /**< Reference to dev private structure */ + uint16_t nb_tx_desc; + /**< Number of TX descriptors available for the queue */ + struct rte_eth_txconf tx_conf; + /**< Copy of TX configuration structure for queue */ +}; + +/** Bonded slave devices structure */ +struct bond_ethdev_slave_ports { + uint8_t slaves[RTE_MAX_ETHPORTS]; /**< Slave port id array */ + uint8_t slave_count; /**< Number of slaves */ +}; + /** Per-slave details; struct bond_dev_private holds an array of these */ +struct bond_slave_details { + uint8_t port_id; /**< Port Id of slave eth_dev */ + + uint8_t link_status_poll_enabled; + uint8_t link_status_wait_to_complete; + uint8_t last_link_status; + + struct ether_addr persisted_mac_addr; + + uint16_t reta_size; +}; + + /** Transmit hash function type: takes the mbuf and the slave count and returns a uint16_t */ +typedef uint16_t (*xmit_hash_t)(const struct rte_mbuf *buf, uint8_t slave_count); + +/** Link Bonding PMD device private configuration Structure */ +struct bond_dev_private { + uint8_t port_id; /**< Port Id of Bonded Port */ + uint8_t mode; /**< Link Bonding Mode */ + + rte_spinlock_t lock; + + uint8_t primary_port; /**< Primary Slave Port */ + uint8_t current_primary_port; /**< Primary Slave Port */ + uint8_t user_defined_primary_port; + /**< Flag for whether primary port is user defined or not */ + + uint8_t balance_xmit_policy; + /**< 
Transmit policy - l2 / l23 / l34 for operation in balance mode */ + xmit_hash_t xmit_hash; + /**< Transmit policy hash function */ + + uint8_t user_defined_mac; + /**< Flag for whether MAC address is user defined or not */ + uint8_t promiscuous_en; + /**< Enable/disable promiscuous mode on bonding device */ + uint8_t link_props_set; + /**< flag to denote if the link properties are set */ + + uint8_t link_status_polling_enabled; + uint32_t link_status_polling_interval_ms; + + uint32_t link_down_delay_ms; + uint32_t link_up_delay_ms; + + uint16_t nb_rx_queues; /**< Total number of rx queues */ + uint16_t nb_tx_queues; /**< Total number of tx queues */ + + uint8_t active_slave; /**< Next active_slave to poll */ + uint8_t active_slave_count; /**< Number of active slaves */ + uint8_t active_slaves[RTE_MAX_ETHPORTS]; /**< Active slave list */ + + uint8_t slave_count; /**< Number of bonded slaves */ + struct bond_slave_details slaves[RTE_MAX_ETHPORTS]; + /**< Array of bonded slave details */ + + struct mode8023ad_private mode4; + uint8_t tlb_slaves_order[RTE_MAX_ETHPORTS]; /* TLB active slaves send order */ + struct mode_alb_private mode6; + + uint32_t rx_offload_capa; /**< Rx offload capability */ + uint32_t tx_offload_capa; /**< Tx offload capability */ + + /** Bit mask of RSS offloads, the bit offset also means flow type */ + uint64_t flow_type_rss_offloads; + + uint16_t reta_size; + struct rte_eth_rss_reta_entry64 reta_conf[ETH_RSS_RETA_SIZE_512 / + RTE_RETA_GROUP_SIZE]; + + uint8_t rss_key[52]; /**< 52-byte hash key buffer. */ + uint8_t rss_key_len; /**< hash key length in bytes. 
*/ + + struct rte_kvargs *kvlist; /**< vdev arguments stored by bond_probe(); bond_ethdev_configure() returns early when this is NULL */ + uint8_t slave_update_idx; + + uint32_t candidate_max_rx_pktlen; + uint32_t max_rx_pktlen; /**< copied from candidate_max_rx_pktlen in bond_ethdev_configure() */ + + void *vlan_filter_bmpmem; /* enabled vlan filter bitmap */ + struct rte_bitmap *vlan_filter_bmp; +}; + +extern const struct eth_dev_ops default_dev_ops; + +int +check_for_bonded_ethdev(const struct rte_eth_dev *eth_dev); + +/* Linear search of the first slaves_count entries of slaves[] for slave_id. + * Returns the position of the first match, or slaves_count if not found. */ +static inline uint8_t +find_slave_by_id(uint8_t *slaves, uint8_t slaves_count, uint8_t slave_id) { + + uint8_t pos; + for (pos = 0; pos < slaves_count; pos++) { + if (slave_id == slaves[pos]) + break; + } + + return pos; +} + +int +valid_port_id(uint8_t port_id); + +int +valid_bonded_port_id(uint8_t port_id); + +int +valid_slave_port_id(uint8_t port_id); + +void +deactivate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id); + +void +activate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id); + +void +link_properties_set(struct rte_eth_dev *bonded_eth_dev, + struct rte_eth_link *slave_dev_link); +void +link_properties_reset(struct rte_eth_dev *bonded_eth_dev); + +int +link_properties_valid(struct rte_eth_link *bonded_dev_link, + struct rte_eth_link *slave_dev_link); + +int +mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr); + +int +mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr); + +int +mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev); + +uint8_t +number_of_sockets(void); + +int +bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode); + +int +slave_configure(struct rte_eth_dev *bonded_eth_dev, + struct rte_eth_dev *slave_eth_dev); + +void +slave_remove(struct bond_dev_private *internals, + struct rte_eth_dev *slave_eth_dev); + +void +slave_add(struct bond_dev_private *internals, + struct rte_eth_dev *slave_eth_dev); + +uint16_t +xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count); + +uint16_t +xmit_l23_hash(const 
struct rte_mbuf *buf, uint8_t slave_count); + +uint16_t +xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count); + +void +bond_ethdev_primary_set(struct bond_dev_private *internals, + uint8_t slave_port_id); + +void +bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type, + void *param); + +int +bond_ethdev_parse_slave_port_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +int +bond_ethdev_parse_slave_mode_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +int +bond_ethdev_parse_socket_id_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +int +bond_ethdev_parse_primary_slave_port_id_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +int +bond_ethdev_parse_balance_xmit_policy_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +int +bond_ethdev_parse_bond_mac_addr_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +int +bond_ethdev_parse_time_ms_kvarg(const char *key __rte_unused, + const char *value, void *extra_args); + +void +bond_tlb_disable(struct bond_dev_private *internals); + +void +bond_tlb_enable(struct bond_dev_private *internals); + +void +bond_tlb_activate_slave(struct bond_dev_private *internals); + +void +bond_ethdev_stop(struct rte_eth_dev *eth_dev); + +void +bond_ethdev_close(struct rte_eth_dev *dev); + +#endif diff --git a/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_version.map b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_version.map new file mode 100644 index 00000000..2de0a7d3 --- /dev/null +++ b/src/seastar/dpdk/drivers/net/bonding/rte_eth_bond_version.map @@ -0,0 +1,45 @@ +DPDK_2.0 { + global: + + rte_eth_bond_8023ad_conf_get; + rte_eth_bond_8023ad_setup; + rte_eth_bond_active_slaves_get; + rte_eth_bond_create; + rte_eth_bond_link_monitoring_set; + rte_eth_bond_mac_address_reset; + rte_eth_bond_mac_address_set; + rte_eth_bond_mode_get; + 
rte_eth_bond_mode_set; + rte_eth_bond_primary_get; + rte_eth_bond_primary_set; + rte_eth_bond_slave_add; + rte_eth_bond_slave_remove; + rte_eth_bond_slaves_get; + rte_eth_bond_xmit_policy_get; + rte_eth_bond_xmit_policy_set; + + local: *; +}; + +DPDK_2.1 { + global: + + rte_eth_bond_free; + +} DPDK_2.0; + +DPDK_16.04 { +}; + +DPDK_16.07 { + global: + + rte_eth_bond_8023ad_ext_collect; + rte_eth_bond_8023ad_ext_collect_get; + rte_eth_bond_8023ad_ext_distrib; + rte_eth_bond_8023ad_ext_distrib_get; + rte_eth_bond_8023ad_ext_slowtx; + rte_eth_bond_8023ad_conf_get; + rte_eth_bond_8023ad_setup; + +} DPDK_16.04; |