diff options
Diffstat (limited to 'drivers/net/ethernet/google/gve')
-rw-r--r-- | drivers/net/ethernet/google/gve/Makefile | 4 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve.h | 543 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.c | 630 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_adminq.h | 268 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_desc.h | 113 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_ethtool.c | 542 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_main.c | 1467 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_register.h | 28 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_rx.c | 463 | ||||
-rw-r--r-- | drivers/net/ethernet/google/gve/gve_tx.c | 603 |
10 files changed, 4661 insertions, 0 deletions
diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile new file mode 100644 index 000000000..3354ce40e --- /dev/null +++ b/drivers/net/ethernet/google/gve/Makefile @@ -0,0 +1,4 @@ +# Makefile for the Google virtual Ethernet (gve) driver + +obj-$(CONFIG_GVE) += gve.o +gve-objs := gve_main.o gve_tx.o gve_rx.o gve_ethtool.o gve_adminq.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h new file mode 100644 index 000000000..5c9a4d436 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve.h @@ -0,0 +1,543 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#ifndef _GVE_H_ +#define _GVE_H_ + +#include <linux/dma-mapping.h> +#include <linux/netdevice.h> +#include <linux/pci.h> +#include <linux/u64_stats_sync.h> +#include "gve_desc.h" + +#ifndef PCI_VENDOR_ID_GOOGLE +#define PCI_VENDOR_ID_GOOGLE 0x1ae0 +#endif + +#define PCI_DEV_ID_GVNIC 0x0042 + +#define GVE_REGISTER_BAR 0 +#define GVE_DOORBELL_BAR 2 + +/* Driver can alloc up to 2 segments for the header and 2 for the payload. */ +#define GVE_TX_MAX_IOVEC 4 +/* 1 for management, 1 for rx, 1 for tx */ +#define GVE_MIN_MSIX 3 + +/* Numbers of gve tx/rx stats in stats report. */ +#define GVE_TX_STATS_REPORT_NUM 6 +#define GVE_RX_STATS_REPORT_NUM 2 + +/* Interval to schedule a stats report update, 20000ms. */ +#define GVE_STATS_REPORT_TIMER_PERIOD 20000 + +/* Numbers of NIC tx/rx stats in stats report. */ +#define NIC_TX_STATS_REPORT_NUM 0 +#define NIC_RX_STATS_REPORT_NUM 4 + +/* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */ +struct gve_rx_desc_queue { + struct gve_rx_desc *desc_ring; /* the descriptor ring */ + dma_addr_t bus; /* the bus for the desc_ring */ + u8 seqno; /* the next expected seqno for this desc*/ +}; + +/* The page info for a single slot in the RX data queue */ +struct gve_rx_slot_page_info { + struct page *page; + void *page_address; + u32 page_offset; /* offset to write to in page */ +}; + +/* A list of pages registered with the device during setup and used by a queue + * as buffers + */ +struct gve_queue_page_list { + u32 id; /* unique id */ + u32 num_entries; + struct page **pages; /* list of num_entries pages */ + dma_addr_t *page_buses; /* the dma addrs of the pages */ +}; + +/* Each slot in the data ring has a 1:1 mapping to a slot in the desc ring */ +struct gve_rx_data_queue { + struct gve_rx_data_slot *data_ring; /* read by NIC */ + dma_addr_t data_bus; /* dma mapping of the slots */ + struct gve_rx_slot_page_info *page_info; /* page info of the buffers */ + struct gve_queue_page_list *qpl; /* qpl assigned to this queue */ +}; + +struct gve_priv; + +/* An RX ring that contains a power-of-two sized desc and data ring. */ +struct gve_rx_ring { + struct gve_priv *gve; + struct gve_rx_desc_queue desc; + struct gve_rx_data_queue data; + u64 rbytes; /* free-running bytes received */ + u64 rpackets; /* free-running packets received */ + u32 cnt; /* free-running total number of completed packets */ + u32 fill_cnt; /* free-running total number of descs and buffs posted */ + u32 mask; /* masks the cnt and fill_cnt to the size of the ring */ + u64 rx_copybreak_pkt; /* free-running count of copybreak packets */ + u64 rx_copied_pkt; /* free-running total number of copied packets */ + u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ + u64 rx_buf_alloc_fail; /* free-running count of buffer alloc fails */ + u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */ + u32 q_num; /* queue index */ + u32 ntfy_id; /* notification block index */ + struct gve_queue_resources *q_resources; /* head and tail pointer idx */ + dma_addr_t q_resources_bus; /* dma address for the queue resources */ + struct u64_stats_sync statss; /* sync stats for 32bit archs */ +}; + +/* A TX desc ring entry */ +union gve_tx_desc { + struct gve_tx_pkt_desc pkt; /* first desc for a packet */ + struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ +}; + +/* Tracks the memory in the fifo occupied by a segment of a packet */ +struct gve_tx_iovec { + u32 iov_offset; /* offset into this segment */ + u32 iov_len; /* length */ + u32 iov_padding; /* padding associated with this segment */ +}; + +/* Tracks the memory in the fifo occupied by the skb. Mapped 1:1 to a desc + * ring entry but only used for a pkt_desc not a seg_desc + */ +struct gve_tx_buffer_state { + struct sk_buff *skb; /* skb for this pkt */ + struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */ +}; + +/* A TX buffer - each queue has one */ +struct gve_tx_fifo { + void *base; /* address of base of FIFO */ + u32 size; /* total size */ + atomic_t available; /* how much space is still available */ + u32 head; /* offset to write at */ + struct gve_queue_page_list *qpl; /* QPL mapped into this FIFO */ +}; + +/* A TX ring that contains a power-of-two sized desc ring and a FIFO buffer */ +struct gve_tx_ring { + /* Cacheline 0 -- Accessed & dirtied during transmit */ + struct gve_tx_fifo tx_fifo; + u32 req; /* driver tracked head pointer */ + u32 done; /* driver tracked tail pointer */ + + /* Cacheline 1 -- Accessed & dirtied during gve_clean_tx_done */ + __be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */ + u64 pkt_done; /* free-running - total packets completed */ + u64 bytes_done; /* free-running - total bytes completed */ + + /* Cacheline 2 -- Read-mostly fields */ + union gve_tx_desc *desc ____cacheline_aligned; + struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */ + struct netdev_queue *netdev_txq; + struct gve_queue_resources *q_resources; /* head and tail pointer idx */ + u32 mask; /* masks req and done down to queue size */ + + /* Slow-path fields */ + u32 q_num ____cacheline_aligned; /* queue idx */ + u32 stop_queue; /* count of queue stops */ + u32 wake_queue; /* count of queue wakes */ + u32 queue_timeout; /* count of queue timeouts */ + u32 ntfy_id; /* notification block index */ + u32 last_kick_msec; /* Last time the queue was kicked */ + dma_addr_t bus; /* dma address of the descr ring */ + dma_addr_t q_resources_bus; /* dma address of the queue resources */ + struct u64_stats_sync statss; /* sync stats for 32bit archs */ +} ____cacheline_aligned; + +/* Wraps the info for one irq including the napi struct and the queues + * associated with that irq. + */ +struct gve_notify_block { + __be32 irq_db_index; /* idx into Bar2 - set by device, must be 1st */ + char name[IFNAMSIZ + 16]; /* name registered with the kernel */ + struct napi_struct napi; /* kernel napi struct for this block */ + struct gve_priv *priv; + struct gve_tx_ring *tx; /* tx rings on this block */ + struct gve_rx_ring *rx; /* rx rings on this block */ +} ____cacheline_aligned; + +/* Tracks allowed and current queue settings */ +struct gve_queue_config { + u16 max_queues; + u16 num_queues; /* current */ +}; + +/* Tracks the available and used qpl IDs */ +struct gve_qpl_config { + u32 qpl_map_size; /* map memory size */ + unsigned long *qpl_id_map; /* bitmap of used qpl ids */ +}; + +struct gve_priv { + struct net_device *dev; + struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */ + struct gve_rx_ring *rx; /* array of rx_cfg.num_queues */ + struct gve_queue_page_list *qpls; /* array of num qpls */ + struct gve_notify_block *ntfy_blocks; /* array of num_ntfy_blks */ + dma_addr_t ntfy_block_bus; + struct msix_entry *msix_vectors; /* array of num_ntfy_blks + 1 */ + char mgmt_msix_name[IFNAMSIZ + 16]; + u32 mgmt_msix_idx; + __be32 *counter_array; /* array of num_event_counters */ + dma_addr_t counter_array_bus; + + u16 num_event_counters; + u16 tx_desc_cnt; /* num desc per ring */ + u16 rx_desc_cnt; /* num desc per ring */ + u16 tx_pages_per_qpl; /* tx buffer length */ + u16 rx_pages_per_qpl; /* rx buffer length */ + u64 max_registered_pages; + u64 num_registered_pages; /* num pages registered with NIC */ + u32 rx_copybreak; /* copy packets smaller than this */ + u16 default_num_queues; /* default num queues to set up */ + + struct gve_queue_config tx_cfg; + struct gve_queue_config rx_cfg; + struct gve_qpl_config qpl_cfg; /* map used QPL ids */ + u32 num_ntfy_blks; /* spilt between TX and RX so must be even */ + + struct gve_registers __iomem *reg_bar0; /* see gve_register.h */ + __be32 __iomem *db_bar2; /* "array" of doorbells */ + u32 msg_enable; /* level for netif* netdev print macros */ + struct pci_dev *pdev; + + /* metrics */ + u32 tx_timeo_cnt; + + /* Admin queue - see gve_adminq.h*/ + union gve_adminq_command *adminq; + dma_addr_t adminq_bus_addr; + u32 adminq_mask; /* masks prod_cnt to adminq size */ + u32 adminq_prod_cnt; /* free-running count of AQ cmds executed */ + u32 adminq_cmd_fail; /* free-running count of AQ cmds failed */ + u32 adminq_timeouts; /* free-running count of AQ cmds timeouts */ + /* free-running count of per AQ cmd executed */ + u32 adminq_describe_device_cnt; + u32 adminq_cfg_device_resources_cnt; + u32 adminq_register_page_list_cnt; + u32 adminq_unregister_page_list_cnt; + u32 adminq_create_tx_queue_cnt; + u32 adminq_create_rx_queue_cnt; + u32 adminq_destroy_tx_queue_cnt; + u32 adminq_destroy_rx_queue_cnt; + u32 adminq_dcfg_device_resources_cnt; + u32 adminq_set_driver_parameter_cnt; + u32 adminq_report_stats_cnt; + u32 adminq_report_link_speed_cnt; + + /* Global stats */ + u32 interface_up_cnt; /* count of times interface turned up since last reset */ + u32 interface_down_cnt; /* count of times interface turned down since last reset */ + u32 reset_cnt; /* count of reset */ + u32 page_alloc_fail; /* count of page alloc fails */ + u32 dma_mapping_error; /* count of dma mapping errors */ + u32 stats_report_trigger_cnt; /* count of device-requested stats-reports since last reset */ + struct workqueue_struct *gve_wq; + struct work_struct service_task; + struct work_struct stats_report_task; + unsigned long service_task_flags; + unsigned long state_flags; + + struct gve_stats_report *stats_report; + u64 stats_report_len; + dma_addr_t stats_report_bus; /* dma address for the stats report */ + unsigned long ethtool_flags; + + unsigned long stats_report_timer_period; + struct timer_list stats_report_timer; + + /* Gvnic device link speed from hypervisor. */ + u64 link_speed; +}; + +enum gve_service_task_flags_bit { + GVE_PRIV_FLAGS_DO_RESET = 1, + GVE_PRIV_FLAGS_RESET_IN_PROGRESS = 2, + GVE_PRIV_FLAGS_PROBE_IN_PROGRESS = 3, + GVE_PRIV_FLAGS_DO_REPORT_STATS = 4, +}; + +enum gve_state_flags_bit { + GVE_PRIV_FLAGS_ADMIN_QUEUE_OK = 1, + GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK = 2, + GVE_PRIV_FLAGS_DEVICE_RINGS_OK = 3, + GVE_PRIV_FLAGS_NAPI_ENABLED = 4, +}; + +enum gve_ethtool_flags_bit { + GVE_PRIV_FLAGS_REPORT_STATS = 0, +}; + +static inline bool gve_get_do_reset(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_DO_RESET, &priv->service_task_flags); +} + +static inline void gve_set_do_reset(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_DO_RESET, &priv->service_task_flags); +} + +static inline void gve_clear_do_reset(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_DO_RESET, &priv->service_task_flags); +} + +static inline bool gve_get_reset_in_progress(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_RESET_IN_PROGRESS, + &priv->service_task_flags); +} + +static inline void gve_set_reset_in_progress(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_RESET_IN_PROGRESS, &priv->service_task_flags); +} + +static inline void gve_clear_reset_in_progress(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_RESET_IN_PROGRESS, &priv->service_task_flags); +} + +static inline bool gve_get_probe_in_progress(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_PROBE_IN_PROGRESS, + &priv->service_task_flags); +} + +static inline void gve_set_probe_in_progress(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_PROBE_IN_PROGRESS, &priv->service_task_flags); +} + +static inline void gve_clear_probe_in_progress(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_PROBE_IN_PROGRESS, &priv->service_task_flags); +} + +static inline bool gve_get_do_report_stats(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, + &priv->service_task_flags); +} + +static inline void gve_set_do_report_stats(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags); +} + +static inline void gve_clear_do_report_stats(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_DO_REPORT_STATS, &priv->service_task_flags); +} + +static inline bool gve_get_admin_queue_ok(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_ADMIN_QUEUE_OK, &priv->state_flags); +} + +static inline void gve_set_admin_queue_ok(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_ADMIN_QUEUE_OK, &priv->state_flags); +} + +static inline void gve_clear_admin_queue_ok(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_ADMIN_QUEUE_OK, &priv->state_flags); +} + +static inline bool gve_get_device_resources_ok(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK, &priv->state_flags); +} + +static inline void gve_set_device_resources_ok(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK, &priv->state_flags); +} + +static inline void gve_clear_device_resources_ok(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_DEVICE_RESOURCES_OK, &priv->state_flags); +} + +static inline bool gve_get_device_rings_ok(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_DEVICE_RINGS_OK, &priv->state_flags); +} + +static inline void gve_set_device_rings_ok(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_DEVICE_RINGS_OK, &priv->state_flags); +} + +static inline void gve_clear_device_rings_ok(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_DEVICE_RINGS_OK, &priv->state_flags); +} + +static inline bool gve_get_napi_enabled(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_NAPI_ENABLED, &priv->state_flags); +} + +static inline void gve_set_napi_enabled(struct gve_priv *priv) +{ + set_bit(GVE_PRIV_FLAGS_NAPI_ENABLED, &priv->state_flags); +} + +static inline void gve_clear_napi_enabled(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_NAPI_ENABLED, &priv->state_flags); +} + +static inline bool gve_get_report_stats(struct gve_priv *priv) +{ + return test_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags); +} + +static inline void gve_clear_report_stats(struct gve_priv *priv) +{ + clear_bit(GVE_PRIV_FLAGS_REPORT_STATS, &priv->ethtool_flags); +} + +/* Returns the address of the ntfy_blocks irq doorbell + */ +static inline __be32 __iomem *gve_irq_doorbell(struct gve_priv *priv, + struct gve_notify_block *block) +{ + return &priv->db_bar2[be32_to_cpu(block->irq_db_index)]; +} + +/* Returns the index into ntfy_blocks of the given tx ring's block + */ +static inline u32 gve_tx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx) +{ + return queue_idx; +} + +/* Returns the index into ntfy_blocks of the given rx ring's block + */ +static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx) +{ + return (priv->num_ntfy_blks / 2) + queue_idx; +} + +/* Returns the number of tx queue page lists + */ +static inline u32 gve_num_tx_qpls(struct gve_priv *priv) +{ + return priv->tx_cfg.num_queues; +} + +/* Returns the number of rx queue page lists + */ +static inline u32 gve_num_rx_qpls(struct gve_priv *priv) +{ + return priv->rx_cfg.num_queues; +} + +/* Returns a pointer to the next available tx qpl in the list of qpls + */ +static inline +struct gve_queue_page_list *gve_assign_tx_qpl(struct gve_priv *priv) +{ + int id = find_first_zero_bit(priv->qpl_cfg.qpl_id_map, + priv->qpl_cfg.qpl_map_size); + + /* we are out of tx qpls */ + if (id >= gve_num_tx_qpls(priv)) + return NULL; + + set_bit(id, priv->qpl_cfg.qpl_id_map); + return &priv->qpls[id]; +} + +/* Returns a pointer to the next available rx qpl in the list of qpls + */ +static inline +struct gve_queue_page_list *gve_assign_rx_qpl(struct gve_priv *priv) +{ + int id = find_next_zero_bit(priv->qpl_cfg.qpl_id_map, + priv->qpl_cfg.qpl_map_size, + gve_num_tx_qpls(priv)); + + /* we are out of rx qpls */ + if (id == gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv)) + return NULL; + + set_bit(id, priv->qpl_cfg.qpl_id_map); + return &priv->qpls[id]; +} + +/* Unassigns the qpl with the given id + */ +static inline void gve_unassign_qpl(struct gve_priv *priv, int id) +{ + clear_bit(id, priv->qpl_cfg.qpl_id_map); +} + +/* Returns the correct dma direction for tx and rx qpls + */ +static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv, + int id) +{ + if (id < gve_num_tx_qpls(priv)) + return DMA_TO_DEVICE; + else + return DMA_FROM_DEVICE; +} + +/* Returns true if the max mtu allows page recycling */ +static inline bool gve_can_recycle_pages(struct net_device *dev) +{ + /* We can't recycle the pages if we can't fit a packet into half a + * page. + */ + return dev->max_mtu <= PAGE_SIZE / 2; +} + +/* buffers */ +int gve_alloc_page(struct gve_priv *priv, struct device *dev, + struct page **page, dma_addr_t *dma, + enum dma_data_direction); +void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, + enum dma_data_direction); +/* tx handling */ +netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev); +bool gve_tx_poll(struct gve_notify_block *block, int budget); +int gve_tx_alloc_rings(struct gve_priv *priv); +void gve_tx_free_rings(struct gve_priv *priv); +__be32 gve_tx_load_event_counter(struct gve_priv *priv, + struct gve_tx_ring *tx); +/* rx handling */ +void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx); +bool gve_rx_poll(struct gve_notify_block *block, int budget); +int gve_rx_alloc_rings(struct gve_priv *priv); +void gve_rx_free_rings(struct gve_priv *priv); +bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, + netdev_features_t feat); +/* Reset */ +void gve_schedule_reset(struct gve_priv *priv); +int gve_reset(struct gve_priv *priv, bool attempt_teardown); +int gve_adjust_queues(struct gve_priv *priv, + struct gve_queue_config new_rx_config, + struct gve_queue_config new_tx_config); +/* report stats handling */ +void gve_handle_report_stats(struct gve_priv *priv); +/* exported by ethtool.c */ +extern const struct ethtool_ops gve_ethtool_ops; +/* needed by ethtool */ +extern const char gve_version_str[]; +#endif /* _GVE_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c new file mode 100644 index 000000000..67f2b9a61 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -0,0 +1,630 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#include <linux/etherdevice.h> +#include <linux/pci.h> +#include "gve.h" +#include "gve_adminq.h" +#include "gve_register.h" + +#define GVE_MAX_ADMINQ_RELEASE_CHECK 500 +#define GVE_ADMINQ_SLEEP_LEN 20 +#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 100 + +int gve_adminq_alloc(struct device *dev, struct gve_priv *priv) +{ + priv->adminq = dma_alloc_coherent(dev, PAGE_SIZE, + &priv->adminq_bus_addr, GFP_KERNEL); + if (unlikely(!priv->adminq)) + return -ENOMEM; + + priv->adminq_mask = (PAGE_SIZE / sizeof(union gve_adminq_command)) - 1; + priv->adminq_prod_cnt = 0; + priv->adminq_cmd_fail = 0; + priv->adminq_timeouts = 0; + priv->adminq_describe_device_cnt = 0; + priv->adminq_cfg_device_resources_cnt = 0; + priv->adminq_register_page_list_cnt = 0; + priv->adminq_unregister_page_list_cnt = 0; + priv->adminq_create_tx_queue_cnt = 0; + priv->adminq_create_rx_queue_cnt = 0; + priv->adminq_destroy_tx_queue_cnt = 0; + priv->adminq_destroy_rx_queue_cnt = 0; + priv->adminq_dcfg_device_resources_cnt = 0; + priv->adminq_set_driver_parameter_cnt = 0; + priv->adminq_report_stats_cnt = 0; + priv->adminq_report_link_speed_cnt = 0; + + /* Setup Admin queue with the device */ + iowrite32be(priv->adminq_bus_addr / PAGE_SIZE, + &priv->reg_bar0->adminq_pfn); + + gve_set_admin_queue_ok(priv); + return 0; +} + +void gve_adminq_release(struct gve_priv *priv) +{ + int i = 0; + + /* Tell the device the adminq is leaving */ + iowrite32be(0x0, &priv->reg_bar0->adminq_pfn); + while (ioread32be(&priv->reg_bar0->adminq_pfn)) { + /* If this is reached the device is unrecoverable and still + * holding memory. Continue looping to avoid memory corruption, + * but WARN so it is visible what is going on. + */ + if (i == GVE_MAX_ADMINQ_RELEASE_CHECK) + WARN(1, "Unrecoverable platform error!"); + i++; + msleep(GVE_ADMINQ_SLEEP_LEN); + } + gve_clear_device_rings_ok(priv); + gve_clear_device_resources_ok(priv); + gve_clear_admin_queue_ok(priv); +} + +void gve_adminq_free(struct device *dev, struct gve_priv *priv) +{ + if (!gve_get_admin_queue_ok(priv)) + return; + gve_adminq_release(priv); + dma_free_coherent(dev, PAGE_SIZE, priv->adminq, priv->adminq_bus_addr); + gve_clear_admin_queue_ok(priv); +} + +static void gve_adminq_kick_cmd(struct gve_priv *priv, u32 prod_cnt) +{ + iowrite32be(prod_cnt, &priv->reg_bar0->adminq_doorbell); +} + +static bool gve_adminq_wait_for_cmd(struct gve_priv *priv, u32 prod_cnt) +{ + int i; + + for (i = 0; i < GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK; i++) { + if (ioread32be(&priv->reg_bar0->adminq_event_counter) + == prod_cnt) + return true; + msleep(GVE_ADMINQ_SLEEP_LEN); + } + + return false; +} + +static int gve_adminq_parse_err(struct gve_priv *priv, u32 status) +{ + if (status != GVE_ADMINQ_COMMAND_PASSED && + status != GVE_ADMINQ_COMMAND_UNSET) { + dev_err(&priv->pdev->dev, "AQ command failed with status %d\n", status); + priv->adminq_cmd_fail++; + } + switch (status) { + case GVE_ADMINQ_COMMAND_PASSED: + return 0; + case GVE_ADMINQ_COMMAND_UNSET: + dev_err(&priv->pdev->dev, "parse_aq_err: err and status both unset, this should not be possible.\n"); + return -EINVAL; + case GVE_ADMINQ_COMMAND_ERROR_ABORTED: + case GVE_ADMINQ_COMMAND_ERROR_CANCELLED: + case GVE_ADMINQ_COMMAND_ERROR_DATALOSS: + case GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION: + case GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE: + return -EAGAIN; + case GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS: + case GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR: + case GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT: + case GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND: + case GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE: + case GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR: + return -EINVAL; + case GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED: + return -ETIME; + case GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED: + case GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED: + return -EACCES; + case GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED: + return -ENOMEM; + case GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED: + return -ENOTSUPP; + default: + dev_err(&priv->pdev->dev, "parse_aq_err: unknown status code %d\n", status); + return -EINVAL; + } +} + +/* Flushes all AQ commands currently queued and waits for them to complete. + * If there are failures, it will return the first error. + */ +static int gve_adminq_kick_and_wait(struct gve_priv *priv) +{ + int tail, head; + int i; + + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + head = priv->adminq_prod_cnt; + + gve_adminq_kick_cmd(priv, head); + if (!gve_adminq_wait_for_cmd(priv, head)) { + dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n"); + priv->adminq_timeouts++; + return -ENOTRECOVERABLE; + } + + for (i = tail; i < head; i++) { + union gve_adminq_command *cmd; + u32 status, err; + + cmd = &priv->adminq[i & priv->adminq_mask]; + status = be32_to_cpu(READ_ONCE(cmd->status)); + err = gve_adminq_parse_err(priv, status); + if (err) + // Return the first error if we failed. + return err; + } + + return 0; +} + +/* This function is not threadsafe - the caller is responsible for any + * necessary locks. + */ +static int gve_adminq_issue_cmd(struct gve_priv *priv, + union gve_adminq_command *cmd_orig) +{ + union gve_adminq_command *cmd; + u32 opcode; + u32 tail; + + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + + // Check if next command will overflow the buffer. + if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == + (tail & priv->adminq_mask)) { + int err; + + // Flush existing commands to make room. + err = gve_adminq_kick_and_wait(priv); + if (err) + return err; + + // Retry. + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) == + (tail & priv->adminq_mask)) { + // This should never happen. We just flushed the + // command queue so there should be enough space. + return -ENOMEM; + } + } + + cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask]; + priv->adminq_prod_cnt++; + + memcpy(cmd, cmd_orig, sizeof(*cmd_orig)); + opcode = be32_to_cpu(READ_ONCE(cmd->opcode)); + + switch (opcode) { + case GVE_ADMINQ_DESCRIBE_DEVICE: + priv->adminq_describe_device_cnt++; + break; + case GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES: + priv->adminq_cfg_device_resources_cnt++; + break; + case GVE_ADMINQ_REGISTER_PAGE_LIST: + priv->adminq_register_page_list_cnt++; + break; + case GVE_ADMINQ_UNREGISTER_PAGE_LIST: + priv->adminq_unregister_page_list_cnt++; + break; + case GVE_ADMINQ_CREATE_TX_QUEUE: + priv->adminq_create_tx_queue_cnt++; + break; + case GVE_ADMINQ_CREATE_RX_QUEUE: + priv->adminq_create_rx_queue_cnt++; + break; + case GVE_ADMINQ_DESTROY_TX_QUEUE: + priv->adminq_destroy_tx_queue_cnt++; + break; + case GVE_ADMINQ_DESTROY_RX_QUEUE: + priv->adminq_destroy_rx_queue_cnt++; + break; + case GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES: + priv->adminq_dcfg_device_resources_cnt++; + break; + case GVE_ADMINQ_SET_DRIVER_PARAMETER: + priv->adminq_set_driver_parameter_cnt++; + break; + case GVE_ADMINQ_REPORT_STATS: + priv->adminq_report_stats_cnt++; + break; + case GVE_ADMINQ_REPORT_LINK_SPEED: + priv->adminq_report_link_speed_cnt++; + break; + default: + dev_err(&priv->pdev->dev, "unknown AQ command opcode %d\n", opcode); + } + + return 0; +} + +/* This function is not threadsafe - the caller is responsible for any + * necessary locks. + * The caller is also responsible for making sure there are no commands + * waiting to be executed. + */ +static int gve_adminq_execute_cmd(struct gve_priv *priv, union gve_adminq_command *cmd_orig) +{ + u32 tail, head; + int err; + + tail = ioread32be(&priv->reg_bar0->adminq_event_counter); + head = priv->adminq_prod_cnt; + if (tail != head) + // This is not a valid path + return -EINVAL; + + err = gve_adminq_issue_cmd(priv, cmd_orig); + if (err) + return err; + + return gve_adminq_kick_and_wait(priv); +} + +/* The device specifies that the management vector can either be the first irq + * or the last irq. ntfy_blk_msix_base_idx indicates the first irq assigned to + * the ntfy blks. It if is 0 then the management vector is last, if it is 1 then + * the management vector is first. + * + * gve arranges the msix vectors so that the management vector is last. + */ +#define GVE_NTFY_BLK_BASE_MSIX_IDX 0 +int gve_adminq_configure_device_resources(struct gve_priv *priv, + dma_addr_t counter_array_bus_addr, + u32 num_counters, + dma_addr_t db_array_bus_addr, + u32 num_ntfy_blks) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES); + cmd.configure_device_resources = + (struct gve_adminq_configure_device_resources) { + .counter_array = cpu_to_be64(counter_array_bus_addr), + .num_counters = cpu_to_be32(num_counters), + .irq_db_addr = cpu_to_be64(db_array_bus_addr), + .num_irq_dbs = cpu_to_be32(num_ntfy_blks), + .irq_db_stride = cpu_to_be32(sizeof(priv->ntfy_blocks[0])), + .ntfy_blk_msix_base_idx = + cpu_to_be32(GVE_NTFY_BLK_BASE_MSIX_IDX), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_deconfigure_device_resources(struct gve_priv *priv) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES); + + return gve_adminq_execute_cmd(priv, &cmd); +} + +static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) +{ + struct gve_tx_ring *tx = &priv->tx[queue_index]; + union gve_adminq_command cmd; + int err; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE); + cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) { + .queue_id = cpu_to_be32(queue_index), + .reserved = 0, + .queue_resources_addr = + cpu_to_be64(tx->q_resources_bus), + .tx_ring_addr = cpu_to_be64(tx->bus), + .queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id), + .ntfy_id = cpu_to_be32(tx->ntfy_id), + }; + + err = gve_adminq_issue_cmd(priv, &cmd); + if (err) + return err; + + return 0; +} + +int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_tx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) +{ + struct gve_rx_ring *rx = &priv->rx[queue_index]; + union gve_adminq_command cmd; + int err; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE); + cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { + .queue_id = cpu_to_be32(queue_index), + .index = cpu_to_be32(queue_index), + .reserved = 0, + .ntfy_id = cpu_to_be32(rx->ntfy_id), + .queue_resources_addr = cpu_to_be64(rx->q_resources_bus), + .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus), + .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus), + .queue_page_list_id = cpu_to_be32(rx->data.qpl->id), + }; + + err = gve_adminq_issue_cmd(priv, &cmd); + if (err) + return err; + + return 0; +} + +int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_rx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index) +{ + union gve_adminq_command cmd; + int err; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_TX_QUEUE); + cmd.destroy_tx_queue = (struct gve_adminq_destroy_tx_queue) { + .queue_id = cpu_to_be32(queue_index), + }; + + err = gve_adminq_issue_cmd(priv, &cmd); + if (err) + return err; + + return 0; +} + +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_tx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +static int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index) +{ + union gve_adminq_command cmd; + int err; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESTROY_RX_QUEUE); + cmd.destroy_rx_queue = (struct gve_adminq_destroy_rx_queue) { + .queue_id = cpu_to_be32(queue_index), + }; + + err = gve_adminq_issue_cmd(priv, &cmd); + if (err) + return err; + + return 0; +} + +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_rx_queue(priv, i); + if (err) + return err; + } + + return gve_adminq_kick_and_wait(priv); +} + +int gve_adminq_describe_device(struct gve_priv *priv) +{ + struct gve_device_descriptor *descriptor; + union gve_adminq_command cmd; + dma_addr_t descriptor_bus; + int err = 0; + u8 *mac; + u16 mtu; + + memset(&cmd, 0, sizeof(cmd)); + descriptor = dma_alloc_coherent(&priv->pdev->dev, PAGE_SIZE, + &descriptor_bus, GFP_KERNEL); + if (!descriptor) + return -ENOMEM; + cmd.opcode = cpu_to_be32(GVE_ADMINQ_DESCRIBE_DEVICE); + cmd.describe_device.device_descriptor_addr = + cpu_to_be64(descriptor_bus); + cmd.describe_device.device_descriptor_version = + cpu_to_be32(GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION); + cmd.describe_device.available_length = cpu_to_be32(PAGE_SIZE); + + err = gve_adminq_execute_cmd(priv, &cmd); + if (err) + goto free_device_descriptor; + + priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); + if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { + dev_err(&priv->pdev->dev, "Tx desc count %d too low\n", priv->tx_desc_cnt); + err = -EINVAL; + goto free_device_descriptor; + } + priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); + if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) + < PAGE_SIZE || + priv->rx_desc_cnt * sizeof(priv->rx->data.data_ring[0]) + < PAGE_SIZE) { + dev_err(&priv->pdev->dev, "Rx desc count %d too low\n", priv->rx_desc_cnt); + err = -EINVAL; + goto free_device_descriptor; + } + priv->max_registered_pages = + be64_to_cpu(descriptor->max_registered_pages); + mtu = be16_to_cpu(descriptor->mtu); + if (mtu < ETH_MIN_MTU) { + dev_err(&priv->pdev->dev, "MTU %d below minimum MTU\n", mtu); + err = -EINVAL; + goto free_device_descriptor; + } + priv->dev->max_mtu = mtu; + priv->num_event_counters = be16_to_cpu(descriptor->counters); + ether_addr_copy(priv->dev->dev_addr, descriptor->mac); + mac = descriptor->mac; + dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac); + priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl); + priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl); + if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) { + dev_err(&priv->pdev->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", + priv->rx_pages_per_qpl); + priv->rx_desc_cnt = priv->rx_pages_per_qpl; + } + priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues); + +free_device_descriptor: + dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor, + descriptor_bus); + return err; +} + +int gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl) +{ + struct device *hdev = &priv->pdev->dev; + u32 num_entries = qpl->num_entries; + u32 size = num_entries * sizeof(qpl->page_buses[0]); + union gve_adminq_command cmd; + dma_addr_t page_list_bus; + __be64 *page_list; + int err; + int i; + + memset(&cmd, 0, sizeof(cmd)); + page_list = dma_alloc_coherent(hdev, size, &page_list_bus, GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + page_list[i] = cpu_to_be64(qpl->page_buses[i]); + + cmd.opcode = cpu_to_be32(GVE_ADMINQ_REGISTER_PAGE_LIST); + cmd.reg_page_list = (struct gve_adminq_register_page_list) { + .page_list_id = cpu_to_be32(qpl->id), + .num_pages = cpu_to_be32(num_entries), + .page_address_list_addr = cpu_to_be64(page_list_bus), + }; + + err = gve_adminq_execute_cmd(priv, &cmd); + dma_free_coherent(hdev, size, page_list, page_list_bus); + return err; +} + +int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_UNREGISTER_PAGE_LIST); + cmd.unreg_page_list = (struct gve_adminq_unregister_page_list) { + .page_list_id = cpu_to_be32(page_list_id), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_SET_DRIVER_PARAMETER); + cmd.set_driver_param = (struct gve_adminq_set_driver_parameter) { + .parameter_type = cpu_to_be32(GVE_SET_PARAM_MTU), + .parameter_value = cpu_to_be64(mtu), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, + dma_addr_t stats_report_addr, u64 interval) +{ + union gve_adminq_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_STATS); + cmd.report_stats = (struct gve_adminq_report_stats) { + .stats_report_len = cpu_to_be64(stats_report_len), + .stats_report_addr = cpu_to_be64(stats_report_addr), + .interval = cpu_to_be64(interval), + }; + + return gve_adminq_execute_cmd(priv, &cmd); +} + +int gve_adminq_report_link_speed(struct gve_priv *priv) +{ + union gve_adminq_command gvnic_cmd; + dma_addr_t link_speed_region_bus; + __be64 *link_speed_region; + int err; + + link_speed_region = + dma_alloc_coherent(&priv->pdev->dev, sizeof(*link_speed_region), + &link_speed_region_bus, GFP_KERNEL); + + if (!link_speed_region) + return -ENOMEM; + + memset(&gvnic_cmd, 0, sizeof(gvnic_cmd)); + gvnic_cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_LINK_SPEED); + gvnic_cmd.report_link_speed.link_speed_address = + cpu_to_be64(link_speed_region_bus); + + err = gve_adminq_execute_cmd(priv, &gvnic_cmd); + + priv->link_speed = be64_to_cpu(*link_speed_region); + dma_free_coherent(&priv->pdev->dev, sizeof(*link_speed_region), link_speed_region, + link_speed_region_bus); + return err; +} diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h new file mode 100644 index 000000000..8dbc2c03f --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#ifndef _GVE_ADMINQ_H +#define _GVE_ADMINQ_H + +#include <linux/build_bug.h> + +/* Admin queue opcodes */ +enum gve_adminq_opcodes { + GVE_ADMINQ_DESCRIBE_DEVICE = 0x1, + GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES = 0x2, + GVE_ADMINQ_REGISTER_PAGE_LIST = 0x3, + GVE_ADMINQ_UNREGISTER_PAGE_LIST = 0x4, + GVE_ADMINQ_CREATE_TX_QUEUE = 0x5, + GVE_ADMINQ_CREATE_RX_QUEUE = 0x6, + GVE_ADMINQ_DESTROY_TX_QUEUE = 0x7, + GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8, + GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, + GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, + GVE_ADMINQ_REPORT_STATS = 0xC, + GVE_ADMINQ_REPORT_LINK_SPEED = 0xD +}; + +/* Admin queue status codes */ +enum gve_adminq_statuses { + GVE_ADMINQ_COMMAND_UNSET = 0x0, + GVE_ADMINQ_COMMAND_PASSED = 0x1, + GVE_ADMINQ_COMMAND_ERROR_ABORTED = 0xFFFFFFF0, + GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS = 0xFFFFFFF1, + GVE_ADMINQ_COMMAND_ERROR_CANCELLED = 0xFFFFFFF2, + GVE_ADMINQ_COMMAND_ERROR_DATALOSS = 0xFFFFFFF3, + GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED = 0xFFFFFFF4, + GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION = 0xFFFFFFF5, + GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR = 0xFFFFFFF6, + GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT = 0xFFFFFFF7, + GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND = 0xFFFFFFF8, + GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE = 0xFFFFFFF9, + GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED = 0xFFFFFFFA, + GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED = 0xFFFFFFFB, + GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED = 0xFFFFFFFC, + GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE = 0xFFFFFFFD, + GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED = 0xFFFFFFFE, + GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR = 0xFFFFFFFF, +}; + +#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 + +/* All AdminQ command structs should be naturally packed. The static_assert + * calls make sure this is the case at compile time. + */ + +struct gve_adminq_describe_device { + __be64 device_descriptor_addr; + __be32 device_descriptor_version; + __be32 available_length; +}; + +static_assert(sizeof(struct gve_adminq_describe_device) == 16); + +struct gve_device_descriptor { + __be64 max_registered_pages; + __be16 reserved1; + __be16 tx_queue_entries; + __be16 rx_queue_entries; + __be16 default_num_queues; + __be16 mtu; + __be16 counters; + __be16 tx_pages_per_qpl; + __be16 rx_pages_per_qpl; + u8 mac[ETH_ALEN]; + __be16 num_device_options; + __be16 total_length; + u8 reserved2[6]; +}; + +static_assert(sizeof(struct gve_device_descriptor) == 40); + +struct device_option { + __be32 option_id; + __be32 option_length; +}; + +static_assert(sizeof(struct device_option) == 8); + +struct gve_adminq_configure_device_resources { + __be64 counter_array; + __be64 irq_db_addr; + __be32 num_counters; + __be32 num_irq_dbs; + __be32 irq_db_stride; + __be32 ntfy_blk_msix_base_idx; +}; + +static_assert(sizeof(struct gve_adminq_configure_device_resources) == 32); + +struct gve_adminq_register_page_list { + __be32 page_list_id; + __be32 num_pages; + __be64 page_address_list_addr; +}; + +static_assert(sizeof(struct gve_adminq_register_page_list) == 16); + +struct gve_adminq_unregister_page_list { + __be32 page_list_id; +}; + +static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4); + +struct gve_adminq_create_tx_queue { + __be32 queue_id; + __be32 reserved; + __be64 queue_resources_addr; + __be64 tx_ring_addr; + __be32 queue_page_list_id; + __be32 ntfy_id; +}; + +static_assert(sizeof(struct gve_adminq_create_tx_queue) == 32); + +struct gve_adminq_create_rx_queue { + __be32 queue_id; + __be32 index; + __be32 reserved; + __be32 ntfy_id; + __be64 queue_resources_addr; + __be64 rx_desc_ring_addr; + __be64 rx_data_ring_addr; + __be32 queue_page_list_id; + u8 padding[4]; +}; + +static_assert(sizeof(struct gve_adminq_create_rx_queue) == 48); + +/* Queue resources that are shared with the device */ +struct gve_queue_resources { + union { + struct { + __be32 db_index; /* Device -> Guest */ + __be32 counter_index; /* Device -> Guest */ + }; + u8 reserved[64]; + }; +}; + +static_assert(sizeof(struct gve_queue_resources) == 64); + +struct gve_adminq_destroy_tx_queue { + __be32 queue_id; +}; + +static_assert(sizeof(struct gve_adminq_destroy_tx_queue) == 4); + +struct gve_adminq_destroy_rx_queue { + __be32 queue_id; +}; + +static_assert(sizeof(struct gve_adminq_destroy_rx_queue) == 4); + +/* GVE Set Driver Parameter Types */ +enum gve_set_driver_param_types { + GVE_SET_PARAM_MTU = 0x1, +}; + +struct gve_adminq_set_driver_parameter { + __be32 parameter_type; + u8 reserved[4]; + __be64 parameter_value; +}; + +static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16); + +struct gve_adminq_report_stats { + __be64 stats_report_len; + __be64 stats_report_addr; + __be64 interval; +}; + +static_assert(sizeof(struct gve_adminq_report_stats) == 24); + +struct gve_adminq_report_link_speed { + __be64 link_speed_address; +}; + +static_assert(sizeof(struct gve_adminq_report_link_speed) == 8); + +struct stats { + __be32 stat_name; + __be32 queue_id; + __be64 value; +}; + +static_assert(sizeof(struct stats) == 16); + +struct gve_stats_report { + __be64 written_count; + struct stats stats[]; +}; + +static_assert(sizeof(struct gve_stats_report) == 8); + +enum gve_stat_names { + // stats from gve + TX_WAKE_CNT = 1, + TX_STOP_CNT = 2, + TX_FRAMES_SENT = 3, + TX_BYTES_SENT = 4, + TX_LAST_COMPLETION_PROCESSED = 5, + RX_NEXT_EXPECTED_SEQUENCE = 6, + RX_BUFFERS_POSTED = 7, + TX_TIMEOUT_CNT = 8, + // stats from NIC + RX_QUEUE_DROP_CNT = 65, + RX_NO_BUFFERS_POSTED = 66, + RX_DROPS_PACKET_OVER_MRU = 67, + RX_DROPS_INVALID_CHECKSUM = 68, +}; + +union gve_adminq_command { + struct { + __be32 opcode; + __be32 status; + union { + struct gve_adminq_configure_device_resources + configure_device_resources; + struct gve_adminq_create_tx_queue create_tx_queue; + struct gve_adminq_create_rx_queue create_rx_queue; + struct gve_adminq_destroy_tx_queue destroy_tx_queue; + struct gve_adminq_destroy_rx_queue destroy_rx_queue; + struct gve_adminq_describe_device describe_device; + struct gve_adminq_register_page_list reg_page_list; + struct gve_adminq_unregister_page_list unreg_page_list; + struct gve_adminq_set_driver_parameter set_driver_param; + struct gve_adminq_report_stats report_stats; + struct gve_adminq_report_link_speed report_link_speed; + }; + }; + u8 reserved[64]; +}; + +static_assert(sizeof(union gve_adminq_command) == 64); + +int gve_adminq_alloc(struct device *dev, struct gve_priv *priv); +void gve_adminq_free(struct device *dev, struct gve_priv *priv); +void gve_adminq_release(struct gve_priv *priv); +int gve_adminq_describe_device(struct gve_priv *priv); +int gve_adminq_configure_device_resources(struct gve_priv *priv, + dma_addr_t counter_array_bus_addr, + u32 num_counters, + dma_addr_t db_array_bus_addr, + u32 num_ntfy_blks); +int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); +int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues); +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 queue_id); +int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues); +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 queue_id); +int gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl); +int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id); +int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); +int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, + dma_addr_t stats_report_addr, u64 interval); +int gve_adminq_report_link_speed(struct gve_priv *priv); +#endif /* _GVE_ADMINQ_H */ diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h new file mode 100644 index 000000000..54779871d --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_desc.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +/* GVE Transmit Descriptor formats */ + +#ifndef _GVE_DESC_H_ +#define _GVE_DESC_H_ + +#include <linux/build_bug.h> + +/* A note on seg_addrs + * + * Base addresses encoded in seg_addr are not assumed to be physical + * addresses. The ring format assumes these come from some linear address + * space. This could be physical memory, kernel virtual memory, user virtual + * memory. gVNIC uses lists of registered pages. Each queue is assumed + * to be associated with a single such linear address space to ensure a + * consistent meaning for seg_addrs posted to its rings. + */ + +struct gve_tx_pkt_desc { + u8 type_flags; /* desc type is lower 4 bits, flags upper */ + u8 l4_csum_offset; /* relative offset of L4 csum word */ + u8 l4_hdr_offset; /* Offset of start of L4 headers in packet */ + u8 desc_cnt; /* Total descriptors for this packet */ + __be16 len; /* Total length of this packet (in bytes) */ + __be16 seg_len; /* Length of this descriptor's segment */ + __be64 seg_addr; /* Base address (see note) of this segment */ +} __packed; + +struct gve_tx_seg_desc { + u8 type_flags; /* type is lower 4 bits, flags upper */ + u8 l3_offset; /* TSO: 2 byte units to start of IPH */ + __be16 reserved; + __be16 mss; /* TSO MSS */ + __be16 seg_len; + __be64 seg_addr; +} __packed; + +/* GVE Transmit Descriptor Types */ +#define GVE_TXD_STD (0x0 << 4) /* Std with Host Address */ +#define GVE_TXD_TSO (0x1 << 4) /* TSO with Host Address */ +#define GVE_TXD_SEG (0x2 << 4) /* Seg with Host Address */ + +/* GVE Transmit Descriptor Flags for Std Pkts */ +#define GVE_TXF_L4CSUM BIT(0) /* Need csum offload */ +#define GVE_TXF_TSTAMP BIT(2) /* Timestamp required */ + +/* GVE Transmit Descriptor Flags for TSO Segs */ +#define GVE_TXSF_IPV6 BIT(1) /* IPv6 TSO */ + +/* GVE Receive Packet Descriptor */ +/* The start of an ethernet packet comes 2 bytes into the rx buffer. + * gVNIC adds this padding so that both the DMA and the L3/4 protocol header + * access is aligned. + */ +#define GVE_RX_PAD 2 + +struct gve_rx_desc { + u8 padding[48]; + __be32 rss_hash; /* Receive-side scaling hash (Toeplitz for gVNIC) */ + __be16 mss; + __be16 reserved; /* Reserved to zero */ + u8 hdr_len; /* Header length (L2-L4) including padding */ + u8 hdr_off; /* 64-byte-scaled offset into RX_DATA entry */ + __sum16 csum; /* 1's-complement partial checksum of L3+ bytes */ + __be16 len; /* Length of the received packet */ + __be16 flags_seq; /* Flags [15:3] and sequence number [2:0] (1-7) */ +} __packed; +static_assert(sizeof(struct gve_rx_desc) == 64); + +/* As with the Tx ring format, the qpl_offset entries below are offsets into an + * ordered list of registered pages. + */ +struct gve_rx_data_slot { + /* byte offset into the rx registered segment of this slot */ + __be64 qpl_offset; +}; + +/* GVE Recive Packet Descriptor Seq No */ +#define GVE_SEQNO(x) (be16_to_cpu(x) & 0x7) + +/* GVE Recive Packet Descriptor Flags */ +#define GVE_RXFLG(x) cpu_to_be16(1 << (3 + (x))) +#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */ +#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */ +#define GVE_RXF_IPV6 GVE_RXFLG(5) /* IPv6 */ +#define GVE_RXF_TCP GVE_RXFLG(6) /* TCP Packet */ +#define GVE_RXF_UDP GVE_RXFLG(7) /* UDP Packet */ +#define GVE_RXF_ERR GVE_RXFLG(8) /* Packet Error Detected */ + +/* GVE IRQ */ +#define GVE_IRQ_ACK BIT(31) +#define GVE_IRQ_MASK BIT(30) +#define GVE_IRQ_EVENT BIT(29) + +static inline bool gve_needs_rss(__be16 flag) +{ + if (flag & GVE_RXF_FRAG) + return false; + if (flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) + return true; + return false; +} + +static inline u8 gve_next_seqno(u8 seq) +{ + return (seq + 1) == 8 ? 1 : seq + 1; +} +#endif /* _GVE_DESC_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c new file mode 100644 index 000000000..cbfd00744 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#include <linux/rtnetlink.h> +#include "gve.h" +#include "gve_adminq.h" + +static void gve_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *info) +{ + struct gve_priv *priv = netdev_priv(netdev); + + strlcpy(info->driver, "gve", sizeof(info->driver)); + strlcpy(info->version, gve_version_str, sizeof(info->version)); + strlcpy(info->bus_info, pci_name(priv->pdev), sizeof(info->bus_info)); +} + +static void gve_set_msglevel(struct net_device *netdev, u32 value) +{ + struct gve_priv *priv = netdev_priv(netdev); + + priv->msg_enable = value; +} + +static u32 gve_get_msglevel(struct net_device *netdev) +{ + struct gve_priv *priv = netdev_priv(netdev); + + return priv->msg_enable; +} + +static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = { + "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", + "rx_dropped", "tx_dropped", "tx_timeouts", + "rx_skb_alloc_fail", "rx_buf_alloc_fail", "rx_desc_err_dropped_pkt", + "interface_up_cnt", "interface_down_cnt", "reset_cnt", + "page_alloc_fail", "dma_mapping_error", "stats_report_trigger_cnt", +}; + +static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = { + "rx_posted_desc[%u]", "rx_completed_desc[%u]", "rx_bytes[%u]", + "rx_dropped_pkt[%u]", "rx_copybreak_pkt[%u]", "rx_copied_pkt[%u]", + "rx_queue_drop_cnt[%u]", "rx_no_buffers_posted[%u]", + "rx_drops_packet_over_mru[%u]", "rx_drops_invalid_checksum[%u]", +}; + +static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { + "tx_posted_desc[%u]", "tx_completed_desc[%u]", "tx_bytes[%u]", + "tx_wake[%u]", "tx_stop[%u]", "tx_event_counter[%u]", +}; + +static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { + "adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts", + "adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt", + "adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt", + "adminq_create_tx_queue_cnt", "adminq_create_rx_queue_cnt", + "adminq_destroy_tx_queue_cnt", "adminq_destroy_rx_queue_cnt", + "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", + "adminq_report_stats_cnt", "adminq_report_link_speed_cnt" +}; + +static const char gve_gstrings_priv_flags[][ETH_GSTRING_LEN] = { + "report-stats", +}; + +#define GVE_MAIN_STATS_LEN ARRAY_SIZE(gve_gstrings_main_stats) +#define GVE_ADMINQ_STATS_LEN ARRAY_SIZE(gve_gstrings_adminq_stats) +#define NUM_GVE_TX_CNTS ARRAY_SIZE(gve_gstrings_tx_stats) +#define NUM_GVE_RX_CNTS ARRAY_SIZE(gve_gstrings_rx_stats) +#define GVE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(gve_gstrings_priv_flags) + +static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + struct gve_priv *priv = netdev_priv(netdev); + char *s = (char *)data; + int i, j; + + switch (stringset) { + case ETH_SS_STATS: + memcpy(s, *gve_gstrings_main_stats, + sizeof(gve_gstrings_main_stats)); + s += sizeof(gve_gstrings_main_stats); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + for (j = 0; j < NUM_GVE_RX_CNTS; j++) { + snprintf(s, ETH_GSTRING_LEN, + gve_gstrings_rx_stats[j], i); + s += ETH_GSTRING_LEN; + } + } + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + for (j = 0; j < NUM_GVE_TX_CNTS; j++) { + snprintf(s, ETH_GSTRING_LEN, + gve_gstrings_tx_stats[j], i); + s += ETH_GSTRING_LEN; + } + } + + memcpy(s, *gve_gstrings_adminq_stats, + sizeof(gve_gstrings_adminq_stats)); + s += sizeof(gve_gstrings_adminq_stats); + break; + + case ETH_SS_PRIV_FLAGS: + memcpy(s, *gve_gstrings_priv_flags, + sizeof(gve_gstrings_priv_flags)); + s += sizeof(gve_gstrings_priv_flags); + break; + + default: + break; + } +} + +static int gve_get_sset_count(struct net_device *netdev, int sset) +{ + struct gve_priv *priv = netdev_priv(netdev); + + switch (sset) { + case ETH_SS_STATS: + return GVE_MAIN_STATS_LEN + GVE_ADMINQ_STATS_LEN + + (priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS) + + (priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS); + case ETH_SS_PRIV_FLAGS: + return GVE_PRIV_FLAGS_STR_LEN; + default: + return -EOPNOTSUPP; + } +} + +static void +gve_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, u64 *data) +{ + u64 tmp_rx_pkts, tmp_rx_bytes, tmp_rx_skb_alloc_fail, tmp_rx_buf_alloc_fail, + tmp_rx_desc_err_dropped_pkt, tmp_tx_pkts, tmp_tx_bytes; + u64 rx_buf_alloc_fail, rx_desc_err_dropped_pkt, rx_pkts, + rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes; + int stats_idx, base_stats_idx, max_stats_idx; + struct stats *report_stats; + int *rx_qid_to_stats_idx; + int *tx_qid_to_stats_idx; + struct gve_priv *priv; + bool skip_nic_stats; + unsigned int start; + int ring; + int i, j; + + ASSERT_RTNL(); + + priv = netdev_priv(netdev); + report_stats = priv->stats_report->stats; + rx_qid_to_stats_idx = kmalloc_array(priv->rx_cfg.num_queues, + sizeof(int), GFP_KERNEL); + if (!rx_qid_to_stats_idx) + return; + tx_qid_to_stats_idx = kmalloc_array(priv->tx_cfg.num_queues, + sizeof(int), GFP_KERNEL); + if (!tx_qid_to_stats_idx) { + kfree(rx_qid_to_stats_idx); + return; + } + for (rx_pkts = 0, rx_bytes = 0, rx_skb_alloc_fail = 0, + rx_buf_alloc_fail = 0, rx_desc_err_dropped_pkt = 0, ring = 0; + ring < priv->rx_cfg.num_queues; ring++) { + if (priv->rx) { + do { + struct gve_rx_ring *rx = &priv->rx[ring]; + + start = + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); + tmp_rx_pkts = rx->rpackets; + tmp_rx_bytes = rx->rbytes; + tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; + tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; + tmp_rx_desc_err_dropped_pkt = + rx->rx_desc_err_dropped_pkt; + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, + start)); + rx_pkts += tmp_rx_pkts; + rx_bytes += tmp_rx_bytes; + rx_skb_alloc_fail += tmp_rx_skb_alloc_fail; + rx_buf_alloc_fail += tmp_rx_buf_alloc_fail; + rx_desc_err_dropped_pkt += tmp_rx_desc_err_dropped_pkt; + } + } + for (tx_pkts = 0, tx_bytes = 0, ring = 0; + ring < priv->tx_cfg.num_queues; ring++) { + if (priv->tx) { + do { + start = + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); + tmp_tx_pkts = priv->tx[ring].pkt_done; + tmp_tx_bytes = priv->tx[ring].bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, + start)); + tx_pkts += tmp_tx_pkts; + tx_bytes += tmp_tx_bytes; + } + } + + i = 0; + data[i++] = rx_pkts; + data[i++] = tx_pkts; + data[i++] = rx_bytes; + data[i++] = tx_bytes; + /* total rx dropped packets */ + data[i++] = rx_skb_alloc_fail + rx_buf_alloc_fail + + rx_desc_err_dropped_pkt; + /* Skip tx_dropped */ + i++; + + data[i++] = priv->tx_timeo_cnt; + data[i++] = rx_skb_alloc_fail; + data[i++] = rx_buf_alloc_fail; + data[i++] = rx_desc_err_dropped_pkt; + data[i++] = priv->interface_up_cnt; + data[i++] = priv->interface_down_cnt; + data[i++] = priv->reset_cnt; + data[i++] = priv->page_alloc_fail; + data[i++] = priv->dma_mapping_error; + data[i++] = priv->stats_report_trigger_cnt; + i = GVE_MAIN_STATS_LEN; + + /* For rx cross-reporting stats, start from nic rx stats in report */ + base_stats_idx = GVE_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues + + GVE_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues; + max_stats_idx = NIC_RX_STATS_REPORT_NUM * priv->rx_cfg.num_queues + + base_stats_idx; + /* Preprocess the stats report for rx, map queue id to start index */ + skip_nic_stats = false; + for (stats_idx = base_stats_idx; stats_idx < max_stats_idx; + stats_idx += NIC_RX_STATS_REPORT_NUM) { + u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name); + u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id); + + if (stat_name == 0) { + /* no stats written by NIC yet */ + skip_nic_stats = true; + break; + } + rx_qid_to_stats_idx[queue_id] = stats_idx; + } + /* walk RX rings */ + if (priv->rx) { + for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { + struct gve_rx_ring *rx = &priv->rx[ring]; + + data[i++] = rx->fill_cnt; + data[i++] = rx->cnt; + do { + start = + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); + tmp_rx_bytes = rx->rbytes; + tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; + tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; + tmp_rx_desc_err_dropped_pkt = + rx->rx_desc_err_dropped_pkt; + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, + start)); + data[i++] = tmp_rx_bytes; + /* rx dropped packets */ + data[i++] = tmp_rx_skb_alloc_fail + + tmp_rx_buf_alloc_fail + + tmp_rx_desc_err_dropped_pkt; + data[i++] = rx->rx_copybreak_pkt; + data[i++] = rx->rx_copied_pkt; + /* stats from NIC */ + if (skip_nic_stats) { + /* skip NIC rx stats */ + i += NIC_RX_STATS_REPORT_NUM; + continue; + } + for (j = 0; j < NIC_RX_STATS_REPORT_NUM; j++) { + u64 value = + be64_to_cpu(report_stats[rx_qid_to_stats_idx[ring] + j].value); + + data[i++] = value; + } + } + } else { + i += priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS; + } + + /* For tx cross-reporting stats, start from nic tx stats in report */ + base_stats_idx = max_stats_idx; + max_stats_idx = NIC_TX_STATS_REPORT_NUM * priv->tx_cfg.num_queues + + max_stats_idx; + /* Preprocess the stats report for tx, map queue id to start index */ + skip_nic_stats = false; + for (stats_idx = base_stats_idx; stats_idx < max_stats_idx; + stats_idx += NIC_TX_STATS_REPORT_NUM) { + u32 stat_name = be32_to_cpu(report_stats[stats_idx].stat_name); + u32 queue_id = be32_to_cpu(report_stats[stats_idx].queue_id); + + if (stat_name == 0) { + /* no stats written by NIC yet */ + skip_nic_stats = true; + break; + } + tx_qid_to_stats_idx[queue_id] = stats_idx; + } + /* walk TX rings */ + if (priv->tx) { + for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { + struct gve_tx_ring *tx = &priv->tx[ring]; + + data[i++] = tx->req; + data[i++] = tx->done; + do { + start = + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); + tmp_tx_bytes = tx->bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, + start)); + data[i++] = tmp_tx_bytes; + data[i++] = tx->wake_queue; + data[i++] = tx->stop_queue; + data[i++] = be32_to_cpu(gve_tx_load_event_counter(priv, + tx)); + /* stats from NIC */ + if (skip_nic_stats) { + /* skip NIC tx stats */ + i += NIC_TX_STATS_REPORT_NUM; + continue; + } + for (j = 0; j < NIC_TX_STATS_REPORT_NUM; j++) { + u64 value = + be64_to_cpu(report_stats[tx_qid_to_stats_idx[ring] + j].value); + data[i++] = value; + } + } + } else { + i += priv->tx_cfg.num_queues * NUM_GVE_TX_CNTS; + } + + kfree(rx_qid_to_stats_idx); + kfree(tx_qid_to_stats_idx); + /* AQ Stats */ + data[i++] = priv->adminq_prod_cnt; + data[i++] = priv->adminq_cmd_fail; + data[i++] = priv->adminq_timeouts; + data[i++] = priv->adminq_describe_device_cnt; + data[i++] = priv->adminq_cfg_device_resources_cnt; + data[i++] = priv->adminq_register_page_list_cnt; + data[i++] = priv->adminq_unregister_page_list_cnt; + data[i++] = priv->adminq_create_tx_queue_cnt; + data[i++] = priv->adminq_create_rx_queue_cnt; + data[i++] = priv->adminq_destroy_tx_queue_cnt; + data[i++] = priv->adminq_destroy_rx_queue_cnt; + data[i++] = priv->adminq_dcfg_device_resources_cnt; + data[i++] = priv->adminq_set_driver_parameter_cnt; + data[i++] = priv->adminq_report_stats_cnt; + data[i++] = priv->adminq_report_link_speed_cnt; +} + +static void gve_get_channels(struct net_device *netdev, + struct ethtool_channels *cmd) +{ + struct gve_priv *priv = netdev_priv(netdev); + + cmd->max_rx = priv->rx_cfg.max_queues; + cmd->max_tx = priv->tx_cfg.max_queues; + cmd->max_other = 0; + cmd->max_combined = 0; + cmd->rx_count = priv->rx_cfg.num_queues; + cmd->tx_count = priv->tx_cfg.num_queues; + cmd->other_count = 0; + cmd->combined_count = 0; +} + +static int gve_set_channels(struct net_device *netdev, + struct ethtool_channels *cmd) +{ + struct gve_priv *priv = netdev_priv(netdev); + struct gve_queue_config new_tx_cfg = priv->tx_cfg; + struct gve_queue_config new_rx_cfg = priv->rx_cfg; + struct ethtool_channels old_settings; + int new_tx = cmd->tx_count; + int new_rx = cmd->rx_count; + + gve_get_channels(netdev, &old_settings); + + /* Changing combined is not allowed allowed */ + if (cmd->combined_count != old_settings.combined_count) + return -EINVAL; + + if (!new_rx || !new_tx) + return -EINVAL; + + if (!netif_carrier_ok(netdev)) { + priv->tx_cfg.num_queues = new_tx; + priv->rx_cfg.num_queues = new_rx; + return 0; + } + + new_tx_cfg.num_queues = new_tx; + new_rx_cfg.num_queues = new_rx; + + return gve_adjust_queues(priv, new_rx_cfg, new_tx_cfg); +} + +static void gve_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *cmd) +{ + struct gve_priv *priv = netdev_priv(netdev); + + cmd->rx_max_pending = priv->rx_desc_cnt; + cmd->tx_max_pending = priv->tx_desc_cnt; + cmd->rx_pending = priv->rx_desc_cnt; + cmd->tx_pending = priv->tx_desc_cnt; +} + +static int gve_user_reset(struct net_device *netdev, u32 *flags) +{ + struct gve_priv *priv = netdev_priv(netdev); + + if (*flags == ETH_RESET_ALL) { + *flags = 0; + return gve_reset(priv, true); + } + + return -EOPNOTSUPP; +} + +static int gve_get_tunable(struct net_device *netdev, + const struct ethtool_tunable *etuna, void *value) +{ + struct gve_priv *priv = netdev_priv(netdev); + + switch (etuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)value = priv->rx_copybreak; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int gve_set_tunable(struct net_device *netdev, + const struct ethtool_tunable *etuna, + const void *value) +{ + struct gve_priv *priv = netdev_priv(netdev); + u32 len; + + switch (etuna->id) { + case ETHTOOL_RX_COPYBREAK: + len = *(u32 *)value; + if (len > PAGE_SIZE / 2) + return -EINVAL; + priv->rx_copybreak = len; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static u32 gve_get_priv_flags(struct net_device *netdev) +{ + struct gve_priv *priv = netdev_priv(netdev); + u32 ret_flags = 0; + + /* Only 1 flag exists currently: report-stats (BIT(O)), so set that flag. */ + if (priv->ethtool_flags & BIT(0)) + ret_flags |= BIT(0); + return ret_flags; +} + +static int gve_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct gve_priv *priv = netdev_priv(netdev); + u64 ori_flags, new_flags; + + ori_flags = READ_ONCE(priv->ethtool_flags); + new_flags = ori_flags; + + /* Only one priv flag exists: report-stats (BIT(0))*/ + if (flags & BIT(0)) + new_flags |= BIT(0); + else + new_flags &= ~(BIT(0)); + priv->ethtool_flags = new_flags; + /* start report-stats timer when user turns report stats on. */ + if (flags & BIT(0)) { + mod_timer(&priv->stats_report_timer, + round_jiffies(jiffies + + msecs_to_jiffies(priv->stats_report_timer_period))); + } + /* Zero off gve stats when report-stats turned off and */ + /* delete report stats timer. */ + if (!(flags & BIT(0)) && (ori_flags & BIT(0))) { + int tx_stats_num = GVE_TX_STATS_REPORT_NUM * + priv->tx_cfg.num_queues; + int rx_stats_num = GVE_RX_STATS_REPORT_NUM * + priv->rx_cfg.num_queues; + + memset(priv->stats_report->stats, 0, (tx_stats_num + rx_stats_num) * + sizeof(struct stats)); + del_timer_sync(&priv->stats_report_timer); + } + return 0; +} + +static int gve_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct gve_priv *priv = netdev_priv(netdev); + int err = 0; + + if (priv->link_speed == 0) + err = gve_adminq_report_link_speed(priv); + + cmd->base.speed = priv->link_speed; + + cmd->base.duplex = DUPLEX_FULL; + + return err; +} + +const struct ethtool_ops gve_ethtool_ops = { + .get_drvinfo = gve_get_drvinfo, + .get_strings = gve_get_strings, + .get_sset_count = gve_get_sset_count, + .get_ethtool_stats = gve_get_ethtool_stats, + .set_msglevel = gve_set_msglevel, + .get_msglevel = gve_get_msglevel, + .set_channels = gve_set_channels, + .get_channels = gve_get_channels, + .get_link = ethtool_op_get_link, + .get_ringparam = gve_get_ringparam, + .reset = gve_user_reset, + .get_tunable = gve_get_tunable, + .set_tunable = gve_set_tunable, + .get_priv_flags = gve_get_priv_flags, + .set_priv_flags = gve_set_priv_flags, + .get_link_ksettings = gve_get_link_ksettings +}; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c new file mode 100644 index 000000000..b76d1d019 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -0,0 +1,1467 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#include <linux/cpumask.h> +#include <linux/etherdevice.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/workqueue.h> +#include <net/sch_generic.h> +#include "gve.h" +#include "gve_adminq.h" +#include "gve_register.h" + +#define GVE_DEFAULT_RX_COPYBREAK (256) + +#define DEFAULT_MSG_LEVEL (NETIF_MSG_DRV | NETIF_MSG_LINK) +#define GVE_VERSION "1.0.0" +#define GVE_VERSION_PREFIX "GVE-" + +// Minimum amount of time between queue kicks in msec (10 seconds) +#define MIN_TX_TIMEOUT_GAP (1000 * 10) + +const char gve_version_str[] = GVE_VERSION; +static const char gve_version_prefix[] = GVE_VERSION_PREFIX; + +static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + struct gve_priv *priv = netdev_priv(dev); + unsigned int start; + u64 packets, bytes; + int ring; + + if (priv->rx) { + for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { + do { + start = + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); + packets = priv->rx[ring].rpackets; + bytes = priv->rx[ring].rbytes; + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, + start)); + s->rx_packets += packets; + s->rx_bytes += bytes; + } + } + if (priv->tx) { + for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { + do { + start = + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); + packets = priv->tx[ring].pkt_done; + bytes = priv->tx[ring].bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, + start)); + s->tx_packets += packets; + s->tx_bytes += bytes; + } + } +} + +static int gve_alloc_counter_array(struct gve_priv *priv) +{ + priv->counter_array = + dma_alloc_coherent(&priv->pdev->dev, + priv->num_event_counters * + sizeof(*priv->counter_array), + &priv->counter_array_bus, GFP_KERNEL); + if (!priv->counter_array) + return -ENOMEM; + + return 0; +} + +static void gve_free_counter_array(struct gve_priv *priv) +{ + if (!priv->counter_array) + return; + + dma_free_coherent(&priv->pdev->dev, + priv->num_event_counters * + sizeof(*priv->counter_array), + priv->counter_array, priv->counter_array_bus); + priv->counter_array = NULL; +} + +/* NIC requests to report stats */ +static void gve_stats_report_task(struct work_struct *work) +{ + struct gve_priv *priv = container_of(work, struct gve_priv, + stats_report_task); + if (gve_get_do_report_stats(priv)) { + gve_handle_report_stats(priv); + gve_clear_do_report_stats(priv); + } +} + +static void gve_stats_report_schedule(struct gve_priv *priv) +{ + if (!gve_get_probe_in_progress(priv) && + !gve_get_reset_in_progress(priv)) { + gve_set_do_report_stats(priv); + queue_work(priv->gve_wq, &priv->stats_report_task); + } +} + +static void gve_stats_report_timer(struct timer_list *t) +{ + struct gve_priv *priv = from_timer(priv, t, stats_report_timer); + + mod_timer(&priv->stats_report_timer, + round_jiffies(jiffies + + msecs_to_jiffies(priv->stats_report_timer_period))); + gve_stats_report_schedule(priv); +} + +static int gve_alloc_stats_report(struct gve_priv *priv) +{ + int tx_stats_num, rx_stats_num; + + tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) * + priv->tx_cfg.num_queues; + rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) * + priv->rx_cfg.num_queues; + priv->stats_report_len = struct_size(priv->stats_report, stats, + size_add(tx_stats_num, rx_stats_num)); + priv->stats_report = + dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len, + &priv->stats_report_bus, GFP_KERNEL); + if (!priv->stats_report) + return -ENOMEM; + /* Set up timer for the report-stats task */ + timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0); + priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD; + return 0; +} + +static void gve_free_stats_report(struct gve_priv *priv) +{ + if (!priv->stats_report) + return; + + del_timer_sync(&priv->stats_report_timer); + dma_free_coherent(&priv->pdev->dev, priv->stats_report_len, + priv->stats_report, priv->stats_report_bus); + priv->stats_report = NULL; +} + +static irqreturn_t gve_mgmnt_intr(int irq, void *arg) +{ + struct gve_priv *priv = arg; + + queue_work(priv->gve_wq, &priv->service_task); + return IRQ_HANDLED; +} + +static irqreturn_t gve_intr(int irq, void *arg) +{ + struct gve_notify_block *block = arg; + struct gve_priv *priv = block->priv; + + iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block)); + napi_schedule_irqoff(&block->napi); + return IRQ_HANDLED; +} + +static int gve_napi_poll(struct napi_struct *napi, int budget) +{ + struct gve_notify_block *block; + __be32 __iomem *irq_doorbell; + bool reschedule = false; + struct gve_priv *priv; + + block = container_of(napi, struct gve_notify_block, napi); + priv = block->priv; + + if (block->tx) + reschedule |= gve_tx_poll(block, budget); + if (block->rx) + reschedule |= gve_rx_poll(block, budget); + + if (reschedule) + return budget; + + napi_complete(napi); + irq_doorbell = gve_irq_doorbell(priv, block); + iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell); + + /* Double check we have no extra work. + * Ensure unmask synchronizes with checking for work. + */ + mb(); + if (block->tx) + reschedule |= gve_tx_poll(block, -1); + if (block->rx) + reschedule |= gve_rx_poll(block, -1); + if (reschedule && napi_reschedule(napi)) + iowrite32be(GVE_IRQ_MASK, irq_doorbell); + + return 0; +} + +static int gve_alloc_notify_blocks(struct gve_priv *priv) +{ + int num_vecs_requested = priv->num_ntfy_blks + 1; + char *name = priv->dev->name; + unsigned int active_cpus; + int vecs_enabled; + int i, j; + int err; + + priv->msix_vectors = kvzalloc(num_vecs_requested * + sizeof(*priv->msix_vectors), GFP_KERNEL); + if (!priv->msix_vectors) + return -ENOMEM; + for (i = 0; i < num_vecs_requested; i++) + priv->msix_vectors[i].entry = i; + vecs_enabled = pci_enable_msix_range(priv->pdev, priv->msix_vectors, + GVE_MIN_MSIX, num_vecs_requested); + if (vecs_enabled < 0) { + dev_err(&priv->pdev->dev, "Could not enable min msix %d/%d\n", + GVE_MIN_MSIX, vecs_enabled); + err = vecs_enabled; + goto abort_with_msix_vectors; + } + if (vecs_enabled != num_vecs_requested) { + int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1; + int vecs_per_type = new_num_ntfy_blks / 2; + int vecs_left = new_num_ntfy_blks % 2; + + priv->num_ntfy_blks = new_num_ntfy_blks; + priv->mgmt_msix_idx = priv->num_ntfy_blks; + priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues, + vecs_per_type); + priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues, + vecs_per_type + vecs_left); + dev_err(&priv->pdev->dev, + "Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n", + vecs_enabled, priv->tx_cfg.max_queues, + priv->rx_cfg.max_queues); + if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues) + priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; + if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues) + priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; + } + /* Half the notification blocks go to TX and half to RX */ + active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus()); + + /* Setup Management Vector - the last vector */ + snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "%s-mgmnt", + name); + err = request_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, + gve_mgmnt_intr, 0, priv->mgmt_msix_name, priv); + if (err) { + dev_err(&priv->pdev->dev, "Did not receive management vector.\n"); + goto abort_with_msix_enabled; + } + priv->ntfy_blocks = + dma_alloc_coherent(&priv->pdev->dev, + priv->num_ntfy_blks * + sizeof(*priv->ntfy_blocks), + &priv->ntfy_block_bus, GFP_KERNEL); + if (!priv->ntfy_blocks) { + err = -ENOMEM; + goto abort_with_mgmt_vector; + } + /* Setup the other blocks - the first n-1 vectors */ + for (i = 0; i < priv->num_ntfy_blks; i++) { + struct gve_notify_block *block = &priv->ntfy_blocks[i]; + int msix_idx = i; + + snprintf(block->name, sizeof(block->name), "%s-ntfy-block.%d", + name, i); + block->priv = priv; + err = request_irq(priv->msix_vectors[msix_idx].vector, + gve_intr, 0, block->name, block); + if (err) { + dev_err(&priv->pdev->dev, + "Failed to receive msix vector %d\n", i); + goto abort_with_some_ntfy_blocks; + } + irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, + get_cpu_mask(i % active_cpus)); + } + return 0; +abort_with_some_ntfy_blocks: + for (j = 0; j < i; j++) { + struct gve_notify_block *block = &priv->ntfy_blocks[j]; + int msix_idx = j; + + irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, + NULL); + free_irq(priv->msix_vectors[msix_idx].vector, block); + } + dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * + sizeof(*priv->ntfy_blocks), + priv->ntfy_blocks, priv->ntfy_block_bus); + priv->ntfy_blocks = NULL; +abort_with_mgmt_vector: + free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); +abort_with_msix_enabled: + pci_disable_msix(priv->pdev); +abort_with_msix_vectors: + kvfree(priv->msix_vectors); + priv->msix_vectors = NULL; + return err; +} + +static void gve_free_notify_blocks(struct gve_priv *priv) +{ + int i; + + if (!priv->msix_vectors) + return; + + /* Free the irqs */ + for (i = 0; i < priv->num_ntfy_blks; i++) { + struct gve_notify_block *block = &priv->ntfy_blocks[i]; + int msix_idx = i; + + irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, + NULL); + free_irq(priv->msix_vectors[msix_idx].vector, block); + } + free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); + dma_free_coherent(&priv->pdev->dev, + priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks), + priv->ntfy_blocks, priv->ntfy_block_bus); + priv->ntfy_blocks = NULL; + pci_disable_msix(priv->pdev); + kvfree(priv->msix_vectors); + priv->msix_vectors = NULL; +} + +static int gve_setup_device_resources(struct gve_priv *priv) +{ + int err; + + err = gve_alloc_counter_array(priv); + if (err) + return err; + err = gve_alloc_notify_blocks(priv); + if (err) + goto abort_with_counter; + err = gve_alloc_stats_report(priv); + if (err) + goto abort_with_ntfy_blocks; + err = gve_adminq_configure_device_resources(priv, + priv->counter_array_bus, + priv->num_event_counters, + priv->ntfy_block_bus, + priv->num_ntfy_blks); + if (unlikely(err)) { + dev_err(&priv->pdev->dev, + "could not setup device_resources: err=%d\n", err); + err = -ENXIO; + goto abort_with_stats_report; + } + err = gve_adminq_report_stats(priv, priv->stats_report_len, + priv->stats_report_bus, + GVE_STATS_REPORT_TIMER_PERIOD); + if (err) + dev_err(&priv->pdev->dev, + "Failed to report stats: err=%d\n", err); + gve_set_device_resources_ok(priv); + return 0; +abort_with_stats_report: + gve_free_stats_report(priv); +abort_with_ntfy_blocks: + gve_free_notify_blocks(priv); +abort_with_counter: + gve_free_counter_array(priv); + return err; +} + +static void gve_trigger_reset(struct gve_priv *priv); + +static void gve_teardown_device_resources(struct gve_priv *priv) +{ + int err; + + /* Tell device its resources are being freed */ + if (gve_get_device_resources_ok(priv)) { + /* detach the stats report */ + err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD); + if (err) { + dev_err(&priv->pdev->dev, + "Failed to detach stats report: err=%d\n", err); + gve_trigger_reset(priv); + } + err = gve_adminq_deconfigure_device_resources(priv); + if (err) { + dev_err(&priv->pdev->dev, + "Could not deconfigure device resources: err=%d\n", + err); + gve_trigger_reset(priv); + } + } + gve_free_counter_array(priv); + gve_free_notify_blocks(priv); + gve_free_stats_report(priv); + gve_clear_device_resources_ok(priv); +} + +static void gve_add_napi(struct gve_priv *priv, int ntfy_idx) +{ + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + netif_napi_add(priv->dev, &block->napi, gve_napi_poll, + NAPI_POLL_WEIGHT); +} + +static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx) +{ + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + netif_napi_del(&block->napi); +} + +static int gve_register_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + for (i = 0; i < num_qpls; i++) { + err = gve_adminq_register_page_list(priv, &priv->qpls[i]); + if (err) { + netif_err(priv, drv, priv->dev, + "failed to register queue page list %d\n", + priv->qpls[i].id); + /* This failure will trigger a reset - no need to clean + * up + */ + return err; + } + } + return 0; +} + +static int gve_unregister_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + for (i = 0; i < num_qpls; i++) { + err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id); + /* This failure will trigger a reset - no need to clean up */ + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to unregister queue page list %d\n", + priv->qpls[i].id); + return err; + } + } + return 0; +} + +static int gve_create_rings(struct gve_priv *priv) +{ + int err; + int i; + + err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n", + priv->tx_cfg.num_queues); + /* This failure will trigger a reset - no need to clean + * up + */ + return err; + } + netif_dbg(priv, drv, priv->dev, "created %d tx queues\n", + priv->tx_cfg.num_queues); + + err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n", + priv->rx_cfg.num_queues); + /* This failure will trigger a reset - no need to clean + * up + */ + return err; + } + netif_dbg(priv, drv, priv->dev, "created %d rx queues\n", + priv->rx_cfg.num_queues); + + /* Rx data ring has been prefilled with packet buffers at queue + * allocation time. + * Write the doorbell to provide descriptor slots and packet buffers + * to the NIC. + */ + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_write_doorbell(priv, &priv->rx[i]); + + return 0; +} + +static int gve_alloc_rings(struct gve_priv *priv) +{ + int ntfy_idx; + int err; + int i; + + /* Setup tx rings */ + priv->tx = kvzalloc(priv->tx_cfg.num_queues * sizeof(*priv->tx), + GFP_KERNEL); + if (!priv->tx) + return -ENOMEM; + err = gve_tx_alloc_rings(priv); + if (err) + goto free_tx; + /* Setup rx rings */ + priv->rx = kvzalloc(priv->rx_cfg.num_queues * sizeof(*priv->rx), + GFP_KERNEL); + if (!priv->rx) { + err = -ENOMEM; + goto free_tx_queue; + } + err = gve_rx_alloc_rings(priv); + if (err) + goto free_rx; + /* Add tx napi & init sync stats*/ + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + u64_stats_init(&priv->tx[i].statss); + ntfy_idx = gve_tx_idx_to_ntfy(priv, i); + gve_add_napi(priv, ntfy_idx); + } + /* Add rx napi & init sync stats*/ + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + u64_stats_init(&priv->rx[i].statss); + ntfy_idx = gve_rx_idx_to_ntfy(priv, i); + gve_add_napi(priv, ntfy_idx); + } + + return 0; + +free_rx: + kvfree(priv->rx); + priv->rx = NULL; +free_tx_queue: + gve_tx_free_rings(priv); +free_tx: + kvfree(priv->tx); + priv->tx = NULL; + return err; +} + +static int gve_destroy_rings(struct gve_priv *priv) +{ + int err; + + err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, + "failed to destroy tx queues\n"); + /* This failure will trigger a reset - no need to clean up */ + return err; + } + netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n"); + err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); + if (err) { + netif_err(priv, drv, priv->dev, + "failed to destroy rx queues\n"); + /* This failure will trigger a reset - no need to clean up */ + return err; + } + netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n"); + return 0; +} + +static void gve_free_rings(struct gve_priv *priv) +{ + int ntfy_idx; + int i; + + if (priv->tx) { + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + ntfy_idx = gve_tx_idx_to_ntfy(priv, i); + gve_remove_napi(priv, ntfy_idx); + } + gve_tx_free_rings(priv); + kvfree(priv->tx); + priv->tx = NULL; + } + if (priv->rx) { + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + ntfy_idx = gve_rx_idx_to_ntfy(priv, i); + gve_remove_napi(priv, ntfy_idx); + } + gve_rx_free_rings(priv); + kvfree(priv->rx); + priv->rx = NULL; + } +} + +int gve_alloc_page(struct gve_priv *priv, struct device *dev, + struct page **page, dma_addr_t *dma, + enum dma_data_direction dir) +{ + *page = alloc_page(GFP_KERNEL); + if (!*page) { + priv->page_alloc_fail++; + return -ENOMEM; + } + *dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(dev, *dma)) { + priv->dma_mapping_error++; + put_page(*page); + return -ENOMEM; + } + return 0; +} + +static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id, + int pages) +{ + struct gve_queue_page_list *qpl = &priv->qpls[id]; + int err; + int i; + + if (pages + priv->num_registered_pages > priv->max_registered_pages) { + netif_err(priv, drv, priv->dev, + "Reached max number of registered pages %llu > %llu\n", + pages + priv->num_registered_pages, + priv->max_registered_pages); + return -EINVAL; + } + + qpl->id = id; + qpl->num_entries = 0; + qpl->pages = kvzalloc(pages * sizeof(*qpl->pages), GFP_KERNEL); + /* caller handles clean up */ + if (!qpl->pages) + return -ENOMEM; + qpl->page_buses = kvzalloc(pages * sizeof(*qpl->page_buses), + GFP_KERNEL); + /* caller handles clean up */ + if (!qpl->page_buses) + return -ENOMEM; + + for (i = 0; i < pages; i++) { + err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i], + &qpl->page_buses[i], + gve_qpl_dma_dir(priv, id)); + /* caller handles clean up */ + if (err) + return -ENOMEM; + qpl->num_entries++; + } + priv->num_registered_pages += pages; + + return 0; +} + +void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, + enum dma_data_direction dir) +{ + if (!dma_mapping_error(dev, dma)) + dma_unmap_page(dev, dma, PAGE_SIZE, dir); + if (page) + put_page(page); +} + +static void gve_free_queue_page_list(struct gve_priv *priv, + int id) +{ + struct gve_queue_page_list *qpl = &priv->qpls[id]; + int i; + + if (!qpl->pages) + return; + if (!qpl->page_buses) + goto free_pages; + + for (i = 0; i < qpl->num_entries; i++) + gve_free_page(&priv->pdev->dev, qpl->pages[i], + qpl->page_buses[i], gve_qpl_dma_dir(priv, id)); + + kvfree(qpl->page_buses); +free_pages: + kvfree(qpl->pages); + priv->num_registered_pages -= qpl->num_entries; +} + +static int gve_alloc_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int i, j; + int err; + + priv->qpls = kvzalloc(num_qpls * sizeof(*priv->qpls), GFP_KERNEL); + if (!priv->qpls) + return -ENOMEM; + + for (i = 0; i < gve_num_tx_qpls(priv); i++) { + err = gve_alloc_queue_page_list(priv, i, + priv->tx_pages_per_qpl); + if (err) + goto free_qpls; + } + for (; i < num_qpls; i++) { + err = gve_alloc_queue_page_list(priv, i, + priv->rx_pages_per_qpl); + if (err) + goto free_qpls; + } + + priv->qpl_cfg.qpl_map_size = BITS_TO_LONGS(num_qpls) * + sizeof(unsigned long) * BITS_PER_BYTE; + priv->qpl_cfg.qpl_id_map = kvzalloc(BITS_TO_LONGS(num_qpls) * + sizeof(unsigned long), GFP_KERNEL); + if (!priv->qpl_cfg.qpl_id_map) { + err = -ENOMEM; + goto free_qpls; + } + + return 0; + +free_qpls: + for (j = 0; j <= i; j++) + gve_free_queue_page_list(priv, j); + kvfree(priv->qpls); + return err; +} + +static void gve_free_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int i; + + kvfree(priv->qpl_cfg.qpl_id_map); + + for (i = 0; i < num_qpls; i++) + gve_free_queue_page_list(priv, i); + + kvfree(priv->qpls); +} + +/* Use this to schedule a reset when the device is capable of continuing + * to handle other requests in its current state. If it is not, do a reset + * in thread instead. + */ +void gve_schedule_reset(struct gve_priv *priv) +{ + gve_set_do_reset(priv); + queue_work(priv->gve_wq, &priv->service_task); +} + +static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up); +static int gve_reset_recovery(struct gve_priv *priv, bool was_up); +static void gve_turndown(struct gve_priv *priv); +static void gve_turnup(struct gve_priv *priv); + +static int gve_open(struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + int err; + + err = gve_alloc_qpls(priv); + if (err) + return err; + err = gve_alloc_rings(priv); + if (err) + goto free_qpls; + + err = netif_set_real_num_tx_queues(dev, priv->tx_cfg.num_queues); + if (err) + goto free_rings; + err = netif_set_real_num_rx_queues(dev, priv->rx_cfg.num_queues); + if (err) + goto free_rings; + + err = gve_register_qpls(priv); + if (err) + goto reset; + err = gve_create_rings(priv); + if (err) + goto reset; + gve_set_device_rings_ok(priv); + + if (gve_get_report_stats(priv)) + mod_timer(&priv->stats_report_timer, + round_jiffies(jiffies + + msecs_to_jiffies(priv->stats_report_timer_period))); + + gve_turnup(priv); + queue_work(priv->gve_wq, &priv->service_task); + priv->interface_up_cnt++; + return 0; + +free_rings: + gve_free_rings(priv); +free_qpls: + gve_free_qpls(priv); + return err; + +reset: + /* This must have been called from a reset due to the rtnl lock + * so just return at this point. + */ + if (gve_get_reset_in_progress(priv)) + return err; + /* Otherwise reset before returning */ + gve_reset_and_teardown(priv, true); + /* if this fails there is nothing we can do so just ignore the return */ + gve_reset_recovery(priv, false); + /* return the original error */ + return err; +} + +static int gve_close(struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + int err; + + netif_carrier_off(dev); + if (gve_get_device_rings_ok(priv)) { + gve_turndown(priv); + err = gve_destroy_rings(priv); + if (err) + goto err; + err = gve_unregister_qpls(priv); + if (err) + goto err; + gve_clear_device_rings_ok(priv); + } + del_timer_sync(&priv->stats_report_timer); + + gve_free_rings(priv); + gve_free_qpls(priv); + priv->interface_down_cnt++; + return 0; + +err: + /* This must have been called from a reset due to the rtnl lock + * so just return at this point. + */ + if (gve_get_reset_in_progress(priv)) + return err; + /* Otherwise reset before returning */ + gve_reset_and_teardown(priv, true); + return gve_reset_recovery(priv, false); +} + +int gve_adjust_queues(struct gve_priv *priv, + struct gve_queue_config new_rx_config, + struct gve_queue_config new_tx_config) +{ + int err; + + if (netif_carrier_ok(priv->dev)) { + /* To make this process as simple as possible we teardown the + * device, set the new configuration, and then bring the device + * up again. + */ + err = gve_close(priv->dev); + /* we have already tried to reset in close, + * just fail at this point + */ + if (err) + return err; + priv->tx_cfg = new_tx_config; + priv->rx_cfg = new_rx_config; + + err = gve_open(priv->dev); + if (err) + goto err; + + return 0; + } + /* Set the config for the next up. */ + priv->tx_cfg = new_tx_config; + priv->rx_cfg = new_rx_config; + + return 0; +err: + netif_err(priv, drv, priv->dev, + "Adjust queues failed! !!! DISABLING ALL QUEUES !!!\n"); + gve_turndown(priv); + return err; +} + +static void gve_turndown(struct gve_priv *priv) +{ + int idx; + + if (netif_carrier_ok(priv->dev)) + netif_carrier_off(priv->dev); + + if (!gve_get_napi_enabled(priv)) + return; + + /* Disable napi to prevent more work from coming in */ + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + napi_disable(&block->napi); + } + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + napi_disable(&block->napi); + } + + /* Stop tx queues */ + netif_tx_disable(priv->dev); + + gve_clear_napi_enabled(priv); + gve_clear_report_stats(priv); +} + +static void gve_turnup(struct gve_priv *priv) +{ + int idx; + + /* Start the tx queues */ + netif_tx_start_all_queues(priv->dev); + + /* Enable napi and unmask interrupts for all queues */ + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + napi_enable(&block->napi); + iowrite32be(0, gve_irq_doorbell(priv, block)); + } + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + + napi_enable(&block->napi); + iowrite32be(0, gve_irq_doorbell(priv, block)); + } + + gve_set_napi_enabled(priv); +} + +static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) +{ + struct gve_notify_block *block; + struct gve_tx_ring *tx = NULL; + struct gve_priv *priv; + u32 last_nic_done; + u32 current_time; + u32 ntfy_idx; + + netdev_info(dev, "Timeout on tx queue, %d", txqueue); + priv = netdev_priv(dev); + if (txqueue > priv->tx_cfg.num_queues) + goto reset; + + ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue); + if (ntfy_idx >= priv->num_ntfy_blks) + goto reset; + + block = &priv->ntfy_blocks[ntfy_idx]; + tx = block->tx; + + current_time = jiffies_to_msecs(jiffies); + if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time) + goto reset; + + /* Check to see if there are missed completions, which will allow us to + * kick the queue. + */ + last_nic_done = gve_tx_load_event_counter(priv, tx); + if (last_nic_done - tx->done) { + netdev_info(dev, "Kicking queue %d", txqueue); + iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block)); + napi_schedule(&block->napi); + tx->last_kick_msec = current_time; + goto out; + } // Else reset. + +reset: + gve_schedule_reset(priv); + +out: + if (tx) + tx->queue_timeout++; + priv->tx_timeo_cnt++; +} + +static const struct net_device_ops gve_netdev_ops = { + .ndo_start_xmit = gve_tx, + .ndo_open = gve_open, + .ndo_stop = gve_close, + .ndo_get_stats64 = gve_get_stats, + .ndo_tx_timeout = gve_tx_timeout, +}; + +static void gve_handle_status(struct gve_priv *priv, u32 status) +{ + if (GVE_DEVICE_STATUS_RESET_MASK & status) { + dev_info(&priv->pdev->dev, "Device requested reset.\n"); + gve_set_do_reset(priv); + } + if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) { + priv->stats_report_trigger_cnt++; + gve_set_do_report_stats(priv); + } +} + +static void gve_handle_reset(struct gve_priv *priv) +{ + /* A service task will be scheduled at the end of probe to catch any + * resets that need to happen, and we don't want to reset until + * probe is done. + */ + if (gve_get_probe_in_progress(priv)) + return; + + if (gve_get_do_reset(priv)) { + rtnl_lock(); + gve_reset(priv, false); + rtnl_unlock(); + } +} + +void gve_handle_report_stats(struct gve_priv *priv) +{ + struct stats *stats = priv->stats_report->stats; + int idx, stats_idx = 0; + unsigned int start = 0; + u64 tx_bytes; + + if (!gve_get_report_stats(priv)) + return; + + be64_add_cpu(&priv->stats_report->written_count, 1); + /* tx stats */ + if (priv->tx) { + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + do { + start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss); + tx_bytes = priv->tx[idx].bytes_done; + } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start)); + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_WAKE_CNT), + .value = cpu_to_be64(priv->tx[idx].wake_queue), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_STOP_CNT), + .value = cpu_to_be64(priv->tx[idx].stop_queue), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_FRAMES_SENT), + .value = cpu_to_be64(priv->tx[idx].req), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_BYTES_SENT), + .value = cpu_to_be64(tx_bytes), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED), + .value = cpu_to_be64(priv->tx[idx].done), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(TX_TIMEOUT_CNT), + .value = cpu_to_be64(priv->tx[idx].queue_timeout), + .queue_id = cpu_to_be32(idx), + }; + } + } + /* rx stats */ + if (priv->rx) { + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE), + .value = cpu_to_be64(priv->rx[idx].desc.seqno), + .queue_id = cpu_to_be32(idx), + }; + stats[stats_idx++] = (struct stats) { + .stat_name = cpu_to_be32(RX_BUFFERS_POSTED), + .value = cpu_to_be64(priv->rx[0].fill_cnt), + .queue_id = cpu_to_be32(idx), + }; + } + } +} + +static void gve_handle_link_status(struct gve_priv *priv, bool link_status) +{ + if (!gve_get_napi_enabled(priv)) + return; + + if (link_status == netif_carrier_ok(priv->dev)) + return; + + if (link_status) { + netdev_info(priv->dev, "Device link is up.\n"); + netif_carrier_on(priv->dev); + } else { + netdev_info(priv->dev, "Device link is down.\n"); + netif_carrier_off(priv->dev); + } +} + +/* Handle NIC status register changes, reset requests and report stats */ +static void gve_service_task(struct work_struct *work) +{ + struct gve_priv *priv = container_of(work, struct gve_priv, + service_task); + u32 status = ioread32be(&priv->reg_bar0->device_status); + + gve_handle_status(priv, status); + + gve_handle_reset(priv); + gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status); +} + +static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) +{ + int num_ntfy; + int err; + + /* Set up the adminq */ + err = gve_adminq_alloc(&priv->pdev->dev, priv); + if (err) { + dev_err(&priv->pdev->dev, + "Failed to alloc admin queue: err=%d\n", err); + return err; + } + + if (skip_describe_device) + goto setup_device; + + /* Get the initial information we need from the device */ + err = gve_adminq_describe_device(priv); + if (err) { + dev_err(&priv->pdev->dev, + "Could not get device information: err=%d\n", err); + goto err; + } + if (priv->dev->max_mtu > PAGE_SIZE) { + priv->dev->max_mtu = PAGE_SIZE; + err = gve_adminq_set_mtu(priv, priv->dev->mtu); + if (err) { + dev_err(&priv->pdev->dev, "Could not set mtu"); + goto err; + } + } + priv->dev->mtu = priv->dev->max_mtu; + num_ntfy = pci_msix_vec_count(priv->pdev); + if (num_ntfy <= 0) { + dev_err(&priv->pdev->dev, + "could not count MSI-x vectors: err=%d\n", num_ntfy); + err = num_ntfy; + goto err; + } else if (num_ntfy < GVE_MIN_MSIX) { + dev_err(&priv->pdev->dev, "gve needs at least %d MSI-x vectors, but only has %d\n", + GVE_MIN_MSIX, num_ntfy); + err = -EINVAL; + goto err; + } + + priv->num_registered_pages = 0; + priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; + /* gvnic has one Notification Block per MSI-x vector, except for the + * management vector + */ + priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1; + priv->mgmt_msix_idx = priv->num_ntfy_blks; + + priv->tx_cfg.max_queues = + min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2); + priv->rx_cfg.max_queues = + min_t(int, priv->rx_cfg.max_queues, priv->num_ntfy_blks / 2); + + priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; + priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; + if (priv->default_num_queues > 0) { + priv->tx_cfg.num_queues = min_t(int, priv->default_num_queues, + priv->tx_cfg.num_queues); + priv->rx_cfg.num_queues = min_t(int, priv->default_num_queues, + priv->rx_cfg.num_queues); + } + + dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n", + priv->tx_cfg.num_queues, priv->rx_cfg.num_queues); + dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n", + priv->tx_cfg.max_queues, priv->rx_cfg.max_queues); + +setup_device: + err = gve_setup_device_resources(priv); + if (!err) + return 0; +err: + gve_adminq_free(&priv->pdev->dev, priv); + return err; +} + +static void gve_teardown_priv_resources(struct gve_priv *priv) +{ + gve_teardown_device_resources(priv); + gve_adminq_free(&priv->pdev->dev, priv); +} + +static void gve_trigger_reset(struct gve_priv *priv) +{ + /* Reset the device by releasing the AQ */ + gve_adminq_release(priv); +} + +static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up) +{ + gve_trigger_reset(priv); + /* With the reset having already happened, close cannot fail */ + if (was_up) + gve_close(priv->dev); + gve_teardown_priv_resources(priv); +} + +static int gve_reset_recovery(struct gve_priv *priv, bool was_up) +{ + int err; + + err = gve_init_priv(priv, true); + if (err) + goto err; + if (was_up) { + err = gve_open(priv->dev); + if (err) + goto err; + } + return 0; +err: + dev_err(&priv->pdev->dev, "Reset failed! !!! DISABLING ALL QUEUES !!!\n"); + gve_turndown(priv); + return err; +} + +int gve_reset(struct gve_priv *priv, bool attempt_teardown) +{ + bool was_up = netif_carrier_ok(priv->dev); + int err; + + dev_info(&priv->pdev->dev, "Performing reset\n"); + gve_clear_do_reset(priv); + gve_set_reset_in_progress(priv); + /* If we aren't attempting to teardown normally, just go turndown and + * reset right away. + */ + if (!attempt_teardown) { + gve_turndown(priv); + gve_reset_and_teardown(priv, was_up); + } else { + /* Otherwise attempt to close normally */ + if (was_up) { + err = gve_close(priv->dev); + /* If that fails reset as we did above */ + if (err) + gve_reset_and_teardown(priv, was_up); + } + /* Clean up any remaining resources */ + gve_teardown_priv_resources(priv); + } + + /* Set it all back up */ + err = gve_reset_recovery(priv, was_up); + gve_clear_reset_in_progress(priv); + priv->reset_cnt++; + priv->interface_up_cnt = 0; + priv->interface_down_cnt = 0; + priv->stats_report_trigger_cnt = 0; + return err; +} + +static void gve_write_version(u8 __iomem *driver_version_register) +{ + const char *c = gve_version_prefix; + + while (*c) { + writeb(*c, driver_version_register); + c++; + } + + c = gve_version_str; + while (*c) { + writeb(*c, driver_version_register); + c++; + } + writeb('\n', driver_version_register); +} + +static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int max_tx_queues, max_rx_queues; + struct net_device *dev; + __be32 __iomem *db_bar; + struct gve_registers __iomem *reg_bar; + struct gve_priv *priv; + int err; + + err = pci_enable_device(pdev); + if (err) + return -ENXIO; + + err = pci_request_regions(pdev, "gvnic-cfg"); + if (err) + goto abort_with_enabled; + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err); + goto abort_with_pci_region; + } + + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, + "Failed to set consistent dma mask: err=%d\n", err); + goto abort_with_pci_region; + } + + reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0); + if (!reg_bar) { + dev_err(&pdev->dev, "Failed to map pci bar!\n"); + err = -ENOMEM; + goto abort_with_pci_region; + } + + db_bar = pci_iomap(pdev, GVE_DOORBELL_BAR, 0); + if (!db_bar) { + dev_err(&pdev->dev, "Failed to map doorbell bar!\n"); + err = -ENOMEM; + goto abort_with_reg_bar; + } + + gve_write_version(®_bar->driver_version); + /* Get max queues to alloc etherdev */ + max_tx_queues = ioread32be(®_bar->max_tx_queues); + max_rx_queues = ioread32be(®_bar->max_rx_queues); + /* Alloc and setup the netdev and priv */ + dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues); + if (!dev) { + dev_err(&pdev->dev, "could not allocate netdev\n"); + goto abort_with_db_bar; + } + SET_NETDEV_DEV(dev, &pdev->dev); + pci_set_drvdata(pdev, dev); + dev->ethtool_ops = &gve_ethtool_ops; + dev->netdev_ops = &gve_netdev_ops; + /* advertise features */ + dev->hw_features = NETIF_F_HIGHDMA; + dev->hw_features |= NETIF_F_SG; + dev->hw_features |= NETIF_F_HW_CSUM; + dev->hw_features |= NETIF_F_TSO; + dev->hw_features |= NETIF_F_TSO6; + dev->hw_features |= NETIF_F_TSO_ECN; + dev->hw_features |= NETIF_F_RXCSUM; + dev->hw_features |= NETIF_F_RXHASH; + dev->features = dev->hw_features; + dev->watchdog_timeo = 5 * HZ; + dev->min_mtu = ETH_MIN_MTU; + netif_carrier_off(dev); + + priv = netdev_priv(dev); + priv->dev = dev; + priv->pdev = pdev; + priv->msg_enable = DEFAULT_MSG_LEVEL; + priv->reg_bar0 = reg_bar; + priv->db_bar2 = db_bar; + priv->service_task_flags = 0x0; + priv->state_flags = 0x0; + priv->ethtool_flags = 0x0; + + gve_set_probe_in_progress(priv); + priv->gve_wq = alloc_ordered_workqueue("gve", 0); + if (!priv->gve_wq) { + dev_err(&pdev->dev, "Could not allocate workqueue"); + err = -ENOMEM; + goto abort_with_netdev; + } + INIT_WORK(&priv->service_task, gve_service_task); + INIT_WORK(&priv->stats_report_task, gve_stats_report_task); + priv->tx_cfg.max_queues = max_tx_queues; + priv->rx_cfg.max_queues = max_rx_queues; + + err = gve_init_priv(priv, false); + if (err) + goto abort_with_wq; + + err = register_netdev(dev); + if (err) + goto abort_with_gve_init; + + dev_info(&pdev->dev, "GVE version %s\n", gve_version_str); + gve_clear_probe_in_progress(priv); + queue_work(priv->gve_wq, &priv->service_task); + return 0; + +abort_with_gve_init: + gve_teardown_priv_resources(priv); + +abort_with_wq: + destroy_workqueue(priv->gve_wq); + +abort_with_netdev: + free_netdev(dev); + +abort_with_db_bar: + pci_iounmap(pdev, db_bar); + +abort_with_reg_bar: + pci_iounmap(pdev, reg_bar); + +abort_with_pci_region: + pci_release_regions(pdev); + +abort_with_enabled: + pci_disable_device(pdev); + return -ENXIO; +} + +static void gve_remove(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct gve_priv *priv = netdev_priv(netdev); + __be32 __iomem *db_bar = priv->db_bar2; + void __iomem *reg_bar = priv->reg_bar0; + + unregister_netdev(netdev); + gve_teardown_priv_resources(priv); + destroy_workqueue(priv->gve_wq); + free_netdev(netdev); + pci_iounmap(pdev, db_bar); + pci_iounmap(pdev, reg_bar); + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +static const struct pci_device_id gve_id_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) }, + { } +}; + +static struct pci_driver gvnic_driver = { + .name = "gvnic", + .id_table = gve_id_table, + .probe = gve_probe, + .remove = gve_remove, +}; + +module_pci_driver(gvnic_driver); + +MODULE_DEVICE_TABLE(pci, gve_id_table); +MODULE_AUTHOR("Google, Inc."); +MODULE_DESCRIPTION("gVNIC Driver"); +MODULE_LICENSE("Dual MIT/GPL"); +MODULE_VERSION(GVE_VERSION); diff --git a/drivers/net/ethernet/google/gve/gve_register.h b/drivers/net/ethernet/google/gve/gve_register.h new file mode 100644 index 000000000..fb655463c --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_register.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#ifndef _GVE_REGISTER_H_ +#define _GVE_REGISTER_H_ + +/* Fixed Configuration Registers */ +struct gve_registers { + __be32 device_status; + __be32 driver_status; + __be32 max_tx_queues; + __be32 max_rx_queues; + __be32 adminq_pfn; + __be32 adminq_doorbell; + __be32 adminq_event_counter; + u8 reserved[3]; + u8 driver_version; +}; + +enum gve_device_status_flags { + GVE_DEVICE_STATUS_RESET_MASK = BIT(1), + GVE_DEVICE_STATUS_LINK_STATUS_MASK = BIT(2), + GVE_DEVICE_STATUS_REPORT_STATS_MASK = BIT(3), +}; +#endif /* _GVE_REGISTER_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c new file mode 100644 index 000000000..008fa897a --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include <linux/etherdevice.h> + +static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ + struct gve_notify_block *block = + &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; + + block->rx = NULL; +} + +static void gve_rx_free_ring(struct gve_priv *priv, int idx) +{ + struct gve_rx_ring *rx = &priv->rx[idx]; + struct device *dev = &priv->pdev->dev; + size_t bytes; + u32 slots; + + gve_rx_remove_from_block(priv, idx); + + bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt; + dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus); + rx->desc.desc_ring = NULL; + + dma_free_coherent(dev, sizeof(*rx->q_resources), + rx->q_resources, rx->q_resources_bus); + rx->q_resources = NULL; + + gve_unassign_qpl(priv, rx->data.qpl->id); + rx->data.qpl = NULL; + kvfree(rx->data.page_info); + + slots = rx->mask + 1; + bytes = sizeof(*rx->data.data_ring) * slots; + dma_free_coherent(dev, bytes, rx->data.data_ring, + rx->data.data_bus); + rx->data.data_ring = NULL; + netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); +} + +static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info, + struct gve_rx_data_slot *slot, + dma_addr_t addr, struct page *page) +{ + page_info->page = page; + page_info->page_offset = 0; + page_info->page_address = page_address(page); + slot->qpl_offset = cpu_to_be64(addr); +} + +static int gve_prefill_rx_pages(struct gve_rx_ring *rx) +{ + struct gve_priv *priv = rx->gve; + u32 slots; + int i; + + /* Allocate one page per Rx queue slot. Each page is split into two + * packet buffers, when possible we "page flip" between the two. + */ + slots = rx->mask + 1; + + rx->data.page_info = kvzalloc(slots * + sizeof(*rx->data.page_info), GFP_KERNEL); + if (!rx->data.page_info) + return -ENOMEM; + + rx->data.qpl = gve_assign_rx_qpl(priv); + + for (i = 0; i < slots; i++) { + struct page *page = rx->data.qpl->pages[i]; + dma_addr_t addr = i * PAGE_SIZE; + + gve_setup_rx_buffer(&rx->data.page_info[i], + &rx->data.data_ring[i], addr, page); + } + + return slots; +} + +static void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) +{ + u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + struct gve_rx_ring *rx = &priv->rx[queue_idx]; + + block->rx = rx; + rx->ntfy_id = ntfy_idx; +} + +static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) +{ + struct gve_rx_ring *rx = &priv->rx[idx]; + struct device *hdev = &priv->pdev->dev; + u32 slots, npages; + int filled_pages; + size_t bytes; + int err; + + netif_dbg(priv, drv, priv->dev, "allocating rx ring\n"); + /* Make sure everything is zeroed to start with */ + memset(rx, 0, sizeof(*rx)); + + rx->gve = priv; + rx->q_num = idx; + + slots = priv->rx_pages_per_qpl; + rx->mask = slots - 1; + + /* alloc rx data ring */ + bytes = sizeof(*rx->data.data_ring) * slots; + rx->data.data_ring = dma_alloc_coherent(hdev, bytes, + &rx->data.data_bus, + GFP_KERNEL); + if (!rx->data.data_ring) + return -ENOMEM; + filled_pages = gve_prefill_rx_pages(rx); + if (filled_pages < 0) { + err = -ENOMEM; + goto abort_with_slots; + } + rx->fill_cnt = filled_pages; + /* Ensure data ring slots (packet buffers) are visible. */ + dma_wmb(); + + /* Alloc gve_queue_resources */ + rx->q_resources = + dma_alloc_coherent(hdev, + sizeof(*rx->q_resources), + &rx->q_resources_bus, + GFP_KERNEL); + if (!rx->q_resources) { + err = -ENOMEM; + goto abort_filled; + } + netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx, + (unsigned long)rx->data.data_bus); + + /* alloc rx desc ring */ + bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt; + npages = bytes / PAGE_SIZE; + if (npages * PAGE_SIZE != bytes) { + err = -EIO; + goto abort_with_q_resources; + } + + rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus, + GFP_KERNEL); + if (!rx->desc.desc_ring) { + err = -ENOMEM; + goto abort_with_q_resources; + } + rx->mask = slots - 1; + rx->cnt = 0; + rx->desc.seqno = 1; + gve_rx_add_to_block(priv, idx); + + return 0; + +abort_with_q_resources: + dma_free_coherent(hdev, sizeof(*rx->q_resources), + rx->q_resources, rx->q_resources_bus); + rx->q_resources = NULL; +abort_filled: + kvfree(rx->data.page_info); +abort_with_slots: + bytes = sizeof(*rx->data.data_ring) * slots; + dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus); + rx->data.data_ring = NULL; + + return err; +} + +int gve_rx_alloc_rings(struct gve_priv *priv) +{ + int err = 0; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + err = gve_rx_alloc_ring(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to alloc rx ring=%d: err=%d\n", + i, err); + break; + } + } + /* Unallocate if there was an error */ + if (err) { + int j; + + for (j = 0; j < i; j++) + gve_rx_free_ring(priv, j); + } + return err; +} + +void gve_rx_free_rings(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_free_ring(priv, i); +} + +void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx) +{ + u32 db_idx = be32_to_cpu(rx->q_resources->db_index); + + iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]); +} + +static enum pkt_hash_types gve_rss_type(__be16 pkt_flags) +{ + if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP))) + return PKT_HASH_TYPE_L4; + if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) + return PKT_HASH_TYPE_L3; + return PKT_HASH_TYPE_L2; +} + +static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx, + struct net_device *dev, + struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, + u16 len) +{ + struct sk_buff *skb = napi_alloc_skb(napi, len); + void *va = page_info->page_address + GVE_RX_PAD + + page_info->page_offset; + + if (unlikely(!skb)) + return NULL; + + __skb_put(skb, len); + + skb_copy_to_linear_data(skb, va, len); + + skb->protocol = eth_type_trans(skb, dev); + + u64_stats_update_begin(&rx->statss); + rx->rx_copied_pkt++; + u64_stats_update_end(&rx->statss); + + return skb; +} + +static struct sk_buff *gve_rx_add_frags(struct net_device *dev, + struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, + u16 len) +{ + struct sk_buff *skb = napi_get_frags(napi); + + if (unlikely(!skb)) + return NULL; + + skb_add_rx_frag(skb, 0, page_info->page, + page_info->page_offset + + GVE_RX_PAD, len, PAGE_SIZE / 2); + + return skb; +} + +static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, + struct gve_rx_data_slot *data_ring) +{ + u64 addr = be64_to_cpu(data_ring->qpl_offset); + + page_info->page_offset ^= PAGE_SIZE / 2; + addr ^= PAGE_SIZE / 2; + data_ring->qpl_offset = cpu_to_be64(addr); +} + +static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc, + netdev_features_t feat, u32 idx) +{ + struct gve_rx_slot_page_info *page_info; + struct gve_priv *priv = rx->gve; + struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi; + struct net_device *dev = priv->dev; + struct sk_buff *skb; + int pagecount; + u16 len; + + /* drop this packet */ + if (unlikely(rx_desc->flags_seq & GVE_RXF_ERR)) { + u64_stats_update_begin(&rx->statss); + rx->rx_desc_err_dropped_pkt++; + u64_stats_update_end(&rx->statss); + return true; + } + + len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD; + page_info = &rx->data.page_info[idx]; + dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx], + PAGE_SIZE, DMA_FROM_DEVICE); + + /* gvnic can only receive into registered segments. If the buffer + * can't be recycled, our only choice is to copy the data out of + * it so that we can return it to the device. + */ + + if (PAGE_SIZE == 4096) { + if (len <= priv->rx_copybreak) { + /* Just copy small packets */ + skb = gve_rx_copy(rx, dev, napi, page_info, len); + u64_stats_update_begin(&rx->statss); + rx->rx_copybreak_pkt++; + u64_stats_update_end(&rx->statss); + goto have_skb; + } + if (unlikely(!gve_can_recycle_pages(dev))) { + skb = gve_rx_copy(rx, dev, napi, page_info, len); + goto have_skb; + } + pagecount = page_count(page_info->page); + if (pagecount == 1) { + /* No part of this page is used by any SKBs; we attach + * the page fragment to a new SKB and pass it up the + * stack. + */ + skb = gve_rx_add_frags(dev, napi, page_info, len); + if (!skb) { + u64_stats_update_begin(&rx->statss); + rx->rx_skb_alloc_fail++; + u64_stats_update_end(&rx->statss); + return true; + } + /* Make sure the kernel stack can't release the page */ + get_page(page_info->page); + /* "flip" to other packet buffer on this page */ + gve_rx_flip_buff(page_info, &rx->data.data_ring[idx]); + } else if (pagecount >= 2) { + /* We have previously passed the other half of this + * page up the stack, but it has not yet been freed. + */ + skb = gve_rx_copy(rx, dev, napi, page_info, len); + } else { + WARN(pagecount < 1, "Pagecount should never be < 1"); + return false; + } + } else { + skb = gve_rx_copy(rx, dev, napi, page_info, len); + } + +have_skb: + /* We didn't manage to allocate an skb but we haven't had any + * reset worthy failures. + */ + if (!skb) { + u64_stats_update_begin(&rx->statss); + rx->rx_skb_alloc_fail++; + u64_stats_update_end(&rx->statss); + return true; + } + + if (likely(feat & NETIF_F_RXCSUM)) { + /* NIC passes up the partial sum */ + if (rx_desc->csum) + skb->ip_summed = CHECKSUM_COMPLETE; + else + skb->ip_summed = CHECKSUM_NONE; + skb->csum = csum_unfold(rx_desc->csum); + } + + /* parse flags & pass relevant info up */ + if (likely(feat & NETIF_F_RXHASH) && + gve_needs_rss(rx_desc->flags_seq)) + skb_set_hash(skb, be32_to_cpu(rx_desc->rss_hash), + gve_rss_type(rx_desc->flags_seq)); + + if (skb_is_nonlinear(skb)) + napi_gro_frags(napi); + else + napi_gro_receive(napi, skb); + return true; +} + +static bool gve_rx_work_pending(struct gve_rx_ring *rx) +{ + struct gve_rx_desc *desc; + __be16 flags_seq; + u32 next_idx; + + next_idx = rx->cnt & rx->mask; + desc = rx->desc.desc_ring + next_idx; + + flags_seq = desc->flags_seq; + /* Make sure we have synchronized the seq no with the device */ + smp_rmb(); + + return (GVE_SEQNO(flags_seq) == rx->desc.seqno); +} + +bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, + netdev_features_t feat) +{ + struct gve_priv *priv = rx->gve; + struct gve_rx_desc *desc; + u32 cnt = rx->cnt; + u32 idx = cnt & rx->mask; + u32 work_done = 0; + u64 bytes = 0; + + desc = rx->desc.desc_ring + idx; + while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) && + work_done < budget) { + netif_info(priv, rx_status, priv->dev, + "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n", + rx->q_num, idx, desc, desc->flags_seq); + netif_info(priv, rx_status, priv->dev, + "[%d] seqno=%d rx->desc.seqno=%d\n", + rx->q_num, GVE_SEQNO(desc->flags_seq), + rx->desc.seqno); + bytes += be16_to_cpu(desc->len) - GVE_RX_PAD; + if (!gve_rx(rx, desc, feat, idx)) + gve_schedule_reset(priv); + cnt++; + idx = cnt & rx->mask; + desc = rx->desc.desc_ring + idx; + rx->desc.seqno = gve_next_seqno(rx->desc.seqno); + work_done++; + } + + if (!work_done) + return false; + + u64_stats_update_begin(&rx->statss); + rx->rpackets += work_done; + rx->rbytes += bytes; + u64_stats_update_end(&rx->statss); + rx->cnt = cnt; + rx->fill_cnt += work_done; + + gve_rx_write_doorbell(priv, rx); + return gve_rx_work_pending(rx); +} + +bool gve_rx_poll(struct gve_notify_block *block, int budget) +{ + struct gve_rx_ring *rx = block->rx; + netdev_features_t feat; + bool repoll = false; + + feat = block->napi.dev->features; + + /* If budget is 0, do all the work */ + if (budget == 0) + budget = INT_MAX; + + if (budget > 0) + repoll |= gve_clean_rx_done(rx, budget, feat); + else + repoll |= gve_rx_work_pending(rx); + return repoll; +} diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c new file mode 100644 index 000000000..b653197b3 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2019 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/vmalloc.h> +#include <linux/skbuff.h> + +static inline void gve_tx_put_doorbell(struct gve_priv *priv, + struct gve_queue_resources *q_resources, + u32 val) +{ + iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]); +} + +/* gvnic can only transmit from a Registered Segment. + * We copy skb payloads into the registered segment before writing Tx + * descriptors and ringing the Tx doorbell. + * + * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must + * free allocations in the order they were allocated. + */ + +static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo) +{ + fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP, + PAGE_KERNEL); + if (unlikely(!fifo->base)) { + netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n", + fifo->qpl->id); + return -ENOMEM; + } + + fifo->size = fifo->qpl->num_entries * PAGE_SIZE; + atomic_set(&fifo->available, fifo->size); + fifo->head = 0; + return 0; +} + +static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo) +{ + WARN(atomic_read(&fifo->available) != fifo->size, + "Releasing non-empty fifo"); + + vunmap(fifo->base); +} + +static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, + size_t bytes) +{ + return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; +} + +static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) +{ + return (atomic_read(&fifo->available) <= bytes) ? false : true; +} + +/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO + * @fifo: FIFO to allocate from + * @bytes: Allocation size + * @iov: Scatter-gather elements to fill with allocation fragment base/len + * + * Returns number of valid elements in iov[] or negative on error. + * + * Allocations from a given FIFO must be externally synchronized but concurrent + * allocation and frees are allowed. + */ +static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, + struct gve_tx_iovec iov[2]) +{ + size_t overflow, padding; + u32 aligned_head; + int nfrags = 0; + + if (!bytes) + return 0; + + /* This check happens before we know how much padding is needed to + * align to a cacheline boundary for the payload, but that is fine, + * because the FIFO head always start aligned, and the FIFO's boundaries + * are aligned, so if there is space for the data, there is space for + * the padding to the next alignment. + */ + WARN(!gve_tx_fifo_can_alloc(fifo, bytes), + "Reached %s when there's not enough space in the fifo", __func__); + + nfrags++; + + iov[0].iov_offset = fifo->head; + iov[0].iov_len = bytes; + fifo->head += bytes; + + if (fifo->head > fifo->size) { + /* If the allocation did not fit in the tail fragment of the + * FIFO, also use the head fragment. + */ + nfrags++; + overflow = fifo->head - fifo->size; + iov[0].iov_len -= overflow; + iov[1].iov_offset = 0; /* Start of fifo*/ + iov[1].iov_len = overflow; + + fifo->head = overflow; + } + + /* Re-align to a cacheline boundary */ + aligned_head = L1_CACHE_ALIGN(fifo->head); + padding = aligned_head - fifo->head; + iov[nfrags - 1].iov_padding = padding; + atomic_sub(bytes + padding, &fifo->available); + fifo->head = aligned_head; + + if (fifo->head == fifo->size) + fifo->head = 0; + + return nfrags; +} + +/* gve_tx_free_fifo - Return space to Tx FIFO + * @fifo: FIFO to return fragments to + * @bytes: Bytes to free + */ +static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) +{ + atomic_add(bytes, &fifo->available); +} + +static void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ + struct gve_notify_block *block = + &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; + + block->tx = NULL; +} + +static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, + u32 to_do, bool try_to_wake); + +static void gve_tx_free_ring(struct gve_priv *priv, int idx) +{ + struct gve_tx_ring *tx = &priv->tx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t bytes; + u32 slots; + + gve_tx_remove_from_block(priv, idx); + slots = tx->mask + 1; + gve_clean_tx_done(priv, tx, tx->req, false); + netdev_tx_reset_queue(tx->netdev_txq); + + dma_free_coherent(hdev, sizeof(*tx->q_resources), + tx->q_resources, tx->q_resources_bus); + tx->q_resources = NULL; + + gve_tx_fifo_release(priv, &tx->tx_fifo); + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); + tx->tx_fifo.qpl = NULL; + + bytes = sizeof(*tx->desc) * slots; + dma_free_coherent(hdev, bytes, tx->desc, tx->bus); + tx->desc = NULL; + + vfree(tx->info); + tx->info = NULL; + + netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); +} + +static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) +{ + int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + struct gve_tx_ring *tx = &priv->tx[queue_idx]; + + block->tx = tx; + tx->ntfy_id = ntfy_idx; +} + +static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) +{ + struct gve_tx_ring *tx = &priv->tx[idx]; + struct device *hdev = &priv->pdev->dev; + u32 slots = priv->tx_desc_cnt; + size_t bytes; + + /* Make sure everything is zeroed to start */ + memset(tx, 0, sizeof(*tx)); + tx->q_num = idx; + + tx->mask = slots - 1; + + /* alloc metadata */ + tx->info = vzalloc(sizeof(*tx->info) * slots); + if (!tx->info) + return -ENOMEM; + + /* alloc tx queue */ + bytes = sizeof(*tx->desc) * slots; + tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL); + if (!tx->desc) + goto abort_with_info; + + tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); + if (!tx->tx_fifo.qpl) + goto abort_with_desc; + + /* map Tx FIFO */ + if (gve_tx_fifo_init(priv, &tx->tx_fifo)) + goto abort_with_qpl; + + tx->q_resources = + dma_alloc_coherent(hdev, + sizeof(*tx->q_resources), + &tx->q_resources_bus, + GFP_KERNEL); + if (!tx->q_resources) + goto abort_with_fifo; + + netif_dbg(priv, drv, priv->dev, "tx[%d]->bus=%lx\n", idx, + (unsigned long)tx->bus); + tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx); + gve_tx_add_to_block(priv, idx); + + return 0; + +abort_with_fifo: + gve_tx_fifo_release(priv, &tx->tx_fifo); +abort_with_qpl: + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); +abort_with_desc: + dma_free_coherent(hdev, bytes, tx->desc, tx->bus); + tx->desc = NULL; +abort_with_info: + vfree(tx->info); + tx->info = NULL; + return -ENOMEM; +} + +int gve_tx_alloc_rings(struct gve_priv *priv) +{ + int err = 0; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + err = gve_tx_alloc_ring(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to alloc tx ring=%d: err=%d\n", + i, err); + break; + } + } + /* Unallocate if there was an error */ + if (err) { + int j; + + for (j = 0; j < i; j++) + gve_tx_free_ring(priv, j); + } + return err; +} + +void gve_tx_free_rings(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_tx_free_ring(priv, i); +} + +/* gve_tx_avail - Calculates the number of slots available in the ring + * @tx: tx ring to check + * + * Returns the number of slots available + * + * The capacity of the queue is mask + 1. We don't need to reserve an entry. + **/ +static inline u32 gve_tx_avail(struct gve_tx_ring *tx) +{ + return tx->mask + 1 - (tx->req - tx->done); +} + +static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + int pad_bytes, align_hdr_pad; + int bytes; + int hlen; + + hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + + tcp_hdrlen(skb) : skb_headlen(skb); + + pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, + hlen); + /* We need to take into account the header alignment padding. */ + align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen; + bytes = align_hdr_pad + pad_bytes + skb->len; + + return bytes; +} + +/* The most descriptors we could need are 3 - 1 for the headers, 1 for + * the beginning of the payload at the end of the FIFO, and 1 if the + * payload wraps to the beginning of the FIFO. + */ +#define MAX_TX_DESC_NEEDED 3 + +/* Check if sufficient resources (descriptor ring space, FIFO space) are + * available to transmit the given number of bytes. + */ +static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) +{ + return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && + gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required)); +} + +/* Stops the queue if the skb cannot be transmitted. */ +static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb) +{ + int bytes_required; + + bytes_required = gve_skb_fifo_bytes_required(tx, skb); + if (likely(gve_can_tx(tx, bytes_required))) + return 0; + + /* No space, so stop the queue */ + tx->stop_queue++; + netif_tx_stop_queue(tx->netdev_txq); + smp_mb(); /* sync with restarting queue in gve_clean_tx_done() */ + + /* Now check for resources again, in case gve_clean_tx_done() freed + * resources after we checked and we stopped the queue after + * gve_clean_tx_done() checked. + * + * gve_maybe_stop_tx() gve_clean_tx_done() + * nsegs/can_alloc test failed + * gve_tx_free_fifo() + * if (tx queue stopped) + * netif_tx_queue_wake() + * netif_tx_stop_queue() + * Need to check again for space here! + */ + if (likely(!gve_can_tx(tx, bytes_required))) + return -EBUSY; + + netif_tx_start_queue(tx->netdev_txq); + tx->wake_queue++; + return 0; +} + +static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc, + struct sk_buff *skb, bool is_gso, + int l4_hdr_offset, u32 desc_cnt, + u16 hlen, u64 addr) +{ + /* l4_hdr_offset and csum_offset are in units of 16-bit words */ + if (is_gso) { + pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; + pkt_desc->pkt.l4_csum_offset = skb->csum_offset >> 1; + pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1; + } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; + pkt_desc->pkt.l4_csum_offset = skb->csum_offset >> 1; + pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1; + } else { + pkt_desc->pkt.type_flags = GVE_TXD_STD; + pkt_desc->pkt.l4_csum_offset = 0; + pkt_desc->pkt.l4_hdr_offset = 0; + } + pkt_desc->pkt.desc_cnt = desc_cnt; + pkt_desc->pkt.len = cpu_to_be16(skb->len); + pkt_desc->pkt.seg_len = cpu_to_be16(hlen); + pkt_desc->pkt.seg_addr = cpu_to_be64(addr); +} + +static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc, + struct sk_buff *skb, bool is_gso, + u16 len, u64 addr) +{ + seg_desc->seg.type_flags = GVE_TXD_SEG; + if (is_gso) { + if (skb_is_gso_v6(skb)) + seg_desc->seg.type_flags |= GVE_TXSF_IPV6; + seg_desc->seg.l3_offset = skb_network_offset(skb) >> 1; + seg_desc->seg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size); + } + seg_desc->seg.seg_len = cpu_to_be16(len); + seg_desc->seg.seg_addr = cpu_to_be64(addr); +} + +static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses, + u64 iov_offset, u64 iov_len) +{ + u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; + u64 first_page = iov_offset / PAGE_SIZE; + dma_addr_t dma; + u64 page; + + for (page = first_page; page <= last_page; page++) { + dma = page_buses[page]; + dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE); + } +} + +static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb, + struct device *dev) +{ + int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset; + union gve_tx_desc *pkt_desc, *seg_desc; + struct gve_tx_buffer_state *info; + bool is_gso = skb_is_gso(skb); + u32 idx = tx->req & tx->mask; + int payload_iov = 2; + int copy_offset; + u32 next_idx; + int i; + + info = &tx->info[idx]; + pkt_desc = &tx->desc[idx]; + + l4_hdr_offset = skb_checksum_start_offset(skb); + /* If the skb is gso, then we want the tcp header in the first segment + * otherwise we want the linear portion of the skb (which will contain + * the checksum because skb->csum_start and skb->csum_offset are given + * relative to skb->head) in the first segment. + */ + hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : + skb_headlen(skb); + + info->skb = skb; + /* We don't want to split the header, so if necessary, pad to the end + * of the fifo and then put the header at the beginning of the fifo. + */ + pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen); + hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes, + &info->iov[0]); + WARN(!hdr_nfrags, "hdr_nfrags should never be 0!"); + payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen, + &info->iov[payload_iov]); + + gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset, + 1 + payload_nfrags, hlen, + info->iov[hdr_nfrags - 1].iov_offset); + + skb_copy_bits(skb, 0, + tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset, + hlen); + gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, + info->iov[hdr_nfrags - 1].iov_offset, + info->iov[hdr_nfrags - 1].iov_len); + copy_offset = hlen; + + for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { + next_idx = (tx->req + 1 + i - payload_iov) & tx->mask; + seg_desc = &tx->desc[next_idx]; + + gve_tx_fill_seg_desc(seg_desc, skb, is_gso, + info->iov[i].iov_len, + info->iov[i].iov_offset); + + skb_copy_bits(skb, copy_offset, + tx->tx_fifo.base + info->iov[i].iov_offset, + info->iov[i].iov_len); + gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses, + info->iov[i].iov_offset, + info->iov[i].iov_len); + copy_offset += info->iov[i].iov_len; + } + + return 1 + payload_nfrags; +} + +netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_tx_ring *tx; + int nsegs; + + WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues, + "skb queue index out of range"); + tx = &priv->tx[skb_get_queue_mapping(skb)]; + if (unlikely(gve_maybe_stop_tx(tx, skb))) { + /* We need to ring the txq doorbell -- we have stopped the Tx + * queue for want of resources, but prior calls to gve_tx() + * may have added descriptors without ringing the doorbell. + */ + + gve_tx_put_doorbell(priv, tx->q_resources, tx->req); + return NETDEV_TX_BUSY; + } + nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev); + + netdev_tx_sent_queue(tx->netdev_txq, skb->len); + skb_tx_timestamp(skb); + + /* give packets to NIC */ + tx->req += nsegs; + + if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) + return NETDEV_TX_OK; + + gve_tx_put_doorbell(priv, tx->q_resources, tx->req); + return NETDEV_TX_OK; +} + +#define GVE_TX_START_THRESH PAGE_SIZE + +static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, + u32 to_do, bool try_to_wake) +{ + struct gve_tx_buffer_state *info; + u64 pkts = 0, bytes = 0; + size_t space_freed = 0; + struct sk_buff *skb; + int i, j; + u32 idx; + + for (j = 0; j < to_do; j++) { + idx = tx->done & tx->mask; + netif_info(priv, tx_done, priv->dev, + "[%d] %s: idx=%d (req=%u done=%u)\n", + tx->q_num, __func__, idx, tx->req, tx->done); + info = &tx->info[idx]; + skb = info->skb; + + /* Mark as free */ + if (skb) { + info->skb = NULL; + bytes += skb->len; + pkts++; + dev_consume_skb_any(skb); + /* FIFO free */ + for (i = 0; i < ARRAY_SIZE(info->iov); i++) { + space_freed += info->iov[i].iov_len + + info->iov[i].iov_padding; + info->iov[i].iov_len = 0; + info->iov[i].iov_padding = 0; + } + } + tx->done++; + } + + gve_tx_free_fifo(&tx->tx_fifo, space_freed); + u64_stats_update_begin(&tx->statss); + tx->bytes_done += bytes; + tx->pkt_done += pkts; + u64_stats_update_end(&tx->statss); + netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes); + + /* start the queue if we've stopped it */ +#ifndef CONFIG_BQL + /* Make sure that the doorbells are synced */ + smp_mb(); +#endif + if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) && + likely(gve_can_tx(tx, GVE_TX_START_THRESH))) { + tx->wake_queue++; + netif_tx_wake_queue(tx->netdev_txq); + } + + return pkts; +} + +__be32 gve_tx_load_event_counter(struct gve_priv *priv, + struct gve_tx_ring *tx) +{ + u32 counter_index = be32_to_cpu((tx->q_resources->counter_index)); + + return READ_ONCE(priv->counter_array[counter_index]); +} + +bool gve_tx_poll(struct gve_notify_block *block, int budget) +{ + struct gve_priv *priv = block->priv; + struct gve_tx_ring *tx = block->tx; + bool repoll = false; + u32 nic_done; + u32 to_do; + + /* If budget is 0, do all the work */ + if (budget == 0) + budget = INT_MAX; + + /* Find out how much work there is to be done */ + tx->last_nic_done = gve_tx_load_event_counter(priv, tx); + nic_done = be32_to_cpu(tx->last_nic_done); + if (budget > 0) { + /* Do as much work as we have that the budget will + * allow + */ + to_do = min_t(u32, (nic_done - tx->done), budget); + gve_clean_tx_done(priv, tx, to_do, true); + } + /* If we still have work we want to repoll */ + repoll |= (nic_done != tx->done); + return repoll; +} |