diff options
Diffstat (limited to 'drivers/infiniband/hw/hfi1')
92 files changed, 78557 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig new file mode 100644 index 0000000000..14b92e12bf --- /dev/null +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_HFI1 + tristate "Cornelis OPX Gen1 support" + depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML + select MMU_NOTIFIER + select CRC32 + select I2C_ALGOBIT + help + This is a low-level driver for Cornelis OPX Gen1 adapter. +config HFI1_DEBUG_SDMA_ORDER + bool "HFI1 SDMA Order debug" + depends on INFINIBAND_HFI1 + default n + help + This is a debug flag to test for out of order + sdma completions for unit testing +config SDMA_VERBOSITY + bool "Config SDMA Verbosity" + depends on INFINIBAND_HFI1 + default n + help + This is a configuration flag to enable verbose + SDMA debug diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile new file mode 100644 index 0000000000..5d977f3636 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# HFI driver +# +# +# +# Called from the kernel module build system. +# +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o + +hfi1-y := \ + affinity.o \ + aspm.o \ + chip.o \ + device.o \ + driver.o \ + efivar.o \ + eprom.o \ + exp_rcv.o \ + file_ops.o \ + firmware.o \ + init.o \ + intr.o \ + iowait.o \ + ipoib_main.o \ + ipoib_rx.o \ + ipoib_tx.o \ + mad.o \ + mmu_rb.o \ + msix.o \ + netdev_rx.o \ + opfn.o \ + pcie.o \ + pin_system.o \ + pio.o \ + pio_copy.o \ + platform.o \ + qp.o \ + qsfp.o \ + rc.o \ + ruc.o \ + sdma.o \ + sysfs.o \ + tid_rdma.o \ + trace.o \ + uc.o \ + ud.o \ + user_exp_rcv.o \ + user_pages.o \ + user_sdma.o \ + verbs.o \ + verbs_txreq.o \ + vnic_main.o \ + vnic_sdma.o + +ifdef CONFIG_DEBUG_FS +hfi1-y += debugfs.o +ifdef CONFIG_FAULT_INJECTION +ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +hfi1-y += fault.o +endif +endif +endif + +CFLAGS_trace.o = -I$(src) +ifdef MVERSION +CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\" +endif diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c new file mode 100644 index 0000000000..bbc957c578 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -0,0 +1,1192 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ + +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/numa.h> + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" +#include "trace.h" + +struct hfi1_affinity_node_list node_affinity = { + .list = LIST_HEAD_INIT(node_affinity.list), + .lock = __MUTEX_INITIALIZER(node_affinity.lock) +}; + +/* Name of IRQ types, indexed by enum irq_type */ +static const char * const irq_type_names[] = { + "SDMA", + "RCVCTXT", + "NETDEVCTXT", + "GENERAL", + "OTHER", +}; + +/* Per NUMA node count of HFI devices */ +static unsigned int *hfi1_per_node_cntr; + +static inline void init_cpu_mask_set(struct cpu_mask_set *set) +{ + cpumask_clear(&set->mask); + cpumask_clear(&set->used); + set->gen = 0; +} + +/* Increment generation of CPU set if needed */ +static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set) +{ + if (cpumask_equal(&set->mask, &set->used)) { + /* + * We've used up all the CPUs, bump up the generation + * and reset the 'used' map + */ + set->gen++; + cpumask_clear(&set->used); + } +} + +static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set) +{ + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } +} + +/* Get the first CPU from the list of unused CPUs in a CPU set data structure */ +static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff) +{ + int cpu; + + if (!diff || !set) + return -EINVAL; + + _cpu_mask_set_gen_inc(set); + + /* Find out CPUs left in CPU mask */ + cpumask_andnot(diff, &set->mask, &set->used); + + cpu = cpumask_first(diff); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -EINVAL; + else + cpumask_set_cpu(cpu, &set->used); + + return cpu; +} + +static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu) +{ + if (!set) + return; + + cpumask_clear_cpu(cpu, &set->used); + _cpu_mask_set_gen_dec(set); +} + +/* Initialize non-HT cpu cores mask */ +void init_real_cpu_mask(void) +{ + int possible, curr_cpu, i, ht; + + cpumask_clear(&node_affinity.real_cpu_mask); + + /* Start with cpu online mask as the real cpu mask */ + cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask); + + /* + * Remove HT cores from the real cpu mask. Do this in two steps below. + */ + possible = cpumask_weight(&node_affinity.real_cpu_mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.real_cpu_mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&node_affinity.real_cpu_mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. + */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + } +} + +int node_affinity_init(void) +{ + int node; + struct pci_dev *dev = NULL; + const struct pci_device_id *ids = hfi1_pci_tbl; + + cpumask_clear(&node_affinity.proc.used); + cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); + + node_affinity.proc.gen = 0; + node_affinity.num_core_siblings = + cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.proc.mask) + )); + node_affinity.num_possible_nodes = num_possible_nodes(); + node_affinity.num_online_nodes = num_online_nodes(); + node_affinity.num_online_cpus = num_online_cpus(); + + /* + * The real cpu mask is part of the affinity struct but it has to be + * initialized early. It is needed to calculate the number of user + * contexts in set_up_context_variables(). + */ + init_real_cpu_mask(); + + hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes, + sizeof(*hfi1_per_node_cntr), GFP_KERNEL); + if (!hfi1_per_node_cntr) + return -ENOMEM; + + while (ids->vendor) { + dev = NULL; + while ((dev = pci_get_device(ids->vendor, ids->device, dev))) { + node = pcibus_to_node(dev->bus); + if (node < 0) + goto out; + + hfi1_per_node_cntr[node]++; + } + ids++; + } + + return 0; + +out: + /* + * Invalid PCI NUMA node information found, note it, and populate + * our database 1:1. + */ + pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n"); + pr_err("HFI: System BIOS may need to be upgraded\n"); + for (node = 0; node < node_affinity.num_possible_nodes; node++) + hfi1_per_node_cntr[node] = 1; + + pci_dev_put(dev); + + return 0; +} + +static void node_affinity_destroy(struct hfi1_affinity_node *entry) +{ + free_percpu(entry->comp_vect_affinity); + kfree(entry); +} + +void node_affinity_destroy_all(void) +{ + struct list_head *pos, *q; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + list_for_each_safe(pos, q, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, + list); + list_del(pos); + node_affinity_destroy(entry); + } + mutex_unlock(&node_affinity.lock); + kfree(hfi1_per_node_cntr); +} + +static struct hfi1_affinity_node *node_affinity_allocate(int node) +{ + struct hfi1_affinity_node *entry; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + entry->node = node; + entry->comp_vect_affinity = alloc_percpu(u16); + INIT_LIST_HEAD(&entry->list); + + return entry; +} + +/* + * It appends an entry to the list. + * It *must* be called with node_affinity.lock held. + */ +static void node_affinity_add_tail(struct hfi1_affinity_node *entry) +{ + list_add_tail(&entry->list, &node_affinity.list); +} + +/* It must be called with node_affinity.lock held */ +static struct hfi1_affinity_node *node_affinity_lookup(int node) +{ + struct hfi1_affinity_node *entry; + + list_for_each_entry(entry, &node_affinity.list, list) { + if (entry->node == node) + return entry; + } + + return NULL; +} + +static int per_cpu_affinity_get(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + u16 cntr; + u16 prev_cntr; + int ret_cpu; + + if (!possible_cpumask) { + ret_cpu = -EINVAL; + goto fail; + } + + if (!comp_vect_affinity) { + ret_cpu = -EINVAL; + goto fail; + } + + ret_cpu = cpumask_first(possible_cpumask); + if (ret_cpu >= nr_cpu_ids) { + ret_cpu = -EINVAL; + goto fail; + } + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr < prev_cntr) { + ret_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1; + +fail: + return ret_cpu; +} + +static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + int max_cpu; + u16 cntr; + u16 prev_cntr; + + if (!possible_cpumask) + return -EINVAL; + + if (!comp_vect_affinity) + return -EINVAL; + + max_cpu = cpumask_first(possible_cpumask); + if (max_cpu >= nr_cpu_ids) + return -EINVAL; + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr > prev_cntr) { + max_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1; + + return max_cpu; +} + +/* + * Non-interrupt CPUs are used first, then interrupt CPUs. + * Two already allocated cpu masks must be passed. + */ +static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + cpumask_var_t non_intr_cpus, + cpumask_var_t available_cpus) + __must_hold(&node_affinity.lock) +{ + int cpu; + struct cpu_mask_set *set = dd->comp_vect; + + lockdep_assert_held(&node_affinity.lock); + if (!non_intr_cpus) { + cpu = -1; + goto fail; + } + + if (!available_cpus) { + cpu = -1; + goto fail; + } + + /* Available CPUs for pinning completion vectors */ + _cpu_mask_set_gen_inc(set); + cpumask_andnot(available_cpus, &set->mask, &set->used); + + /* Available CPUs without SDMA engine interrupts */ + cpumask_andnot(non_intr_cpus, available_cpus, + &entry->def_intr.used); + + /* If there are non-interrupt CPUs available, use them first */ + if (!cpumask_empty(non_intr_cpus)) + cpu = cpumask_first(non_intr_cpus); + else /* Otherwise, use interrupt CPUs */ + cpu = cpumask_first(available_cpus); + + if (cpu >= nr_cpu_ids) { /* empty */ + cpu = -1; + goto fail; + } + cpumask_set_cpu(cpu, &set->used); + +fail: + return cpu; +} + +static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu) +{ + struct cpu_mask_set *set = dd->comp_vect; + + if (cpu < 0) + return; + + cpu_mask_set_put(set, cpu); +} + +/* _dev_comp_vect_mappings_destroy() is reentrant */ +static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd) +{ + int i, cpu; + + if (!dd->comp_vect_mappings) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = dd->comp_vect_mappings[i]; + _dev_comp_vect_cpu_put(dd, cpu); + dd->comp_vect_mappings[i] = -1; + hfi1_cdbg(AFFINITY, + "[%s] Release CPU %d from completion vector %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i); + } + + kfree(dd->comp_vect_mappings); + dd->comp_vect_mappings = NULL; +} + +/* + * This function creates the table for looking up CPUs for completion vectors. + * num_comp_vectors needs to have been initilized before calling this function. + */ +static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu, ret; + cpumask_var_t non_intr_cpus; + cpumask_var_t available_cpus; + + lockdep_assert_held(&node_affinity.lock); + + if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) { + free_cpumask_var(non_intr_cpus); + return -ENOMEM; + } + + dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus, + sizeof(*dd->comp_vect_mappings), + GFP_KERNEL); + if (!dd->comp_vect_mappings) { + ret = -ENOMEM; + goto fail; + } + for (i = 0; i < dd->comp_vect_possible_cpus; i++) + dd->comp_vect_mappings[i] = -1; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus, + available_cpus); + if (cpu < 0) { + ret = -EINVAL; + goto fail; + } + + dd->comp_vect_mappings[i] = cpu; + hfi1_cdbg(AFFINITY, + "[%s] Completion Vector %d -> CPU %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); + } + + free_cpumask_var(available_cpus); + free_cpumask_var(non_intr_cpus); + return 0; + +fail: + free_cpumask_var(available_cpus); + free_cpumask_var(non_intr_cpus); + _dev_comp_vect_mappings_destroy(dd); + + return ret; +} + +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd) +{ + int ret; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) { + ret = -EINVAL; + goto unlock; + } + ret = _dev_comp_vect_mappings_create(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + + return ret; +} + +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd) +{ + _dev_comp_vect_mappings_destroy(dd); +} + +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + + if (!dd->comp_vect_mappings) + return -EINVAL; + if (comp_vect >= dd->comp_vect_possible_cpus) + return -EINVAL; + + return dd->comp_vect_mappings[comp_vect]; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + bool first_dev_init) + __must_hold(&node_affinity.lock) +{ + int i, j, curr_cpu; + int possible_cpus_comp_vect = 0; + struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask; + + lockdep_assert_held(&node_affinity.lock); + /* + * If there's only one CPU available for completion vectors, then + * there will only be one completion vector available. Othewise, + * the number of completion vector available will be the number of + * available CPUs divide it by the number of devices in the + * local NUMA node. + */ + if (cpumask_weight(&entry->comp_vect_mask) == 1) { + possible_cpus_comp_vect = 1; + dd_dev_warn(dd, + "Number of kernel receive queues is too large for completion vector affinity to be effective\n"); + } else { + possible_cpus_comp_vect += + cpumask_weight(&entry->comp_vect_mask) / + hfi1_per_node_cntr[dd->node]; + + /* + * If the completion vector CPUs available doesn't divide + * evenly among devices, then the first device device to be + * initialized gets an extra CPU. + */ + if (first_dev_init && + cpumask_weight(&entry->comp_vect_mask) % + hfi1_per_node_cntr[dd->node] != 0) + possible_cpus_comp_vect++; + } + + dd->comp_vect_possible_cpus = possible_cpus_comp_vect; + + /* Reserving CPUs for device completion vector */ + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask, + entry->comp_vect_affinity); + if (curr_cpu < 0) + goto fail; + + cpumask_set_cpu(curr_cpu, dev_comp_vect_mask); + } + + hfi1_cdbg(AFFINITY, + "[%s] Completion vector affinity CPU set(s) %*pbl", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), + cpumask_pr_args(dev_comp_vect_mask)); + + return 0; + +fail: + for (j = 0; j < i; j++) + per_cpu_affinity_put_max(&entry->comp_vect_mask, + entry->comp_vect_affinity); + + return curr_cpu; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu; + + lockdep_assert_held(&node_affinity.lock); + if (!dd->comp_vect_possible_cpus) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask, + entry->comp_vect_affinity); + /* Clearing CPU in device completion vector cpu mask */ + if (cpu >= 0) + cpumask_clear_cpu(cpu, &dd->comp_vect->mask); + } + + dd->comp_vect_possible_cpus = 0; +} + +/* + * Interrupt affinity. + * + * non-rcv avail gets a default mask that + * starts as possible cpus with threads reset + * and each rcv avail reset. + * + * rcv avail gets node relative 1 wrapping back + * to the node relative 1 as necessary. + * + */ +int hfi1_dev_affinity_init(struct hfi1_devdata *dd) +{ + struct hfi1_affinity_node *entry; + const struct cpumask *local_mask; + int curr_cpu, possible, i, ret; + bool new_entry = false; + + local_mask = cpumask_of_node(dd->node); + if (cpumask_first(local_mask) >= nr_cpu_ids) + local_mask = topology_core_cpumask(0); + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + + /* + * If this is the first time this NUMA node's affinity is used, + * create an entry in the global affinity structure and initialize it. + */ + if (!entry) { + entry = node_affinity_allocate(dd->node); + if (!entry) { + dd_dev_err(dd, + "Unable to allocate global affinity node\n"); + ret = -ENOMEM; + goto fail; + } + new_entry = true; + + init_cpu_mask_set(&entry->def_intr); + init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->comp_vect_mask); + cpumask_clear(&entry->general_intr_mask); + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, + local_mask); + + /* fill in the receive list */ + possible = cpumask_weight(&entry->def_intr.mask); + curr_cpu = cpumask_first(&entry->def_intr.mask); + + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + } else { + /* + * The general/control context will be the first CPU in + * the default list, so it is removed from the default + * list and added to the general interrupt list. + */ + cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. + */ + for (i = 0; + i < (dd->n_krcv_queues - 1) * + hfi1_per_node_cntr[dd->node]; + i++) { + cpumask_clear_cpu(curr_cpu, + &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, + &entry->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } + + /* + * If there ends up being 0 CPU cores leftover for SDMA + * engines, use the same CPU cores as general/control + * context. + */ + if (cpumask_empty(&entry->def_intr.mask)) + cpumask_copy(&entry->def_intr.mask, + &entry->general_intr_mask); + } + + /* Determine completion vector CPUs for the entire node */ + cpumask_and(&entry->comp_vect_mask, + &node_affinity.real_cpu_mask, local_mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->rcv_intr.mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->general_intr_mask); + + /* + * If there ends up being 0 CPU cores leftover for completion + * vectors, use the same CPU core as the general/control + * context. + */ + if (cpumask_empty(&entry->comp_vect_mask)) + cpumask_copy(&entry->comp_vect_mask, + &entry->general_intr_mask); + } + + ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry); + if (ret < 0) + goto fail; + + if (new_entry) + node_affinity_add_tail(entry); + + dd->affinity_entry = entry; + mutex_unlock(&node_affinity.lock); + + return 0; + +fail: + if (new_entry) + node_affinity_destroy(entry); + mutex_unlock(&node_affinity.lock); + return ret; +} + +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) +{ + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + if (!dd->affinity_entry) + goto unlock; + entry = node_affinity_lookup(dd->node); + if (!entry) + goto unlock; + + /* + * Free device completion vector CPUs to be used by future + * completion vectors + */ + _dev_comp_vect_cpu_mask_clean_up(dd, entry); +unlock: + dd->affinity_entry = NULL; + mutex_unlock(&node_affinity.lock); +} + +/* + * Function updates the irq affinity hint for msix after it has been changed + * by the user using the /proc/irq interface. This function only accepts + * one cpu in the mask. + */ +static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) +{ + struct sdma_engine *sde = msix->arg; + struct hfi1_devdata *dd = sde->dd; + struct hfi1_affinity_node *entry; + struct cpu_mask_set *set; + int i, old_cpu; + + if (cpu > num_online_cpus() || cpu == sde->cpu) + return; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) + goto unlock; + + old_cpu = sde->cpu; + sde->cpu = cpu; + cpumask_clear(&msix->mask); + cpumask_set_cpu(cpu, &msix->mask); + dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n", + msix->irq, irq_type_names[msix->type], + sde->this_idx, cpu); + irq_set_affinity_hint(msix->irq, &msix->mask); + + /* + * Set the new cpu in the hfi1_affinity_node and clean + * the old cpu if it is not used by any other IRQ + */ + set = &entry->def_intr; + cpumask_set_cpu(cpu, &set->mask); + cpumask_set_cpu(cpu, &set->used); + for (i = 0; i < dd->msix_info.max_requested; i++) { + struct hfi1_msix_entry *other_msix; + + other_msix = &dd->msix_info.msix_entries[i]; + if (other_msix->type != IRQ_SDMA || other_msix == msix) + continue; + + if (cpumask_test_cpu(old_cpu, &other_msix->mask)) + goto unlock; + } + cpumask_clear_cpu(old_cpu, &set->mask); + cpumask_clear_cpu(old_cpu, &set->used); +unlock: + mutex_unlock(&node_affinity.lock); +} + +static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + int cpu = cpumask_first(mask); + struct hfi1_msix_entry *msix = container_of(notify, + struct hfi1_msix_entry, + notify); + + /* Only one CPU configuration supported currently */ + hfi1_update_sdma_affinity(msix, cpu); +} + +static void hfi1_irq_notifier_release(struct kref *ref) +{ + /* + * This is required by affinity notifier. We don't have anything to + * free here. + */ +} + +static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix) +{ + struct irq_affinity_notify *notify = &msix->notify; + + notify->irq = msix->irq; + notify->notify = hfi1_irq_notifier_notify; + notify->release = hfi1_irq_notifier_release; + + if (irq_set_affinity_notifier(notify->irq, notify)) + pr_err("Failed to register sdma irq affinity notifier for irq %d\n", + notify->irq); +} + +static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix) +{ + struct irq_affinity_notify *notify = &msix->notify; + + if (irq_set_affinity_notifier(notify->irq, NULL)) + pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n", + notify->irq); +} + +/* + * Function sets the irq affinity for msix. + * It *must* be called with node_affinity.lock held. + */ +static int get_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix) +{ + cpumask_var_t diff; + struct hfi1_affinity_node *entry; + struct cpu_mask_set *set = NULL; + struct sdma_engine *sde = NULL; + struct hfi1_ctxtdata *rcd = NULL; + char extra[64]; + int cpu = -1; + + extra[0] = '\0'; + cpumask_clear(&msix->mask); + + entry = node_affinity_lookup(dd->node); + + switch (msix->type) { + case IRQ_SDMA: + sde = (struct sdma_engine *)msix->arg; + scnprintf(extra, 64, "engine %u", sde->this_idx); + set = &entry->def_intr; + break; + case IRQ_GENERAL: + cpu = cpumask_first(&entry->general_intr_mask); + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + if (rcd->ctxt == HFI1_CTRL_CTXT) + cpu = cpumask_first(&entry->general_intr_mask); + else + set = &entry->rcv_intr; + scnprintf(extra, 64, "ctxt %u", rcd->ctxt); + break; + case IRQ_NETDEVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + set = &entry->def_intr; + scnprintf(extra, 64, "ctxt %u", rcd->ctxt); + break; + default: + dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); + return -EINVAL; + } + + /* + * The general and control contexts are placed on a particular + * CPU, which is set above. Skip accounting for it. Everything else + * finds its CPU here. + */ + if (cpu == -1 && set) { + if (!zalloc_cpumask_var(&diff, GFP_KERNEL)) + return -ENOMEM; + + cpu = cpu_mask_set_get_first(set, diff); + if (cpu < 0) { + free_cpumask_var(diff); + dd_dev_err(dd, "Failure to obtain CPU for IRQ\n"); + return cpu; + } + + free_cpumask_var(diff); + } + + cpumask_set_cpu(cpu, &msix->mask); + dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n", + msix->irq, irq_type_names[msix->type], + extra, cpu); + irq_set_affinity_hint(msix->irq, &msix->mask); + + if (msix->type == IRQ_SDMA) { + sde->cpu = cpu; + hfi1_setup_sdma_notifier(msix); + } + + return 0; +} + +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) +{ + int ret; + + mutex_lock(&node_affinity.lock); + ret = get_irq_affinity(dd, msix); + mutex_unlock(&node_affinity.lock); + return ret; +} + +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix) +{ + struct cpu_mask_set *set = NULL; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + + switch (msix->type) { + case IRQ_SDMA: + set = &entry->def_intr; + hfi1_cleanup_sdma_notifier(msix); + break; + case IRQ_GENERAL: + /* Don't do accounting for general contexts */ + break; + case IRQ_RCVCTXT: { + struct hfi1_ctxtdata *rcd = msix->arg; + + /* Don't do accounting for control contexts */ + if (rcd->ctxt != HFI1_CTRL_CTXT) + set = &entry->rcv_intr; + break; + } + case IRQ_NETDEVCTXT: + set = &entry->def_intr; + break; + default: + mutex_unlock(&node_affinity.lock); + return; + } + + if (set) { + cpumask_andnot(&set->used, &set->used, &msix->mask); + _cpu_mask_set_gen_dec(set); + } + + irq_set_affinity_hint(msix->irq, NULL); + cpumask_clear(&msix->mask); + mutex_unlock(&node_affinity.lock); +} + +/* This should be called with node_affinity.lock held */ +static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask, + struct hfi1_affinity_node_list *affinity) +{ + int possible, curr_cpu, i; + uint num_cores_per_socket = node_affinity.num_online_cpus / + affinity->num_core_siblings / + node_affinity.num_online_nodes; + + cpumask_copy(hw_thread_mask, &affinity->proc.mask); + if (affinity->num_core_siblings > 0) { + /* Removing other siblings not needed for now */ + possible = cpumask_weight(hw_thread_mask); + curr_cpu = cpumask_first(hw_thread_mask); + for (i = 0; + i < num_cores_per_socket * node_affinity.num_online_nodes; + i++) + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, hw_thread_mask); + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + } + + /* Identifying correct HW threads within physical cores */ + cpumask_shift_left(hw_thread_mask, hw_thread_mask, + num_cores_per_socket * + node_affinity.num_online_nodes * + hw_thread_no); + } +} + +int hfi1_get_proc_affinity(int node) +{ + int cpu = -1, ret, i; + struct hfi1_affinity_node *entry; + cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; + const struct cpumask *node_mask, + *proc_mask = current->cpus_ptr; + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set = &affinity->proc; + + /* + * check whether process/context affinity has already + * been set + */ + if (current->nr_cpus_allowed == 1) { + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", + current->pid, current->comm, + cpumask_pr_args(proc_mask)); + /* + * Mark the pre-set CPU as used. This is atomic so we don't + * need the lock + */ + cpu = cpumask_first(proc_mask); + cpumask_set_cpu(cpu, &set->used); + goto done; + } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) { + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", + current->pid, current->comm, + cpumask_pr_args(proc_mask)); + goto done; + } + + /* + * The process does not have a preset CPU affinity so find one to + * recommend using the following algorithm: + * + * For each user process that is opening a context on HFI Y: + * a) If all cores are filled, reinitialize the bitmask + * b) Fill real cores first, then HT cores (First set of HT + * cores on all physical cores, then second set of HT core, + * and, so on) in the following order: + * + * 1. Same NUMA node as HFI Y and not running an IRQ + * handler + * 2. Same NUMA node as HFI Y and running an IRQ handler + * 3. Different NUMA node to HFI Y and not running an IRQ + * handler + * 4. Different NUMA node to HFI Y and running an IRQ + * handler + * c) Mark core as filled in the bitmask. As user processes are + * done, clear cores from the bitmask. + */ + + ret = zalloc_cpumask_var(&diff, GFP_KERNEL); + if (!ret) + goto done; + ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL); + if (!ret) + goto free_diff; + ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL); + if (!ret) + goto free_hw_thread_mask; + ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL); + if (!ret) + goto free_available_mask; + + mutex_lock(&affinity->lock); + /* + * If we've used all available HW threads, clear the mask and start + * overloading. + */ + _cpu_mask_set_gen_inc(set); + + /* + * If NUMA node has CPUs used by interrupt handlers, include them in the + * interrupt handler mask. + */ + entry = node_affinity_lookup(node); + if (entry) { + cpumask_copy(intrs_mask, (entry->def_intr.gen ? + &entry->def_intr.mask : + &entry->def_intr.used)); + cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ? + &entry->rcv_intr.mask : + &entry->rcv_intr.used)); + cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask); + } + hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", + cpumask_pr_args(intrs_mask)); + + cpumask_copy(hw_thread_mask, &set->mask); + + /* + * If HT cores are enabled, identify which HW threads within the + * physical cores should be used. + */ + if (affinity->num_core_siblings > 0) { + for (i = 0; i < affinity->num_core_siblings; i++) { + find_hw_thread_mask(i, hw_thread_mask, affinity); + + /* + * If there's at least one available core for this HW + * thread number, stop looking for a core. + * + * diff will always be not empty at least once in this + * loop as the used mask gets reset when + * (set->mask == set->used) before this loop. + */ + cpumask_andnot(diff, hw_thread_mask, &set->used); + if (!cpumask_empty(diff)) + break; + } + } + hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl", + cpumask_pr_args(hw_thread_mask)); + + node_mask = cpumask_of_node(node); + hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node, + cpumask_pr_args(node_mask)); + + /* Get cpumask of available CPUs on preferred NUMA */ + cpumask_and(available_mask, hw_thread_mask, node_mask); + cpumask_andnot(available_mask, available_mask, &set->used); + hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node, + cpumask_pr_args(available_mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. Then, CPUs running interrupt + * handlers are used. + * + * 1) If diff is not empty, then there are CPUs not running + * non-interrupt handlers available, so diff gets copied + * over to available_mask. + * 2) If diff is empty, then all CPUs not running interrupt + * handlers are taken, so available_mask contains all + * available CPUs running interrupt handlers. + * 3) If available_mask is empty, then all CPUs on the + * preferred NUMA node are taken, so other NUMA nodes are + * used for process assignments using the same method as + * the preferred NUMA node. + */ + cpumask_andnot(diff, available_mask, intrs_mask); + if (!cpumask_empty(diff)) + cpumask_copy(available_mask, diff); + + /* If we don't have CPUs on the preferred node, use other NUMA nodes */ + if (cpumask_empty(available_mask)) { + cpumask_andnot(available_mask, hw_thread_mask, &set->used); + /* Excluding preferred NUMA cores */ + cpumask_andnot(available_mask, available_mask, node_mask); + hfi1_cdbg(PROC, + "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl", + cpumask_pr_args(available_mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. + */ + cpumask_andnot(diff, available_mask, intrs_mask); + if (!cpumask_empty(diff)) + cpumask_copy(available_mask, diff); + } + hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl", + cpumask_pr_args(available_mask)); + + cpu = cpumask_first(available_mask); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -1; + else + cpumask_set_cpu(cpu, &set->used); + + mutex_unlock(&affinity->lock); + hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu); + + free_cpumask_var(intrs_mask); +free_available_mask: + free_cpumask_var(available_mask); +free_hw_thread_mask: + free_cpumask_var(hw_thread_mask); +free_diff: + free_cpumask_var(diff); +done: + return cpu; +} + +void hfi1_put_proc_affinity(int cpu) +{ + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set = &affinity->proc; + + if (cpu < 0) + return; + + mutex_lock(&affinity->lock); + cpu_mask_set_put(set, cpu); + hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); + mutex_unlock(&affinity->lock); +} diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h new file mode 100644 index 0000000000..00854f2178 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ + +#ifndef _HFI1_AFFINITY_H +#define _HFI1_AFFINITY_H + +#include "hfi.h" + +enum irq_type { + IRQ_SDMA, + IRQ_RCVCTXT, + IRQ_NETDEVCTXT, + IRQ_GENERAL, + IRQ_OTHER +}; + +/* Can be used for both memory and cpu */ +enum affinity_flags { + AFF_AUTO, + AFF_NUMA_LOCAL, + AFF_DEV_LOCAL, + AFF_IRQ_LOCAL +}; + +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_msix_entry; + +/* Initialize non-HT cpu cores mask */ +void init_real_cpu_mask(void); +/* Initialize driver affinity data */ +int hfi1_dev_affinity_init(struct hfi1_devdata *dd); +/* + * Set IRQ affinity to a CPU. The function will determine the + * CPU and set the affinity to it. + */ +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix); +/* + * Remove the IRQ's CPU affinity. This function also updates + * any internal CPU tracking data + */ +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix); +/* + * Determine a CPU affinity for a user process, if the process does not + * have an affinity set yet. + */ +int hfi1_get_proc_affinity(int node); +/* Release a CPU used by a user process. */ +void hfi1_put_proc_affinity(int cpu); + +struct hfi1_affinity_node { + int node; + u16 __percpu *comp_vect_affinity; + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpumask general_intr_mask; + struct cpumask comp_vect_mask; + struct list_head list; +}; + +struct hfi1_affinity_node_list { + struct list_head list; + struct cpumask real_cpu_mask; + struct cpu_mask_set proc; + int num_core_siblings; + int num_possible_nodes; + int num_online_nodes; + int num_online_cpus; + struct mutex lock; /* protects affinity nodes */ +}; + +int node_affinity_init(void); +void node_affinity_destroy_all(void); +extern struct hfi1_affinity_node_list node_affinity; +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd); +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect); +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd); +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd); + +#endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c new file mode 100644 index 0000000000..a3c53be407 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/aspm.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2019 Intel Corporation. + * + */ + +#include "aspm.h" + +/* Time after which the timer interrupt will re-enable ASPM */ +#define ASPM_TIMER_MS 1000 +/* Time for which interrupts are ignored after a timer has been scheduled */ +#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2) +/* Two interrupts within this time trigger ASPM disable */ +#define ASPM_TRIGGER_MS 1 +#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull) +#define ASPM_L1_SUPPORTED(reg) \ + ((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2) + +uint aspm_mode = ASPM_MODE_DISABLED; +module_param_named(aspm, aspm_mode, uint, 0444); +MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); + +static bool aspm_hw_l1_supported(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u32 up, dn; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return false; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn); + dn = ASPM_L1_SUPPORTED(dn); + + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up); + up = ASPM_L1_SUPPORTED(up); + + /* ASPM works on A-step but is reported as not supported */ + return (!!dn || is_ax(dd)) && !!up; +} + +/* Set L1 entrance latency for slower entry to L1 */ +static void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd) +{ + u32 l1_ent_lat = 0x4u; + u32 reg32; + + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32); + reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK; + reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32); +} + +static void aspm_hw_enable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return; + + /* Enable ASPM L1 first in upstream component and then downstream */ + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); +} + +void aspm_hw_disable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* Disable ASPM L1 first in downstream component and then upstream */ + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); + if (parent) + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); +} + +static void aspm_enable(struct hfi1_devdata *dd) +{ + if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED || + !dd->aspm_supported) + return; + + aspm_hw_enable_l1(dd); + dd->aspm_enabled = true; +} + +static void aspm_disable(struct hfi1_devdata *dd) +{ + if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED) + return; + + aspm_hw_disable_l1(dd); + dd->aspm_enabled = false; +} + +static void aspm_disable_inc(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + aspm_disable(dd); + atomic_inc(&dd->aspm_disabled_cnt); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +static void aspm_enable_dec(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + if (atomic_dec_and_test(&dd->aspm_disabled_cnt)) + aspm_enable(dd); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +/* ASPM processing for each receive context interrupt */ +void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd) +{ + bool restart_timer; + bool close_interrupts; + unsigned long flags; + ktime_t now, prev; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + /* PSM contexts are open */ + if (!rcd->aspm_intr_enable) + goto unlock; + + prev = rcd->aspm_ts_last_intr; + now = ktime_get(); + rcd->aspm_ts_last_intr = now; + + /* An interrupt pair close together in time */ + close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS; + + /* Don't push out our timer till this much time has elapsed */ + restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) > + ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC; + restart_timer = restart_timer && close_interrupts; + + /* Disable ASPM and schedule timer */ + if (rcd->aspm_enabled && close_interrupts) { + aspm_disable_inc(rcd->dd); + rcd->aspm_enabled = false; + restart_timer = true; + } + + if (restart_timer) { + mod_timer(&rcd->aspm_timer, + jiffies + msecs_to_jiffies(ASPM_TIMER_MS)); + rcd->aspm_ts_timer_sched = now; + } +unlock: + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Timer function for re-enabling ASPM in the absence of interrupt activity */ +static void aspm_ctx_timer_function(struct timer_list *t) +{ + struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer); + unsigned long flags; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + aspm_enable_dec(rcd->dd); + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* + * Disable interrupt processing for verbs contexts when PSM or VNIC contexts + * are open. + */ +void aspm_disable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + u16 i; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + del_timer_sync(&rcd->aspm_timer); + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = false; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } + } + + aspm_disable(dd); + atomic_set(&dd->aspm_disabled_cnt, 0); +} + +/* Re-enable interrupt processing for verbs contexts */ +void aspm_enable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + u16 i; + + aspm_enable(dd); + + if (aspm_mode != ASPM_MODE_DYNAMIC) + return; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = true; + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } + } +} + +static void aspm_ctx_init(struct hfi1_ctxtdata *rcd) +{ + spin_lock_init(&rcd->aspm_lock); + timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0); + rcd->aspm_intr_supported = rcd->dd->aspm_supported && + aspm_mode == ASPM_MODE_DYNAMIC && + rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt; +} + +void aspm_init(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + spin_lock_init(&dd->aspm_lock); + dd->aspm_supported = aspm_hw_l1_supported(dd); + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + aspm_ctx_init(rcd); + hfi1_rcd_put(rcd); + } + + /* Start with ASPM disabled */ + aspm_hw_set_l1_ent_latency(dd); + dd->aspm_enabled = false; + aspm_hw_disable_l1(dd); + + /* Now turn on ASPM if configured */ + aspm_enable_all(dd); +} + +void aspm_exit(struct hfi1_devdata *dd) +{ + aspm_disable_all(dd); + + /* Turn on ASPM on exit to conserve power */ + aspm_enable(dd); +} + diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h new file mode 100644 index 0000000000..df295f47b3 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/aspm.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015-2017 Intel Corporation. + */ + +#ifndef _ASPM_H +#define _ASPM_H + +#include "hfi.h" + +extern uint aspm_mode; + +enum aspm_mode { + ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */ + ASPM_MODE_ENABLED = 1, /* ASPM always enabled, power saving mode */ + ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */ +}; + +void aspm_init(struct hfi1_devdata *dd); +void aspm_exit(struct hfi1_devdata *dd); +void aspm_hw_disable_l1(struct hfi1_devdata *dd); +void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd); +void aspm_disable_all(struct hfi1_devdata *dd); +void aspm_enable_all(struct hfi1_devdata *dd); + +static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd) +{ + /* Quickest exit for minimum impact */ + if (likely(!rcd->aspm_intr_supported)) + return; + + __aspm_ctx_disable(rcd); +} + +#endif /* _ASPM_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c new file mode 100644 index 0000000000..0814291a04 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -0,0 +1,15501 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + * Copyright(c) 2021 Cornelis Networks. + */ + +/* + * This file contains all of the code that is specific to the HFI chip + */ + +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/module.h> + +#include "hfi.h" +#include "trace.h" +#include "mad.h" +#include "pio.h" +#include "sdma.h" +#include "eprom.h" +#include "efivar.h" +#include "platform.h" +#include "aspm.h" +#include "affinity.h" +#include "debugfs.h" +#include "fault.h" +#include "netdev.h" + +uint num_vls = HFI1_MAX_VLS_SUPPORTED; +module_param(num_vls, uint, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +/* + * Default time to aggregate two 10K packets from the idle state + * (timer not running). The timer starts at the end of the first packet, + * so only the time for one 10K packet and header plus a bit extra is needed. + * 10 * 1024 + 64 header byte = 10304 byte + * 10304 byte / 12.5 GB/s = 824.32ns + */ +uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */ +module_param(rcv_intr_timeout, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns"); + +uint rcv_intr_count = 16; /* same as qib */ +module_param(rcv_intr_count, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count"); + +ushort link_crc_mask = SUPPORTED_CRCS; +module_param(link_crc_mask, ushort, S_IRUGO); +MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link"); + +uint loopback; +module_param_named(loopback, loopback, uint, S_IRUGO); +MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable"); + +/* Other driver tunables */ +uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/ +static ushort crc_14b_sideband = 1; +static uint use_flr = 1; +uint quick_linkup; /* skip LNI */ + +struct flag_table { + u64 flag; /* the flag */ + char *str; /* description string */ + u16 extra; /* extra information */ + u16 unused0; + u32 unused1; +}; + +/* str must be a string constant */ +#define FLAG_ENTRY(str, extra, flag) {flag, str, extra} +#define FLAG_ENTRY0(str, flag) {flag, str, 0} + +/* Send Error Consequences */ +#define SEC_WRITE_DROPPED 0x1 +#define SEC_PACKET_DROPPED 0x2 +#define SEC_SC_HALTED 0x4 /* per-context only */ +#define SEC_SPC_FREEZE 0x8 /* per-HFI only */ + +#define DEFAULT_KRCVQS 2 +#define MIN_KERNEL_KCTXTS 2 +#define FIRST_KERNEL_KCTXT 1 + +/* + * RSM instance allocation + * 0 - User Fecn Handling + * 1 - Vnic + * 2 - AIP + * 3 - Verbs + */ +#define RSM_INS_FECN 0 +#define RSM_INS_VNIC 1 +#define RSM_INS_AIP 2 +#define RSM_INS_VERBS 3 + +/* Bit offset into the GUID which carries HFI id information */ +#define GUID_HFI_INDEX_SHIFT 39 + +/* extract the emulation revision */ +#define emulator_rev(dd) ((dd)->irev >> 8) +/* parallel and serial emulation versions are 3 and 4 respectively */ +#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3) +#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4) + +/* RSM fields for Verbs */ +/* packet type */ +#define IB_PACKET_TYPE 2ull +#define QW_SHIFT 6ull +/* QPN[7..1] */ +#define QPN_WIDTH 7ull + +/* LRH.BTH: QW 0, OFFSET 48 - for match */ +#define LRH_BTH_QW 0ull +#define LRH_BTH_BIT_OFFSET 48ull +#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off)) +#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET) +#define LRH_BTH_SELECT +#define LRH_BTH_MASK 3ull +#define LRH_BTH_VALUE 2ull + +/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */ +#define LRH_SC_QW 0ull +#define LRH_SC_BIT_OFFSET 56ull +#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off)) +#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET) +#define LRH_SC_MASK 128ull +#define LRH_SC_VALUE 0ull + +/* SC[n..0] QW 0, OFFSET 60 - for select */ +#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull)) + +/* QPN[m+n:1] QW 1, OFFSET 1 */ +#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull)) + +/* RSM fields for AIP */ +/* LRH.BTH above is reused for this rule */ + +/* BTH.DESTQP: QW 1, OFFSET 16 for match */ +#define BTH_DESTQP_QW 1ull +#define BTH_DESTQP_BIT_OFFSET 16ull +#define BTH_DESTQP_OFFSET(off) ((BTH_DESTQP_QW << QW_SHIFT) | (off)) +#define BTH_DESTQP_MATCH_OFFSET BTH_DESTQP_OFFSET(BTH_DESTQP_BIT_OFFSET) +#define BTH_DESTQP_MASK 0xFFull +#define BTH_DESTQP_VALUE 0x81ull + +/* DETH.SQPN: QW 1 Offset 56 for select */ +/* We use 8 most significant Soure QPN bits as entropy fpr AIP */ +#define DETH_AIP_SQPN_QW 3ull +#define DETH_AIP_SQPN_BIT_OFFSET 56ull +#define DETH_AIP_SQPN_OFFSET(off) ((DETH_AIP_SQPN_QW << QW_SHIFT) | (off)) +#define DETH_AIP_SQPN_SELECT_OFFSET \ + DETH_AIP_SQPN_OFFSET(DETH_AIP_SQPN_BIT_OFFSET) + +/* RSM fields for Vnic */ +/* L2_TYPE: QW 0, OFFSET 61 - for match */ +#define L2_TYPE_QW 0ull +#define L2_TYPE_BIT_OFFSET 61ull +#define L2_TYPE_OFFSET(off) ((L2_TYPE_QW << QW_SHIFT) | (off)) +#define L2_TYPE_MATCH_OFFSET L2_TYPE_OFFSET(L2_TYPE_BIT_OFFSET) +#define L2_TYPE_MASK 3ull +#define L2_16B_VALUE 2ull + +/* L4_TYPE QW 1, OFFSET 0 - for match */ +#define L4_TYPE_QW 1ull +#define L4_TYPE_BIT_OFFSET 0ull +#define L4_TYPE_OFFSET(off) ((L4_TYPE_QW << QW_SHIFT) | (off)) +#define L4_TYPE_MATCH_OFFSET L4_TYPE_OFFSET(L4_TYPE_BIT_OFFSET) +#define L4_16B_TYPE_MASK 0xFFull +#define L4_16B_ETH_VALUE 0x78ull + +/* 16B VESWID - for select */ +#define L4_16B_HDR_VESWID_OFFSET ((2 << QW_SHIFT) | (16ull)) +/* 16B ENTROPY - for select */ +#define L2_16B_ENTROPY_OFFSET ((1 << QW_SHIFT) | (32ull)) + +/* defines to build power on SC2VL table */ +#define SC2VL_VAL( \ + num, \ + sc0, sc0val, \ + sc1, sc1val, \ + sc2, sc2val, \ + sc3, sc3val, \ + sc4, sc4val, \ + sc5, sc5val, \ + sc6, sc6val, \ + sc7, sc7val) \ +( \ + ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \ + ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \ + ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \ + ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \ + ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \ + ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \ + ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \ + ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \ +) + +#define DC_SC_VL_VAL( \ + range, \ + e0, e0val, \ + e1, e1val, \ + e2, e2val, \ + e3, e3val, \ + e4, e4val, \ + e5, e5val, \ + e6, e6val, \ + e7, e7val, \ + e8, e8val, \ + e9, e9val, \ + e10, e10val, \ + e11, e11val, \ + e12, e12val, \ + e13, e13val, \ + e14, e14val, \ + e15, e15val) \ +( \ + ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \ + ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \ + ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \ + ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \ + ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \ + ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \ + ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \ + ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \ + ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \ + ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \ + ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \ + ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \ + ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \ + ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \ + ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \ + ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \ +) + +/* all CceStatus sub-block freeze bits */ +#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \ + | CCE_STATUS_RXE_FROZE_SMASK \ + | CCE_STATUS_TXE_FROZE_SMASK \ + | CCE_STATUS_TXE_PIO_FROZE_SMASK) +/* all CceStatus sub-block TXE pause bits */ +#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \ + | CCE_STATUS_TXE_PAUSED_SMASK \ + | CCE_STATUS_SDMA_PAUSED_SMASK) +/* all CceStatus sub-block RXE pause bits */ +#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK + +#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL +#define CNTR_32BIT_MAX 0x00000000FFFFFFFF + +/* + * CCE Error flags. + */ +static struct flag_table cce_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CceCsrParityErr", + CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr", + CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK), +/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr", + CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr", + CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK), +/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK), +/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK), +/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK), +/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK), +/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK), +/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK), +/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK), +/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK), +/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK), +/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr", + CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK), +/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK), +/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK), +/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK), +/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY0("PcicReceiveParityErr", + CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK), +/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr", + CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK), +/*31*/ FLAG_ENTRY0("LATriggered", + CCE_ERR_STATUS_LA_TRIGGERED_SMASK), +/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK), +/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK), +/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr", + CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK), +/*36*/ FLAG_ENTRY0("CceMsixTableCorErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK), +/*37*/ FLAG_ENTRY0("CceMsixTableUncErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK), +/*38*/ FLAG_ENTRY0("CceIntMapCorErr", + CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK), +/*39*/ FLAG_ENTRY0("CceIntMapUncErr", + CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK), +/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr", + CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK), +/*41-63 reserved*/ +}; + +/* + * Misc Error flags + */ +#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK +static struct flag_table misc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)), +/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)), +/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)), +/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)), +/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)), +/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)), +/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)), +/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)), +/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)), +/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)), +/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL)) +}; + +/* + * TXE PIO Error flags and consequences + */ +static struct flag_table pio_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("PioWriteBadCtxt", + SEC_WRITE_DROPPED, + SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK), +/* 1*/ FLAG_ENTRY("PioWriteAddrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY("PioCsrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("PioSbMemFifo0", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK), +/* 4*/ FLAG_ENTRY("PioSbMemFifo1", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK), +/* 5*/ FLAG_ENTRY("PioPccFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY("PioPecFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY("PioSmPktResetParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK), +/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK), +/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY("PioCreditRetFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK), +/*16*/ FLAG_ENTRY("PioPpmcPblFifo", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK), +/*17*/ FLAG_ENTRY("PioInitSmIn", + 0, + SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK), +/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK), +/*19*/ FLAG_ENTRY("PioHostAddrMemUnc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK), +/*20*/ FLAG_ENTRY("PioHostAddrMemCor", + 0, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK), +/*21*/ FLAG_ENTRY("PioWriteDataParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK), +/*22*/ FLAG_ENTRY("PioStateMachine", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK), +/*23*/ FLAG_ENTRY("PioWriteQwValidParity", + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK), +/*24*/ FLAG_ENTRY("PioBlockQwCountParity", + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK), +/*25*/ FLAG_ENTRY("PioVlfVlLenParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK), +/*26*/ FLAG_ENTRY("PioVlfSopParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK), +/*27*/ FLAG_ENTRY("PioVlFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY("PioPpmcSopLen", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK), +/*30-31 reserved*/ +/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK), +/*33*/ FLAG_ENTRY("PioLastReturnedCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK), +/*34*/ FLAG_ENTRY("PioPccSopHeadParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK), +/*36-63 reserved*/ +}; + +/* TXE PIO errors that cause an SPC freeze */ +#define ALL_PIO_FREEZE_ERR \ + (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK) + +/* + * TXE SDMA Error flags + */ +static struct flag_table sdma_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", + SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr", + SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK), +/*04-63 reserved*/ +}; + +/* TXE SDMA errors that cause an SPC freeze */ +#define ALL_SDMA_FREEZE_ERR \ + (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK) + +/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */ +#define PORT_DISCARD_EGRESS_ERRS \ + (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK) + +/* + * TXE Egress Error flags + */ +#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK +static struct flag_table egress_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)), +/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)), +/* 2 reserved */ +/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr", + SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)), +/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)), +/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)), +/* 6 reserved */ +/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr", + SEES(TX_PIO_LAUNCH_INTF_PARITY)), +/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr", + SEES(TX_SDMA_LAUNCH_INTF_PARITY)), +/* 9-10 reserved */ +/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr", + SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)), +/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)), +/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)), +/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)), +/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)), +/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr", + SEES(TX_SDMA0_DISALLOWED_PACKET)), +/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr", + SEES(TX_SDMA1_DISALLOWED_PACKET)), +/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr", + SEES(TX_SDMA2_DISALLOWED_PACKET)), +/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr", + SEES(TX_SDMA3_DISALLOWED_PACKET)), +/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr", + SEES(TX_SDMA4_DISALLOWED_PACKET)), +/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr", + SEES(TX_SDMA5_DISALLOWED_PACKET)), +/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr", + SEES(TX_SDMA6_DISALLOWED_PACKET)), +/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr", + SEES(TX_SDMA7_DISALLOWED_PACKET)), +/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr", + SEES(TX_SDMA8_DISALLOWED_PACKET)), +/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr", + SEES(TX_SDMA9_DISALLOWED_PACKET)), +/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr", + SEES(TX_SDMA10_DISALLOWED_PACKET)), +/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr", + SEES(TX_SDMA11_DISALLOWED_PACKET)), +/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr", + SEES(TX_SDMA12_DISALLOWED_PACKET)), +/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr", + SEES(TX_SDMA13_DISALLOWED_PACKET)), +/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr", + SEES(TX_SDMA14_DISALLOWED_PACKET)), +/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr", + SEES(TX_SDMA15_DISALLOWED_PACKET)), +/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr", + SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)), +/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr", + SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)), +/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr", + SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)), +/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr", + SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)), +/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr", + SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)), +/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr", + SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)), +/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr", + SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)), +/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr", + SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)), +/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr", + SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)), +/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)), +/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)), +/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)), +/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)), +/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)), +/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)), +/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)), +/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)), +/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)), +/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)), +/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)), +/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)), +/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)), +/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)), +/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)), +/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)), +/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)), +/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)), +/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)), +/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)), +/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)), +/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr", + SEES(TX_READ_SDMA_MEMORY_CSR_UNC)), +/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr", + SEES(TX_READ_PIO_MEMORY_CSR_UNC)), +}; + +/* + * TXE Egress Error Info flags + */ +#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK +static struct flag_table egress_err_info_flags[] = { +/* 0*/ FLAG_ENTRY0("Reserved", 0ull), +/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)), +/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)), +/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)), +/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)), +/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)), +/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)), +/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)), +/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)), +/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)), +/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)), +/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)), +/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)), +/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)), +/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)), +/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)), +/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)), +/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)), +/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)), +/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)), +}; + +/* TXE Egress errors that cause an SPC freeze */ +#define ALL_TXE_EGRESS_FREEZE_ERR \ + (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \ + | SEES(TX_PIO_LAUNCH_INTF_PARITY) \ + | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \ + | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \ + | SEES(TX_LAUNCH_CSR_PARITY) \ + | SEES(TX_SBRD_CTL_CSR_PARITY) \ + | SEES(TX_CONFIG_PARITY) \ + | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \ + | SEES(TX_CREDIT_RETURN_PARITY)) + +/* + * TXE Send error flags + */ +#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK +static struct flag_table send_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR)) +}; + +/* + * TXE Send Context Error flags and consequences + */ +static struct flag_table sc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("InconsistentSop", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK), +/* 1*/ FLAG_ENTRY("DisallowedPacket", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK), +/* 2*/ FLAG_ENTRY("WriteCrossesBoundary", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("WriteOverflow", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK), +/* 4*/ FLAG_ENTRY("WriteOutOfBounds", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK), +/* 5-63 reserved*/ +}; + +/* + * RXE Receive Error flags + */ +#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK +static struct flag_table rxe_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)), +/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)), +/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)), +/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)), +/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)), +/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)), +/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)), +/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)), +/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)), +/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)), +/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)), +/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)), +/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)), +/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)), +/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)), +/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)), +/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr", + RXES(RBUF_LOOKUP_DES_REG_UNC_COR)), +/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)), +/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)), +/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr", + RXES(RBUF_BLOCK_LIST_READ_UNC)), +/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr", + RXES(RBUF_BLOCK_LIST_READ_COR)), +/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr", + RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)), +/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr", + RXES(RBUF_CSR_QENT_CNT_PARITY)), +/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr", + RXES(RBUF_CSR_QNEXT_BUF_PARITY)), +/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr", + RXES(RBUF_CSR_QVLD_BIT_PARITY)), +/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)), +/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)), +/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr", + RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)), +/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)), +/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)), +/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)), +/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)), +/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)), +/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)), +/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)), +/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr", + RXES(RBUF_FL_INITDONE_PARITY)), +/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr", + RXES(RBUF_FL_INIT_WR_ADDR_PARITY)), +/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)), +/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)), +/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)), +/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr", + RXES(LOOKUP_DES_PART1_UNC_COR)), +/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr", + RXES(LOOKUP_DES_PART2_PARITY)), +/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)), +/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)), +/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)), +/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)), +/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)), +/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)), +/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)), +/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)), +/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)), +/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)), +/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)), +/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)), +/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)), +/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)), +/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)), +/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)), +/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)), +/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)), +/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)), +/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)), +/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)), +/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY)) +}; + +/* RXE errors that will trigger an SPC freeze */ +#define ALL_RXE_FREEZE_ERR \ + (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK) + +#define RXE_FREEZE_ABORT_MASK \ + (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK) + +/* + * DCC Error Flags + */ +#define DCCE(name) DCC_ERR_FLG_##name##_SMASK +static struct flag_table dcc_err_flags[] = { + FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)), + FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)), + FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)), + FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)), + FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)), + FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)), + FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)), + FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)), + FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)), + FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)), + FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)), + FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)), + FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)), + FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)), + FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("link_err", DCCE(LINK_ERR)), + FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)), + FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)), + FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)), + FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)), + FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)), + FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)), + FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)), + FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)), + FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)), + FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)), + FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)), + FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)), + FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)), + FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)), + FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)), + FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)), + FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)), + FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)), + FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)), + FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)), + FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)), + FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)), + FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)), +}; + +/* + * LCB error flags + */ +#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK +static struct flag_table lcb_err_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)), +/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)), +/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)), +/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST", + LCBE(ALL_LNS_FAILED_REINIT_TEST)), +/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)), +/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)), +/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)), +/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)), +/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)), +/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)), +/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)), +/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)), +/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)), +/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER", + LCBE(UNEXPECTED_ROUND_TRIP_MARKER)), +/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)), +/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)), +/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)), +/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)), +/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)), +/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE", + LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)), +/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)), +/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)), +/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)), +/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)), +/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)), +/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)), +/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP", + LCBE(RST_FOR_INCOMPLT_RND_TRIP)), +/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)), +/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE", + LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)), +/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR", + LCBE(REDUNDANT_FLIT_PARITY_ERR)) +}; + +/* + * DC8051 Error Flags + */ +#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK +static struct flag_table dc8051_err_flags[] = { + FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)), + FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)), + FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)), + FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)), + FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)), + FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)), + FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)), + FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)), + FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES", + D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)), + FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)), +}; + +/* + * DC8051 Information Error flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field. + */ +static struct flag_table dc8051_info_err_flags[] = { + FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED), + FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), + FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), + FLAG_ENTRY0("Serdes internal loopback failure", + FAILED_SERDES_INTERNAL_LOOPBACK), + FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT), + FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING), + FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE), + FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM), + FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ), + FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1), + FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2), + FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT), + FLAG_ENTRY0("Host Handshake Timeout", HOST_HANDSHAKE_TIMEOUT), + FLAG_ENTRY0("External Device Request Timeout", + EXTERNAL_DEVICE_REQ_TIMEOUT), +}; + +/* + * DC8051 Information Host Information flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field. + */ +static struct flag_table dc8051_info_host_msg_flags[] = { + FLAG_ENTRY0("Host request done", 0x0001), + FLAG_ENTRY0("BC PWR_MGM message", 0x0002), + FLAG_ENTRY0("BC SMA message", 0x0004), + FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008), + FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010), + FLAG_ENTRY0("External device config request", 0x0020), + FLAG_ENTRY0("VerifyCap all frames received", 0x0040), + FLAG_ENTRY0("LinkUp achieved", 0x0080), + FLAG_ENTRY0("Link going down", 0x0100), + FLAG_ENTRY0("Link width downgraded", 0x0200), +}; + +static u32 encoded_size(u32 size); +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate); +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state); +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous); +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes); +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, u16 *link_widths); +static void read_vc_local_link_mode(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths); +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev); +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx); +static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, u8 *max_rate); +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg); +static void handle_dcc_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_lcb_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void set_partition_keys(struct hfi1_pportdata *ppd); +static const char *link_state_name(u32 state); +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, + u32 state); +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data); +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); +static int thermal_init(struct hfi1_devdata *dd); + +static void update_statusp(struct hfi1_pportdata *ppd, u32 state); +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs); +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); +static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); +static void log_physical_state(struct hfi1_pportdata *ppd, u32 state); +static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); +static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd, + int msecs); +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); +static void handle_temp_err(struct hfi1_devdata *dd); +static void dc_shutdown(struct hfi1_devdata *dd); +static void dc_start(struct hfi1_devdata *dd); +static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp, + unsigned int *np); +static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd); +static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms); +static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index); +static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width); + +/* + * Error interrupt table entry. This is used as input to the interrupt + * "clear down" routine used for all second tier error interrupt register. + * Second tier interrupt registers have a single bit representing them + * in the top-level CceIntStatus. + */ +struct err_reg_info { + u32 status; /* status CSR offset */ + u32 clear; /* clear CSR offset */ + u32 mask; /* mask CSR offset */ + void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg); + const char *desc; +}; + +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START) + +/* + * Helpers for building HFI and DC error interrupt table entries. Different + * helpers are needed because of inconsistent register names. + */ +#define EE(reg, handler, desc) \ + { reg##_STATUS, reg##_CLEAR, reg##_MASK, \ + handler, desc } +#define DC_EE1(reg, handler, desc) \ + { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc } +#define DC_EE2(reg, handler, desc) \ + { reg##_FLG, reg##_CLR, reg##_EN, handler, desc } + +/* + * Table of the "misc" grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = { +/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"), +/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"), +/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"), +/* 3*/ { 0, 0, 0, NULL }, /* reserved */ +/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"), +/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"), +/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"), +/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr") + /* the rest are reserved */ +}; + +/* + * Index into the Various section of the interrupt sources + * corresponding to the Critical Temperature interrupt. + */ +#define TCRIT_INT_SOURCE 4 + +/* + * SDMA error interrupt entry - refers to another register containing more + * information. + */ +static const struct err_reg_info sdma_eng_err = + EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr"); + +static const struct err_reg_info various_err[NUM_VARIOUS] = { +/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */ +/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */ +/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"), +/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"), +/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */ + /* rest are reserved */ +}; + +/* + * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG + * register can not be derived from the MTU value because 10K is not + * a power of 2. Therefore, we need a constant. Everything else can + * be calculated. + */ +#define DCC_CFG_PORT_MTU_CAP_10240 7 + +/* + * Table of the DC grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info dc_errs[NUM_DC_ERRS] = { +/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"), +/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"), +/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"), +/* 3*/ /* dc_lbm_int - special, see is_dc_int() */ + /* the rest are reserved */ +}; + +struct cntr_entry { + /* + * counter name + */ + char *name; + + /* + * csr to read for name (if applicable) + */ + u64 csr; + + /* + * offset into dd or ppd to store the counter's value + */ + int offset; + + /* + * flags + */ + u8 flags; + + /* + * accessor for stat element, context either dd or ppd + */ + u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl, + int mode, u64 data); +}; + +#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0 +#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159 + +#define CNTR_ELEM(name, csr, offset, flags, accessor) \ +{ \ + name, \ + csr, \ + offset, \ + flags, \ + accessor \ +} + +/* 32bit RXE */ +#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* 64bit RXE */ +#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + dev_access_u64_csr) + +#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx +#define OVR_ELM(ctx) \ +CNTR_ELEM("RcvHdrOvr" #ctx, \ + (RCV_HDR_OVFL_CNT + ctx * 0x100), \ + 0, CNTR_NORMAL, port_access_u64_csr) + +/* 32bit TXE */ +#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +/* 64bit TXE */ +#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +# define TX64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name,\ + counter * 8 + SEND_COUNTER_ARRAY64, \ + 0, \ + flags, \ + dev_access_u64_csr) + +/* CCE */ +#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_INT_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* DC */ +#define DC_PERF_CNTR(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dev_access_u64_csr) + +#define DC_PERF_CNTR_LCB(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dc_access_lcb_cntr) + +/* ibp counters */ +#define SW_IBP_CNTR(name, cntr) \ +CNTR_ELEM(#name, \ + 0, \ + 0, \ + CNTR_SYNTH, \ + access_ibp_##cntr) + +/** + * hfi1_addr_from_offset - return addr for readq/writeq + * @dd: the dd device + * @offset: the offset of the CSR within bar0 + * + * This routine selects the appropriate base address + * based on the indicated offset. + */ +static inline void __iomem *hfi1_addr_from_offset( + const struct hfi1_devdata *dd, + u32 offset) +{ + if (offset >= dd->base2_start) + return dd->kregbase2 + (offset - dd->base2_start); + return dd->kregbase1 + offset; +} + +/** + * read_csr - read CSR at the indicated offset + * @dd: the dd device + * @offset: the offset of the CSR within bar0 + * + * Return: the value read or all FF's if there + * is no mapping + */ +u64 read_csr(const struct hfi1_devdata *dd, u32 offset) +{ + if (dd->flags & HFI1_PRESENT) + return readq(hfi1_addr_from_offset(dd, offset)); + return -1; +} + +/** + * write_csr - write CSR at the indicated offset + * @dd: the dd device + * @offset: the offset of the CSR within bar0 + * @value: value to write + */ +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value) +{ + if (dd->flags & HFI1_PRESENT) { + void __iomem *base = hfi1_addr_from_offset(dd, offset); + + /* avoid write to RcvArray */ + if (WARN_ON(offset >= RCV_ARRAY && offset < dd->base2_start)) + return; + writeq(value, base); + } +} + +/** + * get_csr_addr - return te iomem address for offset + * @dd: the dd device + * @offset: the offset of the CSR within bar0 + * + * Return: The iomem address to use in subsequent + * writeq/readq operations. + */ +void __iomem *get_csr_addr( + const struct hfi1_devdata *dd, + u32 offset) +{ + if (dd->flags & HFI1_PRESENT) + return hfi1_addr_from_offset(dd, offset); + return NULL; +} + +static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, + int mode, u64 value) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = read_csr(dd, csr); + } else if (mode == CNTR_MODE_W) { + write_csr(dd, csr, value); + ret = value; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode); + return ret; +} + +/* Dev Access */ +static u64 dev_access_u32_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + u64 csr = entry->csr; + + if (entry->flags & CNTR_SDMA) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 0x100 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + return read_write_csr(dd, csr, mode, data); +} + +static u64 access_sde_err_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].err_cnt; + return 0; +} + +static u64 access_sde_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].sdma_int_cnt; + return 0; +} + +static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].idle_int_cnt; + return 0; +} + +static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].progress_int_cnt; + return 0; +} + +static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + u64 val = 0; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + + val = read_write_csr(dd, csr, mode, data); + return val; +} + +static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + u32 csr = entry->csr; + int ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + if (mode == CNTR_MODE_R) + ret = read_lcb_csr(dd, csr, &data); + else if (mode == CNTR_MODE_W) + ret = write_lcb_csr(dd, csr, data); + + if (ret) { + if (!(dd->flags & HFI1_SHUTDOWN)) + dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode); + return data; +} + +/* Port Access */ +static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_csr(ppd->dd, entry->csr, mode, data); +} + +static u64 port_access_u64_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + u64 val; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + val = read_write_csr(ppd->dd, csr, mode, data); + return val; +} + +/* Software defined */ +static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode, + u64 data) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = *cntr; + } else if (mode == CNTR_MODE_W) { + *cntr = data; + ret = data; + } else { + dd_dev_err(dd, "Invalid cntr sw access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode); + + return ret; +} + +static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_downed, mode, data); +} + +static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_up, mode, data); +} + +static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data); +} + +static u64 access_sw_xmit_discards(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + u64 zero = 0; + u64 *counter; + + if (vl == CNTR_INVALID_VL) + counter = &ppd->port_xmit_discards; + else if (vl >= 0 && vl < C_VL_COUNT) + counter = &ppd->port_xmit_discards_vl[vl]; + else + counter = &zero; + + return read_write_sw(ppd->dd, counter, mode, data); +} + +static u64 access_xmit_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors, + mode, data); +} + +static u64 access_rcv_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors, + mode, data); +} + +u64 get_all_cpu_total(u64 __percpu *cntr) +{ + int cpu; + u64 counter = 0; + + for_each_possible_cpu(cpu) + counter += *per_cpu_ptr(cntr, cpu); + return counter; +} + +static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val, + u64 __percpu *cntr, + int vl, int mode, u64 data) +{ + u64 ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + + if (mode == CNTR_MODE_R) { + ret = get_all_cpu_total(cntr) - *z_val; + } else if (mode == CNTR_MODE_W) { + /* A write can only zero the counter */ + if (data == 0) + *z_val = get_all_cpu_total(cntr); + else + dd_dev_err(dd, "Per CPU cntrs can only be zeroed"); + } else { + dd_dev_err(dd, "Invalid cntr sw cpu access mode"); + return 0; + } + + return ret; +} + +static u64 access_sw_cpu_intr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl, + mode, data); +} + +static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl, + mode, data); +} + +static u64 access_sw_pio_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_piowait; +} + +static u64 access_sw_pio_drain(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_piodrain; +} + +static u64 access_sw_ctx0_seq_drop(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->ctx0_seq_drop; +} + +static u64 access_sw_vtx_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_txwait; +} + +static u64 access_sw_kmem_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_kmem_wait; +} + +static u64 access_sw_send_schedule(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl, + mode, data); +} + +/* Software counters for the error status bits within MISC_ERR_STATUS */ +static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[12]; +} + +static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[11]; +} + +static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[10]; +} + +static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[9]; +} + +static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[8]; +} + +static u64 access_misc_efuse_read_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[7]; +} + +static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[6]; +} + +static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[5]; +} + +static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[4]; +} + +static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[3]; +} + +static u64 access_misc_csr_write_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[2]; +} + +static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[1]; +} + +static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[0]; +} + +/* + * Software counter for the aggregate of + * individual CceErrStatus counters + */ +static u64 access_sw_cce_err_status_aggregated_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_cce_err_status_aggregate; +} + +/* + * Software counters corresponding to each of the + * error status bits within CceErrStatus + */ +static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[40]; +} + +static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[39]; +} + +static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[38]; +} + +static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[37]; +} + +static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[36]; +} + +static u64 access_cce_rxdma_conv_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[35]; +} + +static u64 access_cce_rcpl_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[34]; +} + +static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[33]; +} + +static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[32]; +} + +static u64 access_la_triggered_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[31]; +} + +static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[30]; +} + +static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[29]; +} + +static u64 access_pcic_transmit_back_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[28]; +} + +static u64 access_pcic_transmit_front_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[27]; +} + +static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[26]; +} + +static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[25]; +} + +static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[24]; +} + +static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[23]; +} + +static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[22]; +} + +static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[21]; +} + +static u64 access_pcic_n_post_dat_q_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[20]; +} + +static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[19]; +} + +static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[18]; +} + +static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[17]; +} + +static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[16]; +} + +static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[15]; +} + +static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[14]; +} + +static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[13]; +} + +static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[12]; +} + +static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[11]; +} + +static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[10]; +} + +static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[9]; +} + +static u64 access_cce_cli2_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[8]; +} + +static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[7]; +} + +static u64 access_cce_cli0_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[6]; +} + +static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[5]; +} + +static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[4]; +} + +static u64 access_cce_trgt_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[3]; +} + +static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[2]; +} + +static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[1]; +} + +static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within RcvErrStatus + */ +static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[63]; +} + +static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[62]; +} + +static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[61]; +} + +static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[60]; +} + +static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[59]; +} + +static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[58]; +} + +static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[57]; +} + +static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[56]; +} + +static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[55]; +} + +static u64 access_rx_dma_data_fifo_rd_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[54]; +} + +static u64 access_rx_dma_data_fifo_rd_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[53]; +} + +static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[52]; +} + +static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[51]; +} + +static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[50]; +} + +static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[49]; +} + +static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[48]; +} + +static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[47]; +} + +static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[46]; +} + +static u64 access_rx_hq_intr_csr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[45]; +} + +static u64 access_rx_lookup_csr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[44]; +} + +static u64 access_rx_lookup_rcv_array_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[43]; +} + +static u64 access_rx_lookup_rcv_array_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[42]; +} + +static u64 access_rx_lookup_des_part2_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[41]; +} + +static u64 access_rx_lookup_des_part1_unc_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[40]; +} + +static u64 access_rx_lookup_des_part1_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[39]; +} + +static u64 access_rx_rbuf_next_free_buf_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[38]; +} + +static u64 access_rx_rbuf_next_free_buf_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[37]; +} + +static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[36]; +} + +static u64 access_rx_rbuf_fl_initdone_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[35]; +} + +static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[34]; +} + +static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[33]; +} + +static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[32]; +} + +static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[31]; +} + +static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[30]; +} + +static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[29]; +} + +static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[28]; +} + +static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[27]; +} + +static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[26]; +} + +static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[25]; +} + +static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[24]; +} + +static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[23]; +} + +static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[22]; +} + +static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[21]; +} + +static u64 access_rx_rbuf_block_list_read_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[20]; +} + +static u64 access_rx_rbuf_block_list_read_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[19]; +} + +static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[18]; +} + +static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[17]; +} + +static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[16]; +} + +static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[15]; +} + +static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[14]; +} + +static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[13]; +} + +static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[12]; +} + +static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[11]; +} + +static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[10]; +} + +static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[9]; +} + +static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[8]; +} + +static u64 access_rx_rcv_qp_map_table_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[7]; +} + +static u64 access_rx_rcv_qp_map_table_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[6]; +} + +static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[5]; +} + +static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[4]; +} + +static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[3]; +} + +static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[2]; +} + +static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[1]; +} + +static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendPioErrStatus + */ +static u64 access_pio_pec_sop_head_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[35]; +} + +static u64 access_pio_pcc_sop_head_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[34]; +} + +static u64 access_pio_last_returned_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[33]; +} + +static u64 access_pio_current_free_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[32]; +} + +static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[31]; +} + +static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[30]; +} + +static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[29]; +} + +static u64 access_pio_ppmc_bqc_mem_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[28]; +} + +static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[27]; +} + +static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[26]; +} + +static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[25]; +} + +static u64 access_pio_block_qw_count_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[24]; +} + +static u64 access_pio_write_qw_valid_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[23]; +} + +static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[22]; +} + +static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[21]; +} + +static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[20]; +} + +static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[19]; +} + +static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[18]; +} + +static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[17]; +} + +static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[16]; +} + +static u64 access_pio_credit_ret_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[15]; +} + +static u64 access_pio_v1_len_mem_bank1_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[14]; +} + +static u64 access_pio_v1_len_mem_bank0_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[13]; +} + +static u64 access_pio_v1_len_mem_bank1_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[12]; +} + +static u64 access_pio_v1_len_mem_bank0_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[11]; +} + +static u64 access_pio_sm_pkt_reset_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[10]; +} + +static u64 access_pio_pkt_evict_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[9]; +} + +static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[8]; +} + +static u64 access_pio_sbrdctl_crrel_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[7]; +} + +static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[6]; +} + +static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[5]; +} + +static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[4]; +} + +static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[3]; +} + +static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[2]; +} + +static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[1]; +} + +static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendDmaErrStatus + */ +static u64 access_sdma_pcie_req_tracking_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[3]; +} + +static u64 access_sdma_pcie_req_tracking_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[2]; +} + +static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[1]; +} + +static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendEgressErrStatus + */ +static u64 access_tx_read_pio_memory_csr_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[63]; +} + +static u64 access_tx_read_sdma_memory_csr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[62]; +} + +static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[61]; +} + +static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[60]; +} + +static u64 access_tx_read_sdma_memory_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[59]; +} + +static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[58]; +} + +static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[57]; +} + +static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[56]; +} + +static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[55]; +} + +static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[54]; +} + +static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[53]; +} + +static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[52]; +} + +static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[51]; +} + +static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[50]; +} + +static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[49]; +} + +static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[48]; +} + +static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[47]; +} + +static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[46]; +} + +static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[45]; +} + +static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[44]; +} + +static u64 access_tx_read_sdma_memory_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[43]; +} + +static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[42]; +} + +static u64 access_tx_credit_return_partiy_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[41]; +} + +static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[40]; +} + +static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[39]; +} + +static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[38]; +} + +static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[37]; +} + +static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[36]; +} + +static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[35]; +} + +static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[34]; +} + +static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[33]; +} + +static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[32]; +} + +static u64 access_tx_sdma15_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[31]; +} + +static u64 access_tx_sdma14_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[30]; +} + +static u64 access_tx_sdma13_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[29]; +} + +static u64 access_tx_sdma12_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[28]; +} + +static u64 access_tx_sdma11_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[27]; +} + +static u64 access_tx_sdma10_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[26]; +} + +static u64 access_tx_sdma9_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[25]; +} + +static u64 access_tx_sdma8_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[24]; +} + +static u64 access_tx_sdma7_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[23]; +} + +static u64 access_tx_sdma6_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[22]; +} + +static u64 access_tx_sdma5_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[21]; +} + +static u64 access_tx_sdma4_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[20]; +} + +static u64 access_tx_sdma3_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[19]; +} + +static u64 access_tx_sdma2_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[18]; +} + +static u64 access_tx_sdma1_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[17]; +} + +static u64 access_tx_sdma0_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[16]; +} + +static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[15]; +} + +static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[14]; +} + +static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[13]; +} + +static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[12]; +} + +static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[11]; +} + +static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[10]; +} + +static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[9]; +} + +static u64 access_tx_sdma_launch_intf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[8]; +} + +static u64 access_tx_pio_launch_intf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[7]; +} + +static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[6]; +} + +static u64 access_tx_incorrect_link_state_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[5]; +} + +static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[4]; +} + +static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[3]; +} + +static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[2]; +} + +static u64 access_tx_pkt_integrity_mem_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[1]; +} + +static u64 access_tx_pkt_integrity_mem_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendErrStatus + */ +static u64 access_send_csr_write_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[2]; +} + +static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[1]; +} + +static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendCtxtErrStatus + */ +static u64 access_pio_write_out_of_bounds_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[4]; +} + +static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[3]; +} + +static u64 access_pio_write_crosses_boundary_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[2]; +} + +static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[1]; +} + +static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendDmaEngErrStatus + */ +static u64 access_sdma_header_request_fifo_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[23]; +} + +static u64 access_sdma_header_storage_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[22]; +} + +static u64 access_sdma_packet_tracking_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[21]; +} + +static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[20]; +} + +static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[19]; +} + +static u64 access_sdma_header_request_fifo_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[18]; +} + +static u64 access_sdma_header_storage_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[17]; +} + +static u64 access_sdma_packet_tracking_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[16]; +} + +static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[15]; +} + +static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[14]; +} + +static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[13]; +} + +static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[12]; +} + +static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[11]; +} + +static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[10]; +} + +static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[9]; +} + +static u64 access_sdma_packet_desc_overflow_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[8]; +} + +static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[7]; +} + +static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[6]; +} + +static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[5]; +} + +static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[4]; +} + +static u64 access_sdma_tail_out_of_bounds_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[3]; +} + +static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[2]; +} + +static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[1]; +} + +static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[0]; +} + +static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + u64 val = 0; + u64 csr = entry->csr; + + val = read_write_csr(dd, csr, mode, data); + if (mode == CNTR_MODE_R) { + val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ? + CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors; + } else if (mode == CNTR_MODE_W) { + dd->sw_rcv_bypass_packet_errors = 0; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + return val; +} + +#define def_access_sw_cpu(cntr) \ +static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr, \ + ppd->ibport_data.rvp.cntr, vl, \ + mode, data); \ +} + +def_access_sw_cpu(rc_acks); +def_access_sw_cpu(rc_qacks); +def_access_sw_cpu(rc_delayed_comp); + +#define def_access_ibp_counter(cntr) \ +static u64 access_ibp_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + \ + if (vl != CNTR_INVALID_VL) \ + return 0; \ + \ + return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr, \ + mode, data); \ +} + +def_access_ibp_counter(loop_pkts); +def_access_ibp_counter(rc_resends); +def_access_ibp_counter(rnr_naks); +def_access_ibp_counter(other_naks); +def_access_ibp_counter(rc_timeouts); +def_access_ibp_counter(pkt_drops); +def_access_ibp_counter(dmawait); +def_access_ibp_counter(rc_seqnak); +def_access_ibp_counter(rc_dupreq); +def_access_ibp_counter(rdma_seq); +def_access_ibp_counter(unaligned); +def_access_ibp_counter(seq_naks); +def_access_ibp_counter(rc_crwaits); + +static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { +[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH), +[C_RX_LEN_ERR] = RXE32_DEV_CNTR_ELEM(RxLenErr, RCV_LENGTH_ERR_CNT, CNTR_SYNTH), +[C_RX_SHORT_ERR] = RXE32_DEV_CNTR_ELEM(RxShrErr, RCV_SHORT_ERR_CNT, CNTR_SYNTH), +[C_RX_ICRC_ERR] = RXE32_DEV_CNTR_ELEM(RxICrcErr, RCV_ICRC_ERR_CNT, CNTR_SYNTH), +[C_RX_EBP] = RXE32_DEV_CNTR_ELEM(RxEbpCnt, RCV_EBP_CNT, CNTR_SYNTH), +[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs, + RCV_TID_FLOW_GEN_MISMATCH_CNT, + CNTR_NORMAL), +[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL, + CNTR_NORMAL), +[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs, + RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL), +[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt, + CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL), +[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT, + CNTR_NORMAL), +[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT, + CNTR_NORMAL), +[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT, + CNTR_NORMAL), +[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT, + CNTR_NORMAL), +[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt, + CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL), +[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt, + CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL), +[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH, + access_dc_rcv_err_cnt), +[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT, + CNTR_SYNTH), +[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT, + CNTR_SYNTH), +[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT, + CNTR_SYNTH), +[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts, + DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH), +[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts, + DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT, + CNTR_SYNTH), +[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr, + DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH), +[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT, + CNTR_SYNTH), +[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT, + CNTR_SYNTH), +[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH), +[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH), +[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT, + CNTR_SYNTH), +[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_TOTAL_CRC] = + DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR, + CNTR_SYNTH), +[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0, + CNTR_SYNTH), +[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1, + CNTR_SYNTH), +[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2, + CNTR_SYNTH), +[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3, + CNTR_SYNTH), +[C_DC_CRC_MULT_LN] = + DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN, + CNTR_SYNTH), +[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_SEQ_CRC_CNT] = + DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT, + CNTR_SYNTH), +[C_DC_ESC0_ONLY_CNT] = + DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS1_CNT] = + DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS2_CNT] = + DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT, + CNTR_SYNTH), +[C_DC_REINIT_FROM_PEER_CNT] = + DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT, + CNTR_SYNTH), +[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT, + CNTR_SYNTH), +[C_DC_MISC_FLG_CNT] = + DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT, + CNTR_SYNTH), +[C_DC_PRF_GOOD_LTP_CNT] = + DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH), +[C_DC_PRF_ACCEPTED_LTP_CNT] = + DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT, + CNTR_SYNTH), +[C_DC_PRF_RX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_TX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_CLK_CNTR] = + DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH), +[C_DC_PG_DBG_FLIT_CRDTS_CNT] = + DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH), +[C_DC_PG_STS_PAUSE_COMPLETE_CNT] = + DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT, + CNTR_SYNTH), +[C_DC_PG_STS_TX_SBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH), +[C_DC_PG_STS_TX_MBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT, + CNTR_SYNTH), +[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL, + access_sw_cpu_intr), +[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL, + access_sw_cpu_rcv_limit), +[C_SW_CTX0_SEQ_DROP] = CNTR_ELEM("SeqDrop0", 0, 0, CNTR_NORMAL, + access_sw_ctx0_seq_drop), +[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL, + access_sw_vtx_wait), +[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL, + access_sw_pio_wait), +[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL, + access_sw_pio_drain), +[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, + access_sw_kmem_wait), +[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL, + hfi1_access_sw_tid_wait), +[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL, + access_sw_send_schedule), +[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn", + SEND_DMA_DESC_FETCHED_CNT, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + dev_access_u32_csr), +[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_int_cnt), +[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_err_cnt), +[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_idle_int_cnt), +[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_progress_int_cnt), +/* MISC_ERR_STATUS */ +[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0, + CNTR_NORMAL, + access_misc_pll_lock_fail_err_cnt), +[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0, + CNTR_NORMAL, + access_misc_mbist_fail_err_cnt), +[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0, + CNTR_NORMAL, + access_misc_invalid_eep_cmd_err_cnt), +[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_done_parity_err_cnt), +[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_write_err_cnt), +[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0, + 0, CNTR_NORMAL, + access_misc_efuse_read_bad_addr_err_cnt), +[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_csr_parity_err_cnt), +[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0, + CNTR_NORMAL, + access_misc_fw_auth_failed_err_cnt), +[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0, + CNTR_NORMAL, + access_misc_key_mismatch_err_cnt), +[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0, + CNTR_NORMAL, + access_misc_sbus_write_failed_err_cnt), +[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_write_bad_addr_err_cnt), +[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_read_bad_addr_err_cnt), +[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_parity_err_cnt), +/* CceErrStatus */ +[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0, + CNTR_NORMAL, + access_sw_cce_err_status_aggregated_cnt), +[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_csr_parity_err_cnt), +[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0, + CNTR_NORMAL, + access_cce_int_map_unc_err_cnt), +[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0, + CNTR_NORMAL, + access_cce_int_map_cor_err_cnt), +[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_table_unc_err_cnt), +[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_table_cor_err_cnt), +[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_rxdma_conv_fifo_parity_err_cnt), +[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_rcpl_async_fifo_parity_err_cnt), +[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_seg_write_bad_addr_err_cnt), +[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_seg_read_bad_addr_err_cnt), +[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0, + CNTR_NORMAL, + access_la_triggered_cnt), +[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0, + CNTR_NORMAL, + access_cce_trgt_cpl_timeout_err_cnt), +[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_receive_parity_err_cnt), +[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_transmit_back_parity_err_cnt), +[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0, + 0, CNTR_NORMAL, + access_pcic_transmit_front_parity_err_cnt), +[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_dat_q_unc_err_cnt), +[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_hd_q_unc_err_cnt), +[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_dat_q_unc_err_cnt), +[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_hd_q_unc_err_cnt), +[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_sot_mem_unc_err_cnt), +[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_mem_unc_err), +[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_n_post_dat_q_parity_err_cnt), +[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_n_post_h_q_parity_err_cnt), +[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_dat_q_cor_err_cnt), +[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_hd_q_cor_err_cnt), +[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_dat_q_cor_err_cnt), +[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_hd_q_cor_err_cnt), +[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_sot_mem_cor_err_cnt), +[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_mem_cor_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoDbgParityError", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_dbg_parity_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoRxdmaParityError", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_rxdma_parity_err_cnt + ), +[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoSdmaHdParityErr", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoPioCrdtParityErr", 0, 0, + CNTR_NORMAL, + access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt), +[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_cli2_async_fifo_parity_err_cnt), +[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_cfg_bus_parity_err_cnt), +[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_cli0_async_fifo_parity_err_cnt), +[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0, + CNTR_NORMAL, + access_cce_rspd_data_parity_err_cnt), +[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0, + CNTR_NORMAL, + access_cce_trgt_access_err_cnt), +[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_trgt_async_fifo_parity_err_cnt), +[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_write_bad_addr_err_cnt), +[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_read_bad_addr_err_cnt), +[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0, + CNTR_NORMAL, + access_ccs_csr_parity_err_cnt), + +/* RcvErrStatus */ +[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_parity_err_cnt), +[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_write_bad_addr_err_cnt), +[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_read_bad_addr_err_cnt), +[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_unc_err_cnt), +[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_dq_fsm_encoding_err_cnt), +[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_eq_fsm_encoding_err_cnt), +[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_parity_err_cnt), +[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_data_cor_err_cnt), +[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_data_unc_err_cnt), +[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_data_fifo_rd_cor_err_cnt), +[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_data_fifo_rd_unc_err_cnt), +[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_hdr_fifo_rd_cor_err_cnt), +[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_hdr_fifo_rd_unc_err_cnt), +[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part2_cor_err_cnt), +[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part2_unc_err_cnt), +[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part1_cor_err_cnt), +[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part1_unc_err_cnt), +[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0, + CNTR_NORMAL, + access_rx_hq_intr_fsm_err_cnt), +[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_hq_intr_csr_parity_err_cnt), +[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_csr_parity_err_cnt), +[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_rcv_array_cor_err_cnt), +[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_rcv_array_unc_err_cnt), +[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0, + 0, CNTR_NORMAL, + access_rx_lookup_des_part2_parity_err_cnt), +[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0, + 0, CNTR_NORMAL, + access_rx_lookup_des_part1_unc_cor_err_cnt), +[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_des_part1_unc_err_cnt), +[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_next_free_buf_cor_err_cnt), +[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_next_free_buf_unc_err_cnt), +[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM( + "RxRbufFlInitWrAddrParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_fl_init_wr_addr_parity_err_cnt), +[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_fl_initdone_parity_err_cnt), +[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_fl_write_addr_parity_err_cnt), +[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_fl_rd_addr_parity_err_cnt), +[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_empty_err_cnt), +[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_full_err_cnt), +[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0, + CNTR_NORMAL, + access_rbuf_bad_lookup_err_cnt), +[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_ctx_id_parity_err_cnt), +[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_csr_qeopdw_parity_err_cnt), +[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQNumOfPktParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt), +[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQTlPtrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt), +[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt), +[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_vld_bit_parity_err_cnt), +[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr", + 0, 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_next_buf_parity_err_cnt), +[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt), +[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQHeadBufNumParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt), +[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_block_list_read_cor_err_cnt), +[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_block_list_read_unc_err_cnt), +[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_cor_err_cnt), +[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_unc_err_cnt), +[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM( + "RxRbufLookupDesRegUncCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt), +[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_reg_unc_err_cnt), +[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_free_list_cor_err_cnt), +[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_free_list_unc_err_cnt), +[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_fsm_encoding_err_cnt), +[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_flag_cor_err_cnt), +[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_flag_unc_err_cnt), +[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dc_sop_eop_parity_err_cnt), +[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_csr_parity_err_cnt), +[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_qp_map_table_cor_err_cnt), +[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_qp_map_table_unc_err_cnt), +[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_data_cor_err_cnt), +[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_data_unc_err_cnt), +[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_hdr_cor_err_cnt), +[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_hdr_unc_err_cnt), +[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dc_intf_parity_err_cnt), +[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_cor_err_cnt), +/* SendPioErrStatus */ +[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pec_sop_head_parity_err_cnt), +[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pcc_sop_head_parity_err_cnt), +[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr", + 0, 0, CNTR_NORMAL, + access_pio_last_returned_cnt_parity_err_cnt), +[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0, + 0, CNTR_NORMAL, + access_pio_current_free_cnt_parity_err_cnt), +[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0, + CNTR_NORMAL, + access_pio_reserved_31_err_cnt), +[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0, + CNTR_NORMAL, + access_pio_reserved_30_err_cnt), +[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_sop_len_err_cnt), +[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_bqc_mem_parity_err_cnt), +[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vl_fifo_parity_err_cnt), +[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vlf_sop_parity_err_cnt), +[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vlf_v1_len_parity_err_cnt), +[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0, + CNTR_NORMAL, + access_pio_block_qw_count_parity_err_cnt), +[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_qw_valid_parity_err_cnt), +[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0, + CNTR_NORMAL, + access_pio_state_machine_err_cnt), +[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_data_parity_err_cnt), +[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0, + CNTR_NORMAL, + access_pio_host_addr_mem_cor_err_cnt), +[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0, + CNTR_NORMAL, + access_pio_host_addr_mem_unc_err_cnt), +[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0, + CNTR_NORMAL, + access_pio_pkt_evict_sm_or_arb_sm_err_cnt), +[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0, + CNTR_NORMAL, + access_pio_init_sm_in_err_cnt), +[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_pbl_fifo_err_cnt), +[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0, + 0, CNTR_NORMAL, + access_pio_credit_ret_fifo_parity_err_cnt), +[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank1_cor_err_cnt), +[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank0_cor_err_cnt), +[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank1_unc_err_cnt), +[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank0_unc_err_cnt), +[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sm_pkt_reset_parity_err_cnt), +[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pkt_evict_fifo_parity_err_cnt), +[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM( + "PioSbrdctrlCrrelFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sbrdctrl_crrel_fifo_parity_err_cnt), +[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sbrdctl_crrel_parity_err_cnt), +[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pec_fifo_parity_err_cnt), +[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pcc_fifo_parity_err_cnt), +[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0, + CNTR_NORMAL, + access_pio_sb_mem_fifo1_err_cnt), +[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0, + CNTR_NORMAL, + access_pio_sb_mem_fifo0_err_cnt), +[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0, + CNTR_NORMAL, + access_pio_csr_parity_err_cnt), +[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_addr_parity_err_cnt), +[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0, + CNTR_NORMAL, + access_pio_write_bad_ctxt_err_cnt), +/* SendDmaErrStatus */ +[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0, + 0, CNTR_NORMAL, + access_sdma_pcie_req_tracking_cor_err_cnt), +[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0, + 0, CNTR_NORMAL, + access_sdma_pcie_req_tracking_unc_err_cnt), +[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0, + CNTR_NORMAL, + access_sdma_csr_parity_err_cnt), +[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0, + CNTR_NORMAL, + access_sdma_rpy_tag_err_cnt), +/* SendEgressErrStatus */ +[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_csr_unc_err_cnt), +[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0, + 0, CNTR_NORMAL, + access_tx_read_sdma_memory_csr_err_cnt), +[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifo_cor_err_cnt), +[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_cor_err_cnt), +[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0, + CNTR_NORMAL, + access_tx_read_sdma_memory_cor_err_cnt), +[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0, + CNTR_NORMAL, + access_tx_sb_hdr_cor_err_cnt), +[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_overrun_err_cnt), +[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo8_cor_err_cnt), +[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo7_cor_err_cnt), +[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo6_cor_err_cnt), +[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo5_cor_err_cnt), +[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo4_cor_err_cnt), +[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo3_cor_err_cnt), +[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo2_cor_err_cnt), +[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo1_cor_err_cnt), +[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo0_cor_err_cnt), +[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_return_vl_err_cnt), +[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0, + CNTR_NORMAL, + access_tx_hcrc_insertion_err_cnt), +[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifo_unc_err_cnt), +[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_unc_err_cnt), +[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_sdma_memory_unc_err_cnt), +[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0, + CNTR_NORMAL, + access_tx_sb_hdr_unc_err_cnt), +[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_return_partiy_err_cnt), +[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo8_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo7_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo6_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo5_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo4_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo3_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo2_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo1_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo0_unc_or_parity_err_cnt), +[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma15_disallowed_packet_err_cnt), +[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma14_disallowed_packet_err_cnt), +[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma13_disallowed_packet_err_cnt), +[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma12_disallowed_packet_err_cnt), +[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma11_disallowed_packet_err_cnt), +[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma10_disallowed_packet_err_cnt), +[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma9_disallowed_packet_err_cnt), +[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma8_disallowed_packet_err_cnt), +[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma7_disallowed_packet_err_cnt), +[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma6_disallowed_packet_err_cnt), +[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma5_disallowed_packet_err_cnt), +[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma4_disallowed_packet_err_cnt), +[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma3_disallowed_packet_err_cnt), +[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma2_disallowed_packet_err_cnt), +[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma1_disallowed_packet_err_cnt), +[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma0_disallowed_packet_err_cnt), +[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0, + CNTR_NORMAL, + access_tx_config_parity_err_cnt), +[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_sbrd_ctl_csr_parity_err_cnt), +[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_csr_parity_err_cnt), +[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0, + CNTR_NORMAL, + access_tx_illegal_vl_err_cnt), +[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM( + "TxSbrdCtlStateMachineParityErr", 0, 0, + CNTR_NORMAL, + access_tx_sbrd_ctl_state_machine_parity_err_cnt), +[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0, + CNTR_NORMAL, + access_egress_reserved_10_err_cnt), +[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0, + CNTR_NORMAL, + access_egress_reserved_9_err_cnt), +[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma_launch_intf_parity_err_cnt), +[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0, + CNTR_NORMAL, + access_tx_pio_launch_intf_parity_err_cnt), +[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0, + CNTR_NORMAL, + access_egress_reserved_6_err_cnt), +[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0, + CNTR_NORMAL, + access_tx_incorrect_link_state_err_cnt), +[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0, + CNTR_NORMAL, + access_tx_linkdown_err_cnt), +[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM( + "EgressFifoUnderrunOrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifi_underrun_or_parity_err_cnt), +[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0, + CNTR_NORMAL, + access_egress_reserved_2_err_cnt), +[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0, + CNTR_NORMAL, + access_tx_pkt_integrity_mem_unc_err_cnt), +[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0, + CNTR_NORMAL, + access_tx_pkt_integrity_mem_cor_err_cnt), +/* SendErrStatus */ +[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_send_csr_write_bad_addr_err_cnt), +[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_send_csr_read_bad_addr_err_cnt), +[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0, + CNTR_NORMAL, + access_send_csr_parity_cnt), +/* SendCtxtErrStatus */ +[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0, + CNTR_NORMAL, + access_pio_write_out_of_bounds_err_cnt), +[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0, + CNTR_NORMAL, + access_pio_write_overflow_err_cnt), +[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr", + 0, 0, CNTR_NORMAL, + access_pio_write_crosses_boundary_err_cnt), +[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0, + CNTR_NORMAL, + access_pio_disallowed_packet_err_cnt), +[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0, + CNTR_NORMAL, + access_pio_inconsistent_sop_err_cnt), +/* SendDmaEngErrStatus */ +[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr", + 0, 0, CNTR_NORMAL, + access_sdma_header_request_fifo_cor_err_cnt), +[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_storage_cor_err_cnt), +[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_tracking_cor_err_cnt), +[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_assembly_cor_err_cnt), +[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_desc_table_cor_err_cnt), +[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr", + 0, 0, CNTR_NORMAL, + access_sdma_header_request_fifo_unc_err_cnt), +[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_storage_unc_err_cnt), +[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_tracking_unc_err_cnt), +[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_assembly_unc_err_cnt), +[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_desc_table_unc_err_cnt), +[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0, + CNTR_NORMAL, + access_sdma_timeout_err_cnt), +[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_length_err_cnt), +[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_address_err_cnt), +[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_select_err_cnt), +[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0, + CNTR_NORMAL, + access_sdma_reserved_9_err_cnt), +[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_desc_overflow_err_cnt), +[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0, + CNTR_NORMAL, + access_sdma_length_mismatch_err_cnt), +[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0, + CNTR_NORMAL, + access_sdma_halt_err_cnt), +[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0, + CNTR_NORMAL, + access_sdma_mem_read_err_cnt), +[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0, + CNTR_NORMAL, + access_sdma_first_desc_err_cnt), +[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0, + CNTR_NORMAL, + access_sdma_tail_out_of_bounds_err_cnt), +[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0, + CNTR_NORMAL, + access_sdma_too_long_err_cnt), +[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0, + CNTR_NORMAL, + access_sdma_gen_mismatch_err_cnt), +[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0, + CNTR_NORMAL, + access_sdma_wrong_dw_err_cnt), +}; + +static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = { +[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT, + CNTR_NORMAL), +[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT, + CNTR_NORMAL), +[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT, + CNTR_NORMAL), +[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT, + CNTR_NORMAL), +[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT, + CNTR_NORMAL), +[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL), +[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL), +[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH), +[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL), +[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL), +[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_dn_cnt), +[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_up_cnt), +[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL, + access_sw_unknown_frame_cnt), +[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_xmit_discards), +[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0, + CNTR_SYNTH | CNTR_32BIT | CNTR_VL, + access_sw_xmit_discards), +[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH, + access_xmit_constraint_errs), +[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH, + access_rcv_constraint_errs), +[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts), +[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends), +[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks), +[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks), +[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts), +[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops), +[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait), +[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak), +[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq), +[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq), +[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned), +[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks), +[C_SW_IBP_RC_CRWAITS] = SW_IBP_CNTR(RcCrWait, rc_crwaits), +[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_acks), +[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_qacks), +[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_delayed_comp), +[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1), +[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3), +[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5), +[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7), +[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9), +[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11), +[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13), +[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15), +[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17), +[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19), +[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21), +[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23), +[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25), +[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27), +[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29), +[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31), +[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33), +[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35), +[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37), +[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39), +[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41), +[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43), +[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45), +[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47), +[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49), +[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51), +[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53), +[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55), +[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57), +[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59), +[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61), +[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63), +[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65), +[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67), +[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69), +[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71), +[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73), +[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75), +[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77), +[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79), +[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81), +[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83), +[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85), +[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87), +[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89), +[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91), +[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93), +[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95), +[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97), +[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99), +[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101), +[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103), +[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105), +[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107), +[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109), +[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111), +[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113), +[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115), +[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117), +[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119), +[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121), +[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123), +[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125), +[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127), +[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129), +[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131), +[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133), +[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135), +[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137), +[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139), +[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141), +[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143), +[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145), +[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147), +[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149), +[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151), +[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153), +[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155), +[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157), +[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159), +}; + +/* ======================================================================== */ + +/* return true if this is chip revision revision a */ +int is_ax(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xf0) == 0; +} + +/* return true if this is chip revision revision b */ +int is_bx(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xF0) == 0x10; +} + +/* return true is kernel urg disabled for rcd */ +bool is_urg_masked(struct hfi1_ctxtdata *rcd) +{ + u64 mask; + u32 is = IS_RCVURGENT_START + rcd->ctxt; + u8 bit = is % 64; + + mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64))); + return !(mask & BIT_ULL(bit)); +} + +/* + * Append string s to buffer buf. Arguments curp and len are the current + * position and remaining length, respectively. + * + * return 0 on success, 1 on out of room + */ +static int append_str(char *buf, char **curp, int *lenp, const char *s) +{ + char *p = *curp; + int len = *lenp; + int result = 0; /* success */ + char c; + + /* add a comma, if first in the buffer */ + if (p != buf) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = ','; + len--; + } + + /* copy the string */ + while ((c = *s++) != 0) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = c; + len--; + } + +done: + /* write return values */ + *curp = p; + *lenp = len; + + return result; +} + +/* + * Using the given flag table, print a comma separated string into + * the buffer. End in '*' if the buffer is too short. + */ +static char *flag_string(char *buf, int buf_len, u64 flags, + struct flag_table *table, int table_size) +{ + char extra[32]; + char *p = buf; + int len = buf_len; + int no_room = 0; + int i; + + /* make sure there is at least 2 so we can form "*" */ + if (len < 2) + return ""; + + len--; /* leave room for a nul */ + for (i = 0; i < table_size; i++) { + if (flags & table[i].flag) { + no_room = append_str(buf, &p, &len, table[i].str); + if (no_room) + break; + flags &= ~table[i].flag; + } + } + + /* any undocumented bits left? */ + if (!no_room && flags) { + snprintf(extra, sizeof(extra), "bits 0x%llx", flags); + no_room = append_str(buf, &p, &len, extra); + } + + /* add * if ran out of room */ + if (no_room) { + /* may need to back up to add space for a '*' */ + if (len == 0) + --p; + *p++ = '*'; + } + + /* add final nul - space already allocated above */ + *p = 0; + return buf; +} + +/* first 8 CCE error interrupt source names */ +static const char * const cce_misc_names[] = { + "CceErrInt", /* 0 */ + "RxeErrInt", /* 1 */ + "MiscErrInt", /* 2 */ + "Reserved3", /* 3 */ + "PioErrInt", /* 4 */ + "SDmaErrInt", /* 5 */ + "EgressErrInt", /* 6 */ + "TxeErrInt" /* 7 */ +}; + +/* + * Return the miscellaneous error interrupt name. + */ +static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(cce_misc_names)) + strncpy(buf, cce_misc_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", + source + IS_GENERAL_ERR_START); + + return buf; +} + +/* + * Return the SDMA engine error interrupt name. + */ +static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SDmaEngErrInt%u", source); + return buf; +} + +/* + * Return the send context error interrupt name. + */ +static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCtxtErrInt%u", source); + return buf; +} + +static const char * const various_names[] = { + "PbcInt", + "GpioAssertInt", + "Qsfp1Int", + "Qsfp2Int", + "TCritInt" +}; + +/* + * Return the various interrupt name. + */ +static char *is_various_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(various_names)) + strncpy(buf, various_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START); + return buf; +} + +/* + * Return the DC interrupt name. + */ +static char *is_dc_name(char *buf, size_t bsize, unsigned int source) +{ + static const char * const dc_int_names[] = { + "common", + "lcb", + "8051", + "lbm" /* local block merge */ + }; + + if (source < ARRAY_SIZE(dc_int_names)) + snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]); + else + snprintf(buf, bsize, "DCInt%u", source); + return buf; +} + +static const char * const sdma_int_names[] = { + "SDmaInt", + "SdmaIdleInt", + "SdmaProgressInt", +}; + +/* + * Return the SDMA engine interrupt name. + */ +static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + + if (likely(what < 3)) + snprintf(buf, bsize, "%s%u", sdma_int_names[what], which); + else + snprintf(buf, bsize, "Invalid SDMA interrupt %u", source); + return buf; +} + +/* + * Return the receive available interrupt name. + */ +static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvAvailInt%u", source); + return buf; +} + +/* + * Return the receive urgent interrupt name. + */ +static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvUrgentInt%u", source); + return buf; +} + +/* + * Return the send credit interrupt name. + */ +static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCreditInt%u", source); + return buf; +} + +/* + * Return the reserved interrupt name. + */ +static char *is_reserved_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START); + return buf; +} + +static char *cce_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + cce_err_status_flags, + ARRAY_SIZE(cce_err_status_flags)); +} + +static char *rxe_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + rxe_err_status_flags, + ARRAY_SIZE(rxe_err_status_flags)); +} + +static char *misc_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, misc_err_status_flags, + ARRAY_SIZE(misc_err_status_flags)); +} + +static char *pio_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + pio_err_status_flags, + ARRAY_SIZE(pio_err_status_flags)); +} + +static char *sdma_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sdma_err_status_flags, + ARRAY_SIZE(sdma_err_status_flags)); +} + +static char *egress_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_status_flags, + ARRAY_SIZE(egress_err_status_flags)); +} + +static char *egress_err_info_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_info_flags, + ARRAY_SIZE(egress_err_info_flags)); +} + +static char *send_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + send_err_status_flags, + ARRAY_SIZE(send_err_status_flags)); +} + +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + /* + * For most these errors, there is nothing that can be done except + * report or record it. + */ + dd_dev_info(dd, "CCE Error: %s\n", + cce_err_status_string(buf, sizeof(buf), reg)); + + if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) && + is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) { + /* this error requires a manual drop into SPC freeze mode */ + /* then a fix up */ + start_freeze_handling(dd->pport, FREEZE_SELF); + } + + for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) { + incr_cntr64(&dd->cce_err_status_cnt[i]); + /* maintain a counter over all cce_err_status errors */ + incr_cntr64(&dd->sw_cce_err_status_aggregate); + } + } +} + +/* + * Check counters for receive errors that do not have an interrupt + * associated with them. + */ +#define RCVERR_CHECK_TIME 10 +static void update_rcverr_timer(struct timer_list *t) +{ + struct hfi1_devdata *dd = from_timer(dd, t, rcverr_timer); + struct hfi1_pportdata *ppd = dd->pport; + u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + + if (dd->rcv_ovfl_cnt < cur_ovfl_cnt && + ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) { + dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); + set_link_down_reason( + ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, + OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); + queue_work(ppd->link_wq, &ppd->link_bounce_work); + } + dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt; + + mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static int init_rcverr(struct hfi1_devdata *dd) +{ + timer_setup(&dd->rcverr_timer, update_rcverr_timer, 0); + /* Assume the hardware counter has been reset */ + dd->rcv_ovfl_cnt = 0; + return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static void free_rcverr(struct hfi1_devdata *dd) +{ + if (dd->rcverr_timer.function) + del_timer_sync(&dd->rcverr_timer); +} + +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Receive Error: %s\n", + rxe_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_RXE_FREEZE_ERR) { + int flags = 0; + + /* + * Freeze mode recovery is disabled for the errors + * in RXE_FREEZE_ABORT_MASK + */ + if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK)) + flags = FREEZE_ABORT; + + start_freeze_handling(dd->pport, flags); + } + + for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->rcv_err_status_cnt[i]); + } +} + +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Misc Error: %s", + misc_err_status_string(buf, sizeof(buf), reg)); + for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->misc_err_status_cnt[i]); + } +} + +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "PIO Error: %s\n", + pio_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_PIO_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + + for (i = 0; i < NUM_SEND_PIO_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_pio_err_status_cnt[i]); + } +} + +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "SDMA Error: %s\n", + sdma_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_SDMA_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + + for (i = 0; i < NUM_SEND_DMA_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_dma_err_status_cnt[i]); + } +} + +static inline void __count_port_discards(struct hfi1_pportdata *ppd) +{ + incr_cntr64(&ppd->port_xmit_discards); +} + +static void count_port_inactive(struct hfi1_devdata *dd) +{ + __count_port_discards(dd->pport); +} + +/* + * We have had a "disallowed packet" error during egress. Determine the + * integrity check which failed, and update relevant error counter, etc. + * + * Note that the SEND_EGRESS_ERR_INFO register has only a single + * bit of state per integrity check, and so we can miss the reason for an + * egress error if more than one packet fails the same integrity check + * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO. + */ +static void handle_send_egress_err_info(struct hfi1_devdata *dd, + int vl) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */ + u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO); + char buf[96]; + + /* clear down all observed info as quickly as possible after read */ + write_csr(dd, SEND_EGRESS_ERR_INFO, info); + + dd_dev_info(dd, + "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n", + info, egress_err_info_string(buf, sizeof(buf), info), src); + + /* Eventually add other counters for each bit */ + if (info & PORT_DISCARD_EGRESS_ERRS) { + int weight, i; + + /* + * Count all applicable bits as individual errors and + * attribute them to the packet that triggered this handler. + * This may not be completely accurate due to limitations + * on the available hardware error information. There is + * a single information register and any number of error + * packets may have occurred and contributed to it before + * this routine is called. This means that: + * a) If multiple packets with the same error occur before + * this routine is called, earlier packets are missed. + * There is only a single bit for each error type. + * b) Errors may not be attributed to the correct VL. + * The driver is attributing all bits in the info register + * to the packet that triggered this call, but bits + * could be an accumulation of different packets with + * different VLs. + * c) A single error packet may have multiple counts attached + * to it. There is no way for the driver to know if + * multiple bits set in the info register are due to a + * single packet or multiple packets. The driver assumes + * multiple packets. + */ + weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS); + for (i = 0; i < weight; i++) { + __count_port_discards(ppd); + if (vl >= 0 && vl < TXE_NUM_DATA_VL) + incr_cntr64(&ppd->port_xmit_discards_vl[vl]); + else if (vl == 15) + incr_cntr64(&ppd->port_xmit_discards_vl + [C_VL_15]); + } + } +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'port inactive' error? + */ +static inline int port_inactive_err(u64 posn) +{ + return (posn >= SEES(TX_LINKDOWN) && + posn <= SEES(TX_INCORRECT_LINK_STATE)); +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'disallowed packet' error? + */ +static inline int disallowed_pkt_err(int posn) +{ + return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) && + posn <= SEES(TX_SDMA15_DISALLOWED_PACKET)); +} + +/* + * Input value is a bit position of one of the SDMA engine disallowed + * packet errors. Return which engine. Use of this must be guarded by + * disallowed_pkt_err(). + */ +static inline int disallowed_pkt_engine(int posn) +{ + return posn - SEES(TX_SDMA0_DISALLOWED_PACKET); +} + +/* + * Translate an SDMA engine to a VL. Return -1 if the tranlation cannot + * be done. + */ +static int engine_to_vl(struct hfi1_devdata *dd, int engine) +{ + struct sdma_vl_map *m; + int vl; + + /* range check */ + if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES) + return -1; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + vl = m->engine_to_vl[engine]; + rcu_read_unlock(); + + return vl; +} + +/* + * Translate the send context (sofware index) into a VL. Return -1 if the + * translation cannot be done. + */ +static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) +{ + struct send_context_info *sci; + struct send_context *sc; + int i; + + sci = &dd->send_contexts[sw_index]; + + /* there is no information for user (PSM) and ack contexts */ + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) + return -1; + + sc = sci->sc; + if (!sc) + return -1; + if (dd->vld[15].sc == sc) + return 15; + for (i = 0; i < num_vls; i++) + if (dd->vld[i].sc == sc) + return i; + + return -1; +} + +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 reg_copy = reg, handled = 0; + char buf[96]; + int i = 0; + + if (reg & ALL_TXE_EGRESS_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + else if (is_ax(dd) && + (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) && + (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) + start_freeze_handling(dd->pport, 0); + + while (reg_copy) { + int posn = fls64(reg_copy); + /* fls64() returns a 1-based offset, we want it zero based */ + int shift = posn - 1; + u64 mask = 1ULL << shift; + + if (port_inactive_err(shift)) { + count_port_inactive(dd); + handled |= mask; + } else if (disallowed_pkt_err(shift)) { + int vl = engine_to_vl(dd, disallowed_pkt_engine(shift)); + + handle_send_egress_err_info(dd, vl); + handled |= mask; + } + reg_copy &= ~mask; + } + + reg &= ~handled; + + if (reg) + dd_dev_info(dd, "Egress Error: %s\n", + egress_err_status_string(buf, sizeof(buf), reg)); + + for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_egress_err_status_cnt[i]); + } +} + +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Send Error: %s\n", + send_err_status_string(buf, sizeof(buf), reg)); + + for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_err_status_cnt[i]); + } +} + +/* + * The maximum number of times the error clear down will loop before + * blocking a repeating error. This value is arbitrary. + */ +#define MAX_CLEAR_COUNT 20 + +/* + * Clear and handle an error register. All error interrupts are funneled + * through here to have a central location to correctly handle single- + * or multi-shot errors. + * + * For non per-context registers, call this routine with a context value + * of 0 so the per-context offset is zero. + * + * If the handler loops too many times, assume that something is wrong + * and can't be fixed, so mask the error bits. + */ +static void interrupt_clear_down(struct hfi1_devdata *dd, + u32 context, + const struct err_reg_info *eri) +{ + u64 reg; + u32 count; + + /* read in a loop until no more errors are seen */ + count = 0; + while (1) { + reg = read_kctxt_csr(dd, context, eri->status); + if (reg == 0) + break; + write_kctxt_csr(dd, context, eri->clear, reg); + if (likely(eri->handler)) + eri->handler(dd, context, reg); + count++; + if (count > MAX_CLEAR_COUNT) { + u64 mask; + + dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n", + eri->desc, reg); + /* + * Read-modify-write so any other masked bits + * remain masked. + */ + mask = read_kctxt_csr(dd, context, eri->mask); + mask &= ~reg; + write_kctxt_csr(dd, context, eri->mask, mask); + break; + } + } +} + +/* + * CCE block "misc" interrupt. Source is < 16. + */ +static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &misc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else { + dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n", + source); + } +} + +static char *send_context_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sc_err_status_flags, + ARRAY_SIZE(sc_err_status_flags)); +} + +/* + * Send context error interrupt. Source (hw_context) is < 160. + * + * All send context errors cause the send context to halt. The normal + * clear-down mechanism cannot be used because we cannot clear the + * error bits until several other long-running items are done first. + * This is OK because with the context halted, nothing else is going + * to happen on it anyway. + */ +static void is_sendctxt_err_int(struct hfi1_devdata *dd, + unsigned int hw_context) +{ + struct send_context_info *sci; + struct send_context *sc; + char flags[96]; + u64 status; + u32 sw_index; + int i = 0; + unsigned long irq_flags; + + sw_index = dd->hw_to_sw[hw_context]; + if (sw_index >= dd->num_send_contexts) { + dd_dev_err(dd, + "out of range sw index %u for send context %u\n", + sw_index, hw_context); + return; + } + sci = &dd->send_contexts[sw_index]; + spin_lock_irqsave(&dd->sc_lock, irq_flags); + sc = sci->sc; + if (!sc) { + dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__, + sw_index, hw_context); + spin_unlock_irqrestore(&dd->sc_lock, irq_flags); + return; + } + + /* tell the software that a halt has begun */ + sc_stop(sc, SCF_HALTED); + + status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS); + + dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context, + send_context_err_status_string(flags, sizeof(flags), + status)); + + if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK) + handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index)); + + /* + * Automatically restart halted kernel contexts out of interrupt + * context. User contexts must ask the driver to restart the context. + */ + if (sc->type != SC_USER) + queue_work(dd->pport->hfi1_wq, &sc->halt_work); + spin_unlock_irqrestore(&dd->sc_lock, irq_flags); + + /* + * Update the counters for the corresponding status bits. + * Note that these particular counters are aggregated over all + * 160 contexts. + */ + for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) { + if (status & (1ull << i)) + incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]); + } +} + +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int source, u64 status) +{ + struct sdma_engine *sde; + int i = 0; + + sde = &dd->per_sdma[source]; +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n", + sde->this_idx, source, (unsigned long long)status); +#endif + sde->err_cnt++; + sdma_engine_error(sde, status); + + /* + * Update the counters for the corresponding status bits. + * Note that these particular counters are aggregated over + * all 16 DMA engines. + */ + for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) { + if (status & (1ull << i)) + incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]); + } +} + +/* + * CCE block SDMA error interrupt. Source is < 16. + */ +static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source) +{ +#ifdef CONFIG_SDMA_VERBOSITY + struct sdma_engine *sde = &dd->per_sdma[source]; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx, + source); + sdma_dumpstate(sde); +#endif + interrupt_clear_down(dd, source, &sdma_eng_err); +} + +/* + * CCE block "various" interrupt. Source is < 8. + */ +static void is_various_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &various_err[source]; + + /* + * TCritInt cannot go through interrupt_clear_down() + * because it is not a second tier interrupt. The handler + * should be called directly. + */ + if (source == TCRIT_INT_SOURCE) + handle_temp_err(dd); + else if (eri->handler) + interrupt_clear_down(dd, 0, eri); + else + dd_dev_info(dd, + "%s: Unimplemented/reserved interrupt %d\n", + __func__, source); +} + +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) +{ + /* src_ctx is always zero */ + struct hfi1_pportdata *ppd = dd->pport; + unsigned long flags; + u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + + if (reg & QSFP_HFI0_MODPRST_N) { + if (!qsfp_mod_present(ppd)) { + dd_dev_info(dd, "%s: QSFP module removed\n", + __func__); + + ppd->driver_link_ready = 0; + /* + * Cable removed, reset all our information about the + * cache and cable capabilities + */ + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + /* + * We don't set cache_refresh_required here as we expect + * an interrupt when a cable is inserted + */ + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.reset_needed = 0; + ppd->qsfp_info.limiting_active = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + /* Invert the ModPresent pin now to detect plug-in */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + if ((ppd->offline_disabled_reason > + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) || + (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + + if (ppd->host_link_state == HLS_DN_POLL) { + /* + * The link is still in POLL. This means + * that the normal link down processing + * will not happen. We have to do it here + * before turning the DC off. + */ + queue_work(ppd->link_wq, &ppd->link_down_work); + } + } else { + dd_dev_info(dd, "%s: QSFP module inserted\n", + __func__); + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.cache_refresh_required = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + + /* + * Stop inversion of ModPresent pin to detect + * removal of the cable + */ + qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + } + } + + if (reg & QSFP_HFI0_INT_N) { + dd_dev_info(dd, "%s: Interrupt received from QSFP module\n", + __func__); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + } + + /* Schedule the QSFP work only if there is a cable attached. */ + if (qsfp_mod_present(ppd)) + queue_work(ppd->link_wq, &ppd->qsfp_info.qsfp_work); +} + +static int request_host_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_REQUEST_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); + if (ret != HCMD_SUCCESS && !(dd->flags & HFI1_SHUTDOWN)) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +static int request_8051_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_GRANT_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +/* + * Set the LCB selector - allow host access. The DCC selector always + * points to the host. + */ +static inline void set_host_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK | + DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK); +} + +/* + * Clear the LCB selector - allow 8051 access. The DCC selector always + * points to the host. + */ +static inline void set_8051_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK); +} + +/* + * Acquire LCB access from the 8051. If the host already has access, + * just increment a counter. Otherwise, inform the 8051 that the + * host is taking access. + * + * Returns: + * 0 on success + * -EBUSY if the 8051 has control and cannot be disturbed + * -errno if unable to acquire access from the 8051 + */ +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + struct hfi1_pportdata *ppd = dd->pport; + int ret = 0; + + /* + * Use the host link state lock so the operation of this routine + * { link state check, selector change, count increment } can occur + * as a unit against a link state change. Otherwise there is a + * race between the state change and the count increment. + */ + if (sleep_ok) { + mutex_lock(&ppd->hls_lock); + } else { + while (!mutex_trylock(&ppd->hls_lock)) + udelay(1); + } + + /* this access is valid only when the link is up */ + if (ppd->host_link_state & HLS_DOWN) { + dd_dev_info(dd, "%s: link state %s not up\n", + __func__, link_state_name(ppd->host_link_state)); + ret = -EBUSY; + goto done; + } + + if (dd->lcb_access_count == 0) { + ret = request_host_lcb_access(dd); + if (ret) { + if (!(dd->flags & HFI1_SHUTDOWN)) + dd_dev_err(dd, + "%s: unable to acquire LCB access, err %d\n", + __func__, ret); + goto done; + } + set_host_lcb_access(dd); + } + dd->lcb_access_count++; +done: + mutex_unlock(&ppd->hls_lock); + return ret; +} + +/* + * Release LCB access by decrementing the use count. If the count is moving + * from 1 to 0, inform 8051 that it has control back. + * + * Returns: + * 0 on success + * -errno if unable to release access to the 8051 + */ +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + int ret = 0; + + /* + * Use the host link state lock because the acquire needed it. + * Here, we only need to keep { selector change, count decrement } + * as a unit. + */ + if (sleep_ok) { + mutex_lock(&dd->pport->hls_lock); + } else { + while (!mutex_trylock(&dd->pport->hls_lock)) + udelay(1); + } + + if (dd->lcb_access_count == 0) { + dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n", + __func__); + goto done; + } + + if (dd->lcb_access_count == 1) { + set_8051_lcb_access(dd); + ret = request_8051_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to release LCB access, err %d\n", + __func__, ret); + /* restore host access if the grant didn't work */ + set_host_lcb_access(dd); + goto done; + } + } + dd->lcb_access_count--; +done: + mutex_unlock(&dd->pport->hls_lock); + return ret; +} + +/* + * Initialize LCB access variables and state. Called during driver load, + * after most of the initialization is finished. + * + * The DC default is LCB access on for the host. The driver defaults to + * leaving access to the 8051. Assign access now - this constrains the call + * to this routine to be after all LCB set-up is done. In particular, after + * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts() + */ +static void init_lcb_access(struct hfi1_devdata *dd) +{ + dd->lcb_access_count = 0; +} + +/* + * Write a response back to a 8051 request. + */ +static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) +{ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, + DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK | + (u64)return_code << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT | + (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); +} + +/* + * Handle host requests from the 8051. + */ +static void handle_8051_request(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u16 data = 0; + u8 type; + + reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); + if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) + return; /* no request */ + + /* zero out COMPLETED so the response is seen */ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0); + + /* extract request details */ + type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK; + data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK; + + switch (type) { + case HREQ_LOAD_CONFIG: + case HREQ_SAVE_CONFIG: + case HREQ_READ_CONFIG: + case HREQ_SET_TX_EQ_ABS: + case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: + dd_dev_info(dd, "8051 request: request 0x%x not supported\n", + type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + case HREQ_LCB_RESET: + /* Put the LCB, RX FPE and TX FPE into reset */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_INTO_RESET); + /* Make sure the write completed */ + (void)read_csr(dd, DCC_CFG_RESET); + /* Hold the reset long enough to take effect */ + udelay(1); + /* Take the LCB, RX FPE and TX FPE out of reset */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET); + hreq_response(dd, HREQ_SUCCESS, 0); + + break; + case HREQ_CONFIG_DONE: + hreq_response(dd, HREQ_SUCCESS, 0); + break; + + case HREQ_INTERFACE_TEST: + hreq_response(dd, HREQ_SUCCESS, data); + break; + default: + dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + } +} + +/* + * Set up allocation unit vaulue. + */ +void set_up_vau(struct hfi1_devdata *dd, u8 vau) +{ + u64 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + + /* do not modify other values in the register */ + reg &= ~SEND_CM_GLOBAL_CREDIT_AU_SMASK; + reg |= (u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* + * Set up initial VL15 credits of the remote. Assumes the rest of + * the CM credit registers are zero from a previous global or credit reset. + * Shared limit for VL15 will always be 0. + */ +void set_up_vl15(struct hfi1_devdata *dd, u16 vl15buf) +{ + u64 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + + /* set initial values for total and shared credit limit */ + reg &= ~(SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK | + SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK); + + /* + * Set total limit to be equal to VL15 credits. + * Leave shared limit at 0. + */ + reg |= (u64)vl15buf << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); + + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); +} + +/* + * Zero all credit details from the previous connection and + * reset the CM manager's internal counters. + */ +void reset_link_credits(struct hfi1_devdata *dd) +{ + int i; + + /* remove all previous VL credit limits */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, 0); + /* reset the CM block */ + pio_send_control(dd, PSC_CM_RESET); + /* reset cached value */ + dd->vl15buf_cached = 0; +} + +/* convert a vCU to a CU */ +static u32 vcu_to_cu(u8 vcu) +{ + return 1 << vcu; +} + +/* convert a CU to a vCU */ +static u8 cu_to_vcu(u32 cu) +{ + return ilog2(cu); +} + +/* convert a vAU to an AU */ +static u32 vau_to_au(u8 vau) +{ + return 8 * (1 << vau); +} + +static void set_linkup_defaults(struct hfi1_pportdata *ppd) +{ + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; +} + +/* + * Graceful LCB shutdown. This leaves the LCB FIFOs in reset. + */ +static void lcb_shutdown(struct hfi1_devdata *dd, int abort) +{ + u64 reg; + + /* clear lcb run: LCB_CFG_RUN.EN = 0 */ + write_csr(dd, DC_LCB_CFG_RUN, 0); + /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, + 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT); + /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */ + dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); + reg = read_csr(dd, DCC_CFG_RESET); + write_csr(dd, DCC_CFG_RESET, reg | + DCC_CFG_RESET_RESET_LCB | DCC_CFG_RESET_RESET_RX_FPE); + (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ + if (!abort) { + udelay(1); /* must hold for the longer of 16cclks or 20ns */ + write_csr(dd, DCC_CFG_RESET, reg); + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + } +} + +/* + * This routine should be called after the link has been transitioned to + * OFFLINE (OFFLINE state has the side effect of putting the SerDes into + * reset). + * + * The expectation is that the caller of this routine would have taken + * care of properly transitioning the link into the correct state. + * NOTE: the caller needs to acquire the dd->dc8051_lock lock + * before calling this function. + */ +static void _dc_shutdown(struct hfi1_devdata *dd) +{ + lockdep_assert_held(&dd->dc8051_lock); + + if (dd->dc_shutdown) + return; + + dd->dc_shutdown = 1; + /* Shutdown the LCB */ + lcb_shutdown(dd, 1); + /* + * Going to OFFLINE would have causes the 8051 to put the + * SerDes into reset already. Just need to shut down the 8051, + * itself. + */ + write_csr(dd, DC_DC8051_CFG_RST, 0x1); +} + +static void dc_shutdown(struct hfi1_devdata *dd) +{ + mutex_lock(&dd->dc8051_lock); + _dc_shutdown(dd); + mutex_unlock(&dd->dc8051_lock); +} + +/* + * Calling this after the DC has been brought out of reset should not + * do any damage. + * NOTE: the caller needs to acquire the dd->dc8051_lock lock + * before calling this function. + */ +static void _dc_start(struct hfi1_devdata *dd) +{ + lockdep_assert_held(&dd->dc8051_lock); + + if (!dd->dc_shutdown) + return; + + /* Take the 8051 out of reset */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* Wait until 8051 is ready */ + if (wait_fm_ready(dd, TIMEOUT_8051_START)) + dd_dev_err(dd, "%s: timeout starting 8051 firmware\n", + __func__); + + /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET); + /* lcb_shutdown() with abort=1 does not restore these */ + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + dd->dc_shutdown = 0; +} + +static void dc_start(struct hfi1_devdata *dd) +{ + mutex_lock(&dd->dc8051_lock); + _dc_start(dd); + mutex_unlock(&dd->dc8051_lock); +} + +/* + * These LCB adjustments are for the Aurora SerDes core in the FPGA. + */ +static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd) +{ + u64 rx_radr, tx_radr; + u32 version; + + if (dd->icode != ICODE_FPGA_EMULATION) + return; + + /* + * These LCB defaults on emulator _s are good, nothing to do here: + * LCB_CFG_TX_FIFOS_RADR + * LCB_CFG_RX_FIFOS_RADR + * LCB_CFG_LN_DCLK + * LCB_CFG_IGNORE_LOST_RCLK + */ + if (is_emulator_s(dd)) + return; + /* else this is _p */ + + version = emulator_rev(dd); + if (!is_ax(dd)) + version = 0x2d; /* all B0 use 0x2d or higher settings */ + + if (version <= 0x12) { + /* release 0x12 and below */ + + /* + * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa + */ + rx_radr = + 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + /* + * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default) + * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6 + */ + tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version <= 0x18) { + /* release 0x13 up to 0x18 */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x19) { + /* release 0x19 */ + /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */ + rx_radr = + 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x1a) { + /* release 0x1a */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull); + } else { + /* release 0x1b and higher */ + /* LCB_CFG_RX_FIFOS_RADR = 0x877 */ + rx_radr = + 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } + + write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr); + /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */ + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr); +} + +/* + * Handle a SMA idle message + * + * This is a work-queue function outside of the interrupt. + */ +void handle_sma_message(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + sma_message_work); + struct hfi1_devdata *dd = ppd->dd; + u64 msg; + int ret; + + /* + * msg is bytes 1-4 of the 40-bit idle message - the command code + * is stripped off + */ + ret = read_idle_sma(dd, &msg); + if (ret) + return; + dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg); + /* + * React to the SMA message. Byte[1] (0 for us) is the command. + */ + switch (msg & 0xff) { + case SMA_IDLE_ARM: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Only expected in INIT or ARMED, discard otherwise. + */ + if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED)) + ppd->neighbor_normal = 1; + break; + case SMA_IDLE_ACTIVE: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Can activate the node. Discard otherwise. + */ + if (ppd->host_link_state == HLS_UP_ARMED && + ppd->is_active_optimize_enabled) { + ppd->neighbor_normal = 1; + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret) + dd_dev_err( + dd, + "%s: received Active SMA idle message, couldn't set link to Active\n", + __func__); + } + break; + default: + dd_dev_err(dd, + "%s: received unexpected SMA idle message 0x%llx\n", + __func__, msg); + break; + } +} + +static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear) +{ + u64 rcvctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->rcvctrl_lock, flags); + rcvctrl = read_csr(dd, RCV_CTRL); + rcvctrl |= add; + rcvctrl &= ~clear; + write_csr(dd, RCV_CTRL, rcvctrl); + spin_unlock_irqrestore(&dd->rcvctrl_lock, flags); +} + +static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add) +{ + adjust_rcvctrl(dd, add, 0); +} + +static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear) +{ + adjust_rcvctrl(dd, 0, clear); +} + +/* + * Called from all interrupt handlers to start handling an SPC freeze. + */ +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) +{ + struct hfi1_devdata *dd = ppd->dd; + struct send_context *sc; + int i; + int sc_flags; + + if (flags & FREEZE_SELF) + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + + /* enter frozen mode */ + dd->flags |= HFI1_FROZEN; + + /* notify all SDMA engines that they are going into a freeze */ + sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN)); + + sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ? + SCF_LINK_DOWN : 0); + /* do halt pre-handling on all enabled send contexts */ + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (sc && (sc->flags & SCF_ENABLED)) + sc_stop(sc, sc_flags); + } + + /* Send context are frozen. Notify user space */ + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT); + + if (flags & FREEZE_ABORT) { + dd_dev_err(dd, + "Aborted freeze recovery. Please REBOOT system\n"); + return; + } + /* queue non-interrupt handler */ + queue_work(ppd->hfi1_wq, &ppd->freeze_work); +} + +/* + * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen, + * depending on the "freeze" parameter. + * + * No need to return an error if it times out, our only option + * is to proceed anyway. + */ +static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if (freeze) { + /* waiting until all indicators are set */ + if ((reg & ALL_FROZE) == ALL_FROZE) + return; /* all done */ + } else { + /* waiting until all indicators are clear */ + if ((reg & ALL_FROZE) == 0) + return; /* all done */ + } + + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing", + freeze ? "" : "un", reg & ALL_FROZE, + freeze ? ALL_FROZE : 0ull); + return; + } + usleep_range(80, 120); + } +} + +/* + * Do all freeze handling for the RXE block. + */ +static void rxe_freeze(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_ctxtdata *rcd; + + /* disable port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* disable all receive contexts */ + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, rcd); + hfi1_rcd_put(rcd); + } +} + +/* + * Unfreeze handling for the RXE block - kernel contexts only. + * This will also enable the port. User contexts will do unfreeze + * handling on a per-context basis as they call into the driver. + * + */ +static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) +{ + u32 rcvmask; + u16 i; + struct hfi1_ctxtdata *rcd; + + /* enable all kernel contexts */ + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + + /* Ensure all non-user contexts(including vnic) are enabled */ + if (!rcd || + (i >= dd->first_dyn_alloc_ctxt && !rcd->is_vnic)) { + hfi1_rcd_put(rcd); + continue; + } + rcvmask = HFI1_RCVCTRL_CTXT_ENB; + /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ + rcvmask |= hfi1_rcvhdrtail_kvaddr(rcd) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(dd, rcvmask, rcd); + hfi1_rcd_put(rcd); + } + + /* enable port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +/* + * Non-interrupt SPC freeze handling. + * + * This is a work-queue function outside of the triggering interrupt. + */ +void handle_freeze(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + freeze_work); + struct hfi1_devdata *dd = ppd->dd; + + /* wait for freeze indicators on all affected blocks */ + wait_for_freeze_status(dd, 1); + + /* SPC is now frozen */ + + /* do send PIO freeze steps */ + pio_freeze(dd); + + /* do send DMA freeze steps */ + sdma_freeze(dd); + + /* do send egress freeze steps - nothing to do */ + + /* do receive freeze steps */ + rxe_freeze(dd); + + /* + * Unfreeze the hardware - clear the freeze, wait for each + * block's frozen bit to clear, then clear the frozen flag. + */ + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + + if (is_ax(dd)) { + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + wait_for_freeze_status(dd, 1); + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + } + + /* do send PIO unfreeze steps for kernel contexts */ + pio_kernel_unfreeze(dd); + + /* do send DMA unfreeze steps */ + sdma_unfreeze(dd); + + /* do send egress unfreeze steps - nothing to do */ + + /* do receive unfreeze steps for kernel contexts */ + rxe_kernel_unfreeze(dd); + + /* + * The unfreeze procedure touches global device registers when + * it disables and re-enables RXE. Mark the device unfrozen + * after all that is done so other parts of the driver waiting + * for the device to unfreeze don't do things out of order. + * + * The above implies that the meaning of HFI1_FROZEN flag is + * "Device has gone into freeze mode and freeze mode handling + * is still in progress." + * + * The flag will be removed when freeze mode processing has + * completed. + */ + dd->flags &= ~HFI1_FROZEN; + wake_up(&dd->event_queue); + + /* no longer frozen */ +} + +/** + * update_xmit_counters - update PortXmitWait/PortVlXmitWait + * counters. + * @ppd: info of physical Hfi port + * @link_width: new link width after link up or downgrade + * + * Update the PortXmitWait and PortVlXmitWait counters after + * a link up or downgrade event to reflect a link width change. + */ +static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width) +{ + int i; + u16 tx_width; + u16 link_speed; + + tx_width = tx_link_width(link_width); + link_speed = get_link_speed(ppd->link_speed_active); + + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) + get_xmit_wait_counters(ppd, tx_width, link_speed, i); +} + +/* + * Handle a link up interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_up(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_up_work); + struct hfi1_devdata *dd = ppd->dd; + + set_link_state(ppd, HLS_UP_INIT); + + /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_ltp_rtt(dd); + /* + * OPA specifies that certain counters are cleared on a transition + * to link up, so do that. + */ + clear_linkup_counters(dd); + /* + * And (re)set link up default values. + */ + set_linkup_defaults(ppd); + + /* + * Set VL15 credits. Use cached value from verify cap interrupt. + * In case of quick linkup or simulator, vl15 value will be set by + * handle_linkup_change. VerifyCap interrupt handler will not be + * called in those scenarios. + */ + if (!(quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) + set_up_vl15(dd, dd->vl15buf_cached); + + /* enforce link speed enabled */ + if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) { + /* oops - current speed is not enabled, bounce */ + dd_dev_err(dd, + "Link speed active 0x%x is outside enabled 0x%x, downing link\n", + ppd->link_speed_active, ppd->link_speed_enabled); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0, + OPA_LINKDOWN_REASON_SPEED_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } +} + +/* + * Several pieces of LNI information were cached for SMA in ppd. + * Reset these on link down + */ +static void reset_neighbor_info(struct hfi1_pportdata *ppd) +{ + ppd->neighbor_guid = 0; + ppd->neighbor_port_number = 0; + ppd->neighbor_type = 0; + ppd->neighbor_fm_security = 0; +} + +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + +/* + * Handle a link down interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_down(struct work_struct *work) +{ + u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; + + if ((ppd->host_link_state & + (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && + ppd->port_type == PORT_TYPE_FIXED) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); + + /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); + set_link_state(ppd, HLS_DN_OFFLINE); + xchg(&ppd->is_link_down_queued, 0); + + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. + */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } + + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } + + set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + + reset_neighbor_info(ppd); + + /* disable the port */ + clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* + * If there is no cable attached, turn the DC off. Otherwise, + * start the link bring up. + */ + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) + dc_shutdown(ppd->dd); + else + start_link(ppd); +} + +void handle_link_bounce(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_bounce_work); + + /* + * Only do something if the link is currently up. + */ + if (ppd->host_link_state & HLS_UP) { + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } else { + dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n", + __func__, link_state_name(ppd->host_link_state)); + } +} + +/* + * Mask conversion: Capability exchange to Port LTP. The capability + * exchange has an implicit 16b CRC that is mandatory. + */ +static int cap_to_port_ltp(int cap) +{ + int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */ + + if (cap & CAP_CRC_14B) + port_ltp |= PORT_LTP_CRC_MODE_14; + if (cap & CAP_CRC_48B) + port_ltp |= PORT_LTP_CRC_MODE_48; + if (cap & CAP_CRC_12B_16B_PER_LANE) + port_ltp |= PORT_LTP_CRC_MODE_PER_LANE; + + return port_ltp; +} + +/* + * Convert an OPA Port LTP mask to capability mask + */ +int port_ltp_to_cap(int port_ltp) +{ + int cap_mask = 0; + + if (port_ltp & PORT_LTP_CRC_MODE_14) + cap_mask |= CAP_CRC_14B; + if (port_ltp & PORT_LTP_CRC_MODE_48) + cap_mask |= CAP_CRC_48B; + if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) + cap_mask |= CAP_CRC_12B_16B_PER_LANE; + + return cap_mask; +} + +/* + * Convert a single DC LCB CRC mode to an OPA Port LTP mask. + */ +static int lcb_to_port_ltp(int lcb_crc) +{ + int port_ltp = 0; + + if (lcb_crc == LCB_CRC_12B_16B_PER_LANE) + port_ltp = PORT_LTP_CRC_MODE_PER_LANE; + else if (lcb_crc == LCB_CRC_48B) + port_ltp = PORT_LTP_CRC_MODE_48; + else if (lcb_crc == LCB_CRC_14B) + port_ltp = PORT_LTP_CRC_MODE_14; + else + port_ltp = PORT_LTP_CRC_MODE_16; + + return port_ltp; +} + +static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + if (ppd->pkeys[2] != 0) { + ppd->pkeys[2] = 0; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(ppd->dd, ppd->port); + } +} + +/* + * Convert the given link width to the OPA link width bitmask. + */ +static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width) +{ + switch (width) { + case 0: + /* + * Simulator and quick linkup do not set the width. + * Just set it to 4x without complaint. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup) + return OPA_LINK_WIDTH_4X; + return 0; /* no lanes up */ + case 1: return OPA_LINK_WIDTH_1X; + case 2: return OPA_LINK_WIDTH_2X; + case 3: return OPA_LINK_WIDTH_3X; + case 4: return OPA_LINK_WIDTH_4X; + default: + dd_dev_info(dd, "%s: invalid width %d, using 4\n", + __func__, width); + return OPA_LINK_WIDTH_4X; + } +} + +/* + * Do a population count on the bottom nibble. + */ +static const u8 bit_counts[16] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 +}; + +static inline u8 nibble_to_count(u8 nibble) +{ + return bit_counts[nibble & 0xf]; +} + +/* + * Read the active lane information from the 8051 registers and return + * their widths. + * + * Active lane information is found in these 8051 registers: + * enable_lane_tx + * enable_lane_rx + */ +static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 tx, rx; + u8 enable_lane_rx; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + u8 max_rate; + + /* read the active lanes */ + read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &max_rate); + read_local_lni(dd, &enable_lane_rx); + + /* convert to counts */ + tx = nibble_to_count(enable_lane_tx); + rx = nibble_to_count(enable_lane_rx); + + /* + * Set link_speed_active here, overriding what was set in + * handle_verify_cap(). The ASIC 8051 firmware does not correctly + * set the max_rate field in handle_verify_cap until v0.19. + */ + if ((dd->icode == ICODE_RTL_SILICON) && + (dd->dc8051_ver < dc8051_ver(0, 19, 0))) { + /* max_rate: 0 = 12.5G, 1 = 25G */ + switch (max_rate) { + case 0: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G; + break; + case 1: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; + break; + default: + dd_dev_err(dd, + "%s: unexpected max rate %d, using 25Gb\n", + __func__, (int)max_rate); + dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } + + dd_dev_info(dd, + "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n", + enable_lane_tx, tx, enable_lane_rx, rx); + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); +} + +/* + * Read verify_cap_local_fm_link_width[1] to obtain the link widths. + * Valid after the end of VerifyCap and during LinkUp. Does not change + * after link up. I.e. look elsewhere for downgrade information. + * + * Bits are: + * + bits [7:4] contain the number of active transmitters + * + bits [3:0] contain the number of active receivers + * These are numbers 1 through 4 and can be different values if the + * link is asymmetric. + * + * verify_cap_local_fm_link_width[0] retains its original value. + */ +static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 widths, tx, rx; + u8 misc_bits, local_flags; + u16 active_tx, active_rx; + + read_vc_local_link_mode(dd, &misc_bits, &local_flags, &widths); + tx = widths >> 12; + rx = (widths >> 8) & 0xf; + + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); + + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); +} + +/* + * Set ppd->link_width_active and ppd->link_width_downgrade_active using + * hardware information when the link first comes up. + * + * The link width is not available until after VerifyCap.AllFramesReceived + * (the trigger for handle_verify_cap), so this is outside that routine + * and should be called when the 8051 signals linkup. + */ +void get_linkup_link_widths(struct hfi1_pportdata *ppd) +{ + u16 tx_width, rx_width; + + /* get end-of-LNI link widths */ + get_linkup_widths(ppd->dd, &tx_width, &rx_width); + + /* use tx_width as the link is supposed to be symmetric on link up */ + ppd->link_width_active = tx_width; + /* link width downgrade active (LWD.A) starts out matching LW.A */ + ppd->link_width_downgrade_tx_active = ppd->link_width_active; + ppd->link_width_downgrade_rx_active = ppd->link_width_active; + /* per OPA spec, on link up LWD.E resets to LWD.S */ + ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported; + /* cache the active egress rate (units {10^6 bits/sec]) */ + ppd->current_egress_rate = active_egress_rate(ppd); +} + +/* + * Handle a verify capabilities interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_verify_cap(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_vc_work); + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u8 power_management; + u8 continuous; + u8 vcu; + u8 vau; + u8 z; + u16 vl15buf; + u16 link_widths; + u16 crc_mask; + u16 crc_val; + u16 device_id; + u16 active_tx, active_rx; + u8 partner_supported_crc; + u8 remote_tx_rate; + u8 device_rev; + + set_link_state(ppd, HLS_VERIFY_CAP); + + lcb_shutdown(dd, 0); + adjust_lcb_for_fpga_serdes(dd); + + read_vc_remote_phy(dd, &power_management, &continuous); + read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf, + &partner_supported_crc); + read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths); + read_remote_device_id(dd, &device_id, &device_rev); + + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); + dd_dev_info(dd, + "Peer PHY: power management 0x%x, continuous updates 0x%x\n", + (int)power_management, (int)continuous); + dd_dev_info(dd, + "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", + (int)vau, (int)z, (int)vcu, (int)vl15buf, + (int)partner_supported_crc); + dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n", + (u32)remote_tx_rate, (u32)link_widths); + dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n", + (u32)device_id, (u32)device_rev); + /* + * The peer vAU value just read is the peer receiver value. HFI does + * not support a transmit vAU of 0 (AU == 8). We advertised that + * with Z=1 in the fabric capabilities sent to the peer. The peer + * will see our Z=1, and, if it advertised a vAU of 0, will move its + * receive to vAU of 1 (AU == 16). Do the same here. We do not care + * about the peer Z value - our sent vAU is 3 (hardwired) and is not + * subject to the Z value exception. + */ + if (vau == 0) + vau = 1; + set_up_vau(dd, vau); + + /* + * Set VL15 credits to 0 in global credit register. Cache remote VL15 + * credits value and wait for link-up interrupt ot set it. + */ + set_up_vl15(dd, 0); + dd->vl15buf_cached = vl15buf; + + /* set up the LCB CRC mode */ + crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc; + + /* order is important: use the lowest bit in common */ + if (crc_mask & CAP_CRC_14B) + crc_val = LCB_CRC_14B; + else if (crc_mask & CAP_CRC_48B) + crc_val = LCB_CRC_48B; + else if (crc_mask & CAP_CRC_12B_16B_PER_LANE) + crc_val = LCB_CRC_12B_16B_PER_LANE; + else + crc_val = LCB_CRC_16B; + + dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val); + write_csr(dd, DC_LCB_CFG_CRC_MODE, + (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT); + + /* set (14b only) or clear sideband credit */ + reg = read_csr(dd, SEND_CM_CTRL); + if (crc_val == LCB_CRC_14B && crc_14b_sideband) { + write_csr(dd, SEND_CM_CTRL, + reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } else { + write_csr(dd, SEND_CM_CTRL, + reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } + + ppd->link_speed_active = 0; /* invalid value */ + if (dd->dc8051_ver < dc8051_ver(0, 20, 0)) { + /* remote_tx_rate: 0 = 12.5G, 1 = 25G */ + switch (remote_tx_rate) { + case 0: + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + break; + case 1: + ppd->link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } else { + /* actual rate is highest bit of the ANDed rates */ + u8 rate = remote_tx_rate & ppd->local_tx_rate; + + if (rate & 2) + ppd->link_speed_active = OPA_LINK_SPEED_25G; + else if (rate & 1) + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + } + if (ppd->link_speed_active == 0) { + dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n", + __func__, (int)remote_tx_rate); + ppd->link_speed_active = OPA_LINK_SPEED_25G; + } + + /* + * Cache the values of the supported, enabled, and active + * LTP CRC modes to return in 'portinfo' queries. But the bit + * flags that are returned in the portinfo query differ from + * what's in the link_crc_mask, crc_sizes, and crc_val + * variables. Convert these here. + */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* supported crc modes */ + ppd->port_ltp_crc_mode |= + cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4; + /* enabled crc modes */ + ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val); + /* active crc mode */ + + /* set up the remote credit return table */ + assign_remote_cm_au_table(dd, vcu); + + /* + * The LCB is reset on entry to handle_verify_cap(), so this must + * be applied on every link up. + * + * Adjust LCB error kill enable to kill the link if + * these RBUF errors are seen: + * REPLAY_BUF_MBE_SMASK + * FLIT_INPUT_BUF_MBE_SMASK + */ + if (is_ax(dd)) { /* fixed in B0 */ + reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN); + reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK + | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK; + write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg); + } + + /* pull LCB fifos out of reset - all fifo clocks must be stable */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* give 8051 access to the LCB CSRs */ + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + /* tell the 8051 to go to LinkUp */ + set_link_state(ppd, HLS_GOING_UP); +} + +/** + * apply_link_downgrade_policy - Apply the link width downgrade enabled + * policy against the current active link widths. + * @ppd: info of physical Hfi port + * @refresh_widths: True indicates link downgrade event + * @return: True indicates a successful link downgrade. False indicates + * link downgrade event failed and the link will bounce back to + * default link width. + * + * Called when the enabled policy changes or the active link widths + * change. + * Refresh_widths indicates that a link downgrade occurred. The + * link_downgraded variable is set by refresh_widths and + * determines the success/failure of the policy application. + */ +bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + bool refresh_widths) +{ + int do_bounce = 0; + int tries; + u16 lwde; + u16 tx, rx; + bool link_downgraded = refresh_widths; + + /* use the hls lock to avoid a race with actual link up */ + tries = 0; +retry: + mutex_lock(&ppd->hls_lock); + /* only apply if the link is up */ + if (ppd->host_link_state & HLS_DOWN) { + /* still going up..wait and retry */ + if (ppd->host_link_state & HLS_GOING_UP) { + if (++tries < 1000) { + mutex_unlock(&ppd->hls_lock); + usleep_range(100, 120); /* arbitrary */ + goto retry; + } + dd_dev_err(ppd->dd, + "%s: giving up waiting for link state change\n", + __func__); + } + goto done; + } + + lwde = ppd->link_width_downgrade_enabled; + + if (refresh_widths) { + get_link_widths(ppd->dd, &tx, &rx); + ppd->link_width_downgrade_tx_active = tx; + ppd->link_width_downgrade_rx_active = rx; + } + + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + link_downgraded = false; + } else if (lwde == 0) { + /* downgrade is disabled */ + + /* bounce if not at starting active width */ + if ((ppd->link_width_active != + ppd->link_width_downgrade_tx_active) || + (ppd->link_width_active != + ppd->link_width_downgrade_rx_active)) { + dd_dev_err(ppd->dd, + "Link downgrade is disabled and link has downgraded, downing link\n"); + dd_dev_err(ppd->dd, + " original 0x%x, tx active 0x%x, rx active 0x%x\n", + ppd->link_width_active, + ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + link_downgraded = false; + } + } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 || + (lwde & ppd->link_width_downgrade_rx_active) == 0) { + /* Tx or Rx is outside the enabled policy */ + dd_dev_err(ppd->dd, + "Link is outside of downgrade allowed, downing link\n"); + dd_dev_err(ppd->dd, + " enabled 0x%x, tx active 0x%x, rx active 0x%x\n", + lwde, ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + link_downgraded = false; + } + +done: + mutex_unlock(&ppd->hls_lock); + + if (do_bounce) { + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0, + OPA_LINKDOWN_REASON_WIDTH_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } + + return link_downgraded; +} + +/* + * Handle a link downgrade interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_downgrade(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_downgrade_work); + + dd_dev_info(ppd->dd, "8051: Link width downgrade\n"); + if (apply_link_downgrade_policy(ppd, true)) + update_xmit_counters(ppd, ppd->link_width_downgrade_tx_active); +} + +static char *dcc_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dcc_err_flags, + ARRAY_SIZE(dcc_err_flags)); +} + +static char *lcb_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, lcb_err_flags, + ARRAY_SIZE(lcb_err_flags)); +} + +static char *dc8051_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_err_flags, + ARRAY_SIZE(dc8051_err_flags)); +} + +static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_err_flags, + ARRAY_SIZE(dc8051_info_err_flags)); +} + +static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags, + ARRAY_SIZE(dc8051_info_host_msg_flags)); +} + +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 info, err, host_msg; + int queue_link_down = 0; + char buf[96]; + + /* look at the flags */ + if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) { + /* 8051 information set by firmware */ + /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */ + info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051); + err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK; + host_msg = (info >> + DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK; + + /* + * Handle error flags. + */ + if (err & FAILED_LNI) { + /* + * LNI error indications are cleared by the 8051 + * only when starting polling. Only pay attention + * to them when in the states that occur during + * LNI. + */ + if (ppd->host_link_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + queue_link_down = 1; + dd_dev_info(dd, "Link error: %s\n", + dc8051_info_err_string(buf, + sizeof(buf), + err & + FAILED_LNI)); + } + err &= ~(u64)FAILED_LNI; + } + /* unknown frames can happen durning LNI, just count */ + if (err & UNKNOWN_FRAME) { + ppd->unknown_frame_count++; + err &= ~(u64)UNKNOWN_FRAME; + } + if (err) { + /* report remaining errors, but do not do anything */ + dd_dev_err(dd, "8051 info error: %s\n", + dc8051_info_err_string(buf, sizeof(buf), + err)); + } + + /* + * Handle host message flags. + */ + if (host_msg & HOST_REQ_DONE) { + /* + * Presently, the driver does a busy wait for + * host requests to complete. This is only an + * informational message. + * NOTE: The 8051 clears the host message + * information *on the next 8051 command*. + * Therefore, when linkup is achieved, + * this flag will still be set. + */ + host_msg &= ~(u64)HOST_REQ_DONE; + } + if (host_msg & BC_SMA_MSG) { + queue_work(ppd->link_wq, &ppd->sma_message_work); + host_msg &= ~(u64)BC_SMA_MSG; + } + if (host_msg & LINKUP_ACHIEVED) { + dd_dev_info(dd, "8051: Link up\n"); + queue_work(ppd->link_wq, &ppd->link_up_work); + host_msg &= ~(u64)LINKUP_ACHIEVED; + } + if (host_msg & EXT_DEVICE_CFG_REQ) { + handle_8051_request(ppd); + host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; + } + if (host_msg & VERIFY_CAP_FRAME) { + queue_work(ppd->link_wq, &ppd->link_vc_work); + host_msg &= ~(u64)VERIFY_CAP_FRAME; + } + if (host_msg & LINK_GOING_DOWN) { + const char *extra = ""; + /* no downgrade action needed if going down */ + if (host_msg & LINK_WIDTH_DOWNGRADED) { + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + extra = " (ignoring downgrade)"; + } + dd_dev_info(dd, "8051: Link down%s\n", extra); + queue_link_down = 1; + host_msg &= ~(u64)LINK_GOING_DOWN; + } + if (host_msg & LINK_WIDTH_DOWNGRADED) { + queue_work(ppd->link_wq, &ppd->link_downgrade_work); + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + } + if (host_msg) { + /* report remaining messages, but do not do anything */ + dd_dev_info(dd, "8051 info host message: %s\n", + dc8051_info_host_msg_string(buf, + sizeof(buf), + host_msg)); + } + + reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK; + } + if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) { + /* + * Lost the 8051 heartbeat. If this happens, we + * receive constant interrupts about it. Disable + * the interrupt after the first. + */ + dd_dev_err(dd, "Lost 8051 heartbeat\n"); + write_csr(dd, DC_DC8051_ERR_EN, + read_csr(dd, DC_DC8051_ERR_EN) & + ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK); + + reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK; + } + if (reg) { + /* report the error, but do not do anything */ + dd_dev_err(dd, "8051 error: %s\n", + dc8051_err_string(buf, sizeof(buf), reg)); + } + + if (queue_link_down) { + /* + * if the link is already going down or disabled, do not + * queue another. If there's a link down entry already + * queued, don't queue another one. + */ + if ((ppd->host_link_state & + (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) || + ppd->link_enabled == 0) { + dd_dev_info(dd, "%s: not queuing link down. host_link_state %x, link_enabled %x\n", + __func__, ppd->host_link_state, + ppd->link_enabled); + } else { + if (xchg(&ppd->is_link_down_queued, 1) == 1) + dd_dev_info(dd, + "%s: link down request already queued\n", + __func__); + else + queue_work(ppd->link_wq, &ppd->link_down_work); + } + } +} + +static const char * const fm_config_txt[] = { +[0] = + "BadHeadDist: Distance violation between two head flits", +[1] = + "BadTailDist: Distance violation between two tail flits", +[2] = + "BadCtrlDist: Distance violation between two credit control flits", +[3] = + "BadCrdAck: Credits return for unsupported VL", +[4] = + "UnsupportedVLMarker: Received VL Marker", +[5] = + "BadPreempt: Exceeded the preemption nesting level", +[6] = + "BadControlFlit: Received unsupported control flit", +/* no 7 */ +[8] = + "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL", +}; + +static const char * const port_rcv_txt[] = { +[1] = + "BadPktLen: Illegal PktLen", +[2] = + "PktLenTooLong: Packet longer than PktLen", +[3] = + "PktLenTooShort: Packet shorter than PktLen", +[4] = + "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)", +[5] = + "BadDLID: Illegal DLID (0, doesn't match HFI)", +[6] = + "BadL2: Illegal L2 opcode", +[7] = + "BadSC: Unsupported SC", +[9] = + "BadRC: Illegal RC", +[11] = + "PreemptError: Preempting with same VL", +[12] = + "PreemptVL15: Preempting a VL15 packet", +}; + +#define OPA_LDR_FMCONFIG_OFFSET 16 +#define OPA_LDR_PORTRCV_OFFSET 0 +static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 info, hdr0, hdr1; + const char *extra; + char buf[96]; + struct hfi1_pportdata *ppd = dd->pport; + u8 lcl_reason = 0; + int do_bounce = 0; + + if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) { + if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) { + info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE); + dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK; + } + reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) { + struct hfi1_pportdata *ppd = dd->pport; + /* this counter saturates at (2^32) - 1 */ + if (ppd->link_downed < (u32)UINT_MAX) + ppd->link_downed++; + reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_FMCONFIG); + if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) { + dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK; + } + switch (info) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + extra = fm_config_txt[info]; + break; + case 8: + extra = fm_config_txt[info]; + if (ppd->port_error_action & + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) { + do_bounce = 1; + /* + * lcl_reason cannot be derived from info + * for this error + */ + lcl_reason = + OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER; + } + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_FMCONFIG_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST; + } + + /* just report this */ + dd_dev_info_ratelimited(dd, "DCC Error: fmconfig error: %s\n", + extra); + reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_PORTRCV); + hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0); + hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + dd->err_info_rcvport.status_and_code = + info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_rcvport.status_and_code |= + OPA_EI_STATUS_SMASK; + /* + * save first 2 flits in the packet that caused + * the error + */ + dd->err_info_rcvport.packet_flit1 = hdr0; + dd->err_info_rcvport.packet_flit2 = hdr1; + } + switch (info) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 9: + case 11: + case 12: + extra = port_rcv_txt[info]; + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_PORTRCV_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0; + } + + /* just report this */ + dd_dev_info_ratelimited(dd, "DCC Error: PortRcv error: %s\n" + " hdr0 0x%llx, hdr1 0x%llx\n", + extra, hdr0, hdr1); + + reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) { + /* informative only */ + dd_dev_info_ratelimited(dd, "8051 access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK; + } + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) { + /* informative only */ + dd_dev_info_ratelimited(dd, "host access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK; + } + + if (unlikely(hfi1_dbg_fault_suppress_err(&dd->verbs_dev))) + reg &= ~DCC_ERR_FLG_LATE_EBP_ERR_SMASK; + + /* report any remaining errors */ + if (reg) + dd_dev_info_ratelimited(dd, "DCC Error: %s\n", + dcc_err_string(buf, sizeof(buf), reg)); + + if (lcl_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN; + + if (do_bounce) { + dd_dev_info_ratelimited(dd, "%s: PortErrorAction bounce\n", + __func__); + set_link_down_reason(ppd, lcl_reason, 0, lcl_reason); + queue_work(ppd->link_wq, &ppd->link_bounce_work); + } +} + +static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "LCB Error: %s\n", + lcb_err_string(buf, sizeof(buf), reg)); +} + +/* + * CCE block DC interrupt. Source is < 8. + */ +static void is_dc_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &dc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else if (source == 3 /* dc_lbm_int */) { + /* + * This indicates that a parity error has occurred on the + * address/control lines presented to the LBM. The error + * is a single pulse, there is no associated error flag, + * and it is non-maskable. This is because if a parity + * error occurs on the request the request is dropped. + * This should never occur, but it is nice to know if it + * ever does. + */ + dd_dev_err(dd, "Parity error in DC LBM block\n"); + } else { + dd_dev_err(dd, "Invalid DC interrupt %u\n", source); + } +} + +/* + * TX block send credit interrupt. Source is < 160. + */ +static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source) +{ + sc_group_release_update(dd, source); +} + +/* + * TX block SDMA interrupt. Source is < 48. + * + * SDMA interrupts are grouped by type: + * + * 0 - N-1 = SDma + * N - 2N-1 = SDmaProgress + * 2N - 3N-1 = SDmaIdle + */ +static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(&dd->per_sdma[which]); +#endif + + if (likely(what < 3 && which < dd->num_sdma)) { + sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source); + } else { + /* should not happen */ + dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source); + } +} + +/** + * is_rcv_avail_int() - User receive context available IRQ handler + * @dd: valid dd + * @source: logical IRQ source (offset from IS_RCVAVAIL_START) + * + * RX block receive available interrupt. Source is < 160. + * + * This is the general interrupt handler for user (PSM) receive contexts, + * and can only be used for non-threaded IRQs. + */ +static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = hfi1_rcd_get_by_index(dd, source); + if (rcd) { + handle_user_interrupt(rcd); + hfi1_rcd_put(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n", + err_detail, source); +} + +/** + * is_rcv_urgent_int() - User receive context urgent IRQ handler + * @dd: valid dd + * @source: logical IRQ source (offset from IS_RCVURGENT_START) + * + * RX block receive urgent interrupt. Source is < 160. + * + * NOTE: kernel receive contexts specifically do NOT enable this IRQ. + */ +static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = hfi1_rcd_get_by_index(dd, source); + if (rcd) { + handle_user_interrupt(rcd); + hfi1_rcd_put(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n", + err_detail, source); +} + +/* + * Reserved range interrupt. Should not be called in normal operation. + */ +static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source) +{ + char name[64]; + + dd_dev_err(dd, "unexpected %s interrupt\n", + is_reserved_name(name, sizeof(name), source)); +} + +static const struct is_table is_table[] = { +/* + * start end + * name func interrupt func + */ +{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END, + is_misc_err_name, is_misc_err_int }, +{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END, + is_sdma_eng_err_name, is_sdma_eng_err_int }, +{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, + is_sendctxt_err_name, is_sendctxt_err_int }, +{ IS_SDMA_START, IS_SDMA_IDLE_END, + is_sdma_eng_name, is_sdma_eng_int }, +{ IS_VARIOUS_START, IS_VARIOUS_END, + is_various_name, is_various_int }, +{ IS_DC_START, IS_DC_END, + is_dc_name, is_dc_int }, +{ IS_RCVAVAIL_START, IS_RCVAVAIL_END, + is_rcv_avail_name, is_rcv_avail_int }, +{ IS_RCVURGENT_START, IS_RCVURGENT_END, + is_rcv_urgent_name, is_rcv_urgent_int }, +{ IS_SENDCREDIT_START, IS_SENDCREDIT_END, + is_send_credit_name, is_send_credit_int}, +{ IS_RESERVED_START, IS_RESERVED_END, + is_reserved_name, is_reserved_int}, +}; + +/* + * Interrupt source interrupt - called when the given source has an interrupt. + * Source is a bit index into an array of 64-bit integers. + */ +static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) +{ + const struct is_table *entry; + + /* avoids a double compare by walking the table in-order */ + for (entry = &is_table[0]; entry->is_name; entry++) { + if (source <= entry->end) { + trace_hfi1_interrupt(dd, entry, source); + entry->is_int(dd, source - entry->start); + return; + } + } + /* fell off the end */ + dd_dev_err(dd, "invalid interrupt source %u\n", source); +} + +/** + * general_interrupt - General interrupt handler + * @irq: MSIx IRQ vector + * @data: hfi1 devdata + * + * This is able to correctly handle all non-threaded interrupts. Receive + * context DATA IRQs are threaded and are not supported by this handler. + * + */ +irqreturn_t general_interrupt(int irq, void *data) +{ + struct hfi1_devdata *dd = data; + u64 regs[CCE_NUM_INT_CSRS]; + u32 bit; + int i; + irqreturn_t handled = IRQ_NONE; + + this_cpu_inc(*dd->int_counter); + + /* phase 1: scan and clear all handled interrupts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + if (dd->gi_mask[i] == 0) { + regs[i] = 0; /* used later */ + continue; + } + regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) & + dd->gi_mask[i]; + /* only clear if anything is set */ + if (regs[i]) + write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]); + } + + /* phase 2: call the appropriate handler */ + for_each_set_bit(bit, (unsigned long *)®s[0], + CCE_NUM_INT_CSRS * 64) { + is_interrupt(dd, bit); + handled = IRQ_HANDLED; + } + + return handled; +} + +irqreturn_t sdma_interrupt(int irq, void *data) +{ + struct sdma_engine *sde = data; + struct hfi1_devdata *dd = sde->dd; + u64 status; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(sde); +#endif + + this_cpu_inc(*dd->int_counter); + + /* This read_csr is really bad in the hot path */ + status = read_csr(dd, + CCE_INT_STATUS + (8 * (IS_SDMA_START / 64))) + & sde->imask; + if (likely(status)) { + /* clear the interrupt(s) */ + write_csr(dd, + CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)), + status); + + /* handle the interrupt(s) */ + sdma_engine_interrupt(sde, status); + } else { + dd_dev_info_ratelimited(dd, "SDMA engine %u interrupt, but no status bits set\n", + sde->this_idx); + } + return IRQ_HANDLED; +} + +/* + * Clear the receive interrupt. Use a read of the interrupt clear CSR + * to insure that the write completed. This does NOT guarantee that + * queued DMA writes to memory from the chip are pushed. + */ +static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg); + + write_csr(dd, addr, rcd->imask); + /* force the above write on the chip and get a value back */ + (void)read_csr(dd, addr); +} + +/* force the receive interrupt */ +void force_recv_intr(struct hfi1_ctxtdata *rcd) +{ + write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask); +} + +/* + * Return non-zero if a packet is present. + * + * This routine is called when rechecking for packets after the RcvAvail + * interrupt has been cleared down. First, do a quick check of memory for + * a packet present. If not found, use an expensive CSR read of the context + * tail to determine the actual tail. The CSR read is necessary because there + * is no method to push pending DMAs to memory other than an interrupt and we + * are trying to determine if we need to force an interrupt. + */ +static inline int check_packet_present(struct hfi1_ctxtdata *rcd) +{ + u32 tail; + + if (hfi1_packet_present(rcd)) + return 1; + + /* fall back to a CSR read, correct indpendent of DMA_RTAIL */ + tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + return hfi1_rcd_head(rcd) != tail; +} + +/* + * Common code for receive contexts interrupt handlers. + * Update traces, increment kernel IRQ counter and + * setup ASPM when needed. + */ +static void receive_interrupt_common(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + + trace_hfi1_receive_interrupt(dd, rcd); + this_cpu_inc(*dd->int_counter); + aspm_ctx_disable(rcd); +} + +/* + * __hfi1_rcd_eoi_intr() - Make HW issue receive interrupt + * when there are packets present in the queue. When calling + * with interrupts enabled please use hfi1_rcd_eoi_intr. + * + * @rcd: valid receive context + */ +static void __hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd) +{ + if (!rcd->rcvhdrq) + return; + clear_recv_intr(rcd); + if (check_packet_present(rcd)) + force_recv_intr(rcd); +} + +/** + * hfi1_rcd_eoi_intr() - End of Interrupt processing action + * + * @rcd: Ptr to hfi1_ctxtdata of receive context + * + * Hold IRQs so we can safely clear the interrupt and + * recheck for a packet that may have arrived after the previous + * check and the interrupt clear. If a packet arrived, force another + * interrupt. This routine can be called at the end of receive packet + * processing in interrupt service routines, interrupt service thread + * and softirqs + */ +static void hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd) +{ + unsigned long flags; + + local_irq_save(flags); + __hfi1_rcd_eoi_intr(rcd); + local_irq_restore(flags); +} + +/** + * hfi1_netdev_rx_napi - napi poll function to move eoi inline + * @napi: pointer to napi object + * @budget: netdev budget + */ +int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget) +{ + struct hfi1_netdev_rxq *rxq = container_of(napi, + struct hfi1_netdev_rxq, napi); + struct hfi1_ctxtdata *rcd = rxq->rcd; + int work_done = 0; + + work_done = rcd->do_interrupt(rcd, budget); + + if (work_done < budget) { + napi_complete_done(napi, work_done); + hfi1_rcd_eoi_intr(rcd); + } + + return work_done; +} + +/* Receive packet napi handler for netdevs VNIC and AIP */ +irqreturn_t receive_context_interrupt_napi(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + + receive_interrupt_common(rcd); + + if (likely(rcd->napi)) { + if (likely(napi_schedule_prep(rcd->napi))) + __napi_schedule_irqoff(rcd->napi); + else + __hfi1_rcd_eoi_intr(rcd); + } else { + WARN_ONCE(1, "Napi IRQ handler without napi set up ctxt=%d\n", + rcd->ctxt); + __hfi1_rcd_eoi_intr(rcd); + } + + return IRQ_HANDLED; +} + +/* + * Receive packet IRQ handler. This routine expects to be on its own IRQ. + * This routine will try to handle packets immediately (latency), but if + * it finds too many, it will invoke the thread handler (bandwitdh). The + * chip receive interrupt is *not* cleared down until this or the thread (if + * invoked) is finished. The intent is to avoid extra interrupts while we + * are processing packets anyway. + */ +irqreturn_t receive_context_interrupt(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + int disposition; + + receive_interrupt_common(rcd); + + /* receive interrupt remains blocked while processing packets */ + disposition = rcd->do_interrupt(rcd, 0); + + /* + * Too many packets were seen while processing packets in this + * IRQ handler. Invoke the handler thread. The receive interrupt + * remains blocked. + */ + if (disposition == RCV_PKT_LIMIT) + return IRQ_WAKE_THREAD; + + __hfi1_rcd_eoi_intr(rcd); + return IRQ_HANDLED; +} + +/* + * Receive packet thread handler. This expects to be invoked with the + * receive interrupt still blocked. + */ +irqreturn_t receive_context_thread(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + + /* receive interrupt is still blocked from the IRQ handler */ + (void)rcd->do_interrupt(rcd, 1); + + hfi1_rcd_eoi_intr(rcd); + + return IRQ_HANDLED; +} + +/* ========================================================================= */ + +u32 read_physical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT) + & DC_DC8051_STS_CUR_STATE_PORT_MASK; +} + +u32 read_logical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT) + & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK; +} + +static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + /* clear current state, set new state */ + reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK; + reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT; + write_csr(dd, DCC_CFG_PORT_CONFIG, reg); +} + +/* + * Use the 8051 to read a LCB CSR. + */ +static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + if (acquire_lcb_access(dd, 0) == 0) { + *data = read_csr(dd, addr); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Provide a cache for some of the LCB registers in case the LCB is + * unavailable. + * (The LCB is unavailable in certain link states, for example.) + */ +struct lcb_datum { + u32 off; + u64 val; +}; + +static struct lcb_datum lcb_cache[] = { + { DC_LCB_ERR_INFO_RX_REPLAY_CNT, 0}, + { DC_LCB_ERR_INFO_SEQ_CRC_CNT, 0 }, + { DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT, 0 }, +}; + +static void update_lcb_cache(struct hfi1_devdata *dd) +{ + int i; + int ret; + u64 val; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + ret = read_lcb_csr(dd, lcb_cache[i].off, &val); + + /* Update if we get good data */ + if (likely(ret != -EBUSY)) + lcb_cache[i].val = val; + } +} + +static int read_lcb_cache(u32 off, u64 *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + *val = lcb_cache[i].val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +/* + * Read an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return read_lcb_via_8051(dd, addr, data); + /* if going up or down, check the cache, otherwise, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) { + if (read_lcb_cache(addr, data)) + return -EBUSY; + return 0; + } + + /* otherwise, host has access */ + *data = read_csr(dd, addr); + return 0; +} + +/* + * Use the 8051 to write a LCB CSR. + */ +static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || + (dd->dc8051_ver < dc8051_ver(0, 20, 0))) { + if (acquire_lcb_access(dd, 0) == 0) { + write_csr(dd, addr, data); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Write an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return write_lcb_via_8051(dd, addr, data); + /* if going up or down, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) + return -EBUSY; + /* otherwise, host has access */ + write_csr(dd, addr, data); + return 0; +} + +/* + * Returns: + * < 0 = Linux error, not able to get access + * > 0 = 8051 command RETURN_CODE + */ +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data) +{ + u64 reg, completed; + int return_code; + unsigned long timeout; + + hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data); + + mutex_lock(&dd->dc8051_lock); + + /* We can't send any commands to the 8051 if it's in reset */ + if (dd->dc_shutdown) { + return_code = -ENODEV; + goto fail; + } + + /* + * If an 8051 host command timed out previously, then the 8051 is + * stuck. + * + * On first timeout, attempt to reset and restart the entire DC + * block (including 8051). (Is this too big of a hammer?) + * + * If the 8051 times out a second time, the reset did not bring it + * back to healthy life. In that case, fail any subsequent commands. + */ + if (dd->dc8051_timed_out) { + if (dd->dc8051_timed_out > 1) { + dd_dev_err(dd, + "Previous 8051 host command timed out, skipping command %u\n", + type); + return_code = -ENXIO; + goto fail; + } + _dc_shutdown(dd); + _dc_start(dd); + } + + /* + * If there is no timeout, then the 8051 command interface is + * waiting for a command. + */ + + /* + * When writing a LCB CSR, out_data contains the full value to + * be written, while in_data contains the relative LCB + * address in 7:0. Do the work here, rather than the caller, + * of distrubting the write data to where it needs to go: + * + * Write data + * 39:00 -> in_data[47:8] + * 47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE + * 63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA + */ + if (type == HCMD_WRITE_LCB_CSR) { + in_data |= ((*out_data) & 0xffffffffffull) << 8; + /* must preserve COMPLETED - it is tied to hardware */ + reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_0); + reg &= DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK; + reg |= ((((*out_data) >> 40) & 0xff) << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT) + | ((((*out_data) >> 48) & 0xffff) << + DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg); + } + + /* + * Do two writes: the first to stabilize the type and req_data, the + * second to activate. + */ + reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT + | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + + /* wait for completion, alternate: interrupt */ + timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT); + while (1) { + reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1); + completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK; + if (completed) + break; + if (time_after(jiffies, timeout)) { + dd->dc8051_timed_out++; + dd_dev_err(dd, "8051 host command %u timeout\n", type); + if (out_data) + *out_data = 0; + return_code = -ETIMEDOUT; + goto fail; + } + udelay(2); + } + + if (out_data) { + *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK; + if (type == HCMD_READ_LCB_CSR) { + /* top 16 bits are in a different register */ + *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK) + << (48 + - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT); + } + } + return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK; + dd->dc8051_timed_out = 0; + /* + * Clear command for next user. + */ + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0); + +fail: + mutex_unlock(&dd->dc8051_lock); + return return_code; +} + +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) +{ + return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); +} + +int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) +{ + u64 data; + int ret; + + data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT + | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT + | (u64)config_data << LOAD_DATA_DATA_SHIFT; + ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "load 8051 config: field id %d, lane %d, err %d\n", + (int)field_id, (int)lane_id, ret); + } + return ret; +} + +/* + * Read the 8051 firmware "registers". Use the RAM directly. Always + * set the result, even on error. + * Return 0 on success, -errno on failure + */ +int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, + u32 *result) +{ + u64 big_data; + u32 addr; + int ret; + + /* address start depends on the lane_id */ + if (lane_id < 4) + addr = (4 * NUM_GENERAL_FIELDS) + + (lane_id * 4 * NUM_LANE_FIELDS); + else + addr = 0; + addr += field_id * 4; + + /* read is in 8-byte chunks, hardware will truncate the address down */ + ret = read_8051_data(dd, addr, 8, &big_data); + + if (ret == 0) { + /* extract the 4 bytes we want */ + if (addr & 0x4) + *result = (u32)(big_data >> 32); + else + *result = (u32)big_data; + } else { + *result = 0; + dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n", + __func__, lane_id, field_id); + } + + return ret; +} + +static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management, + u8 continuous) +{ + u32 frame; + + frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT + | power_management << POWER_MANAGEMENT_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY, + GENERAL_CONFIG, frame); +} + +static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu, + u16 vl15buf, u8 crc_sizes) +{ + u32 frame; + + frame = (u32)vau << VAU_SHIFT + | (u32)z << Z_SHIFT + | (u32)vcu << VCU_SHIFT + | (u32)vl15buf << VL15BUF_SHIFT + | (u32)crc_sizes << CRC_SIZES_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC, + GENERAL_CONFIG, frame); +} + +static void read_vc_local_link_mode(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_MODE, GENERAL_CONFIG, + &frame); + *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; + *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static int write_vc_local_link_mode(struct hfi1_devdata *dd, + u8 misc_bits, + u8 flag_bits, + u16 link_widths) +{ + u32 frame; + + frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT + | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT + | (u32)link_widths << LINK_WIDTH_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_MODE, GENERAL_CONFIG, + frame); +} + +static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id, + u8 device_rev) +{ + u32 frame; + + frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT) + | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT); + return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame); +} + +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev) +{ + u32 frame; + + read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame); + *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK; + *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT) + & REMOTE_DEVICE_REV_MASK; +} + +int write_host_interface_version(struct hfi1_devdata *dd, u8 version) +{ + u32 frame; + u32 mask; + + mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT); + read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame); + /* Clear, then set field */ + frame &= ~mask; + frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT); + return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, + frame); +} + +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, + u8 *ver_patch) +{ + u32 frame; + + read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame); + *ver_major = (frame >> STS_FM_VERSION_MAJOR_SHIFT) & + STS_FM_VERSION_MAJOR_MASK; + *ver_minor = (frame >> STS_FM_VERSION_MINOR_SHIFT) & + STS_FM_VERSION_MINOR_MASK; + + read_8051_config(dd, VERSION_PATCH, GENERAL_CONFIG, &frame); + *ver_patch = (frame >> STS_FM_VERSION_PATCH_SHIFT) & + STS_FM_VERSION_PATCH_MASK; +} + +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame); + *power_management = (frame >> POWER_MANAGEMENT_SHIFT) + & POWER_MANAGEMENT_MASK; + *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT) + & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK; +} + +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame); + *vau = (frame >> VAU_SHIFT) & VAU_MASK; + *z = (frame >> Z_SHIFT) & Z_MASK; + *vcu = (frame >> VCU_SHIFT) & VCU_MASK; + *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK; + *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK; +} + +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, + u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT) + & REMOTE_TX_RATE_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx) +{ + u32 frame; + + read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame); + *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK; +} + +static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls) +{ + read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls); +} + +static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs) +{ + read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs); +} + +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality) +{ + u32 frame; + int ret; + + *link_quality = 0; + if (dd->pport->host_link_state & HLS_UP) { + ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, + &frame); + if (ret == 0) + *link_quality = (frame >> LINK_QUALITY_SHIFT) + & LINK_QUALITY_MASK; + } +} + +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) +{ + u32 frame; + + read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame); + *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; +} + +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + +static int read_tx_settings(struct hfi1_devdata *dd, + u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, + u8 *max_rate) +{ + u32 frame; + int ret; + + ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame); + *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT) + & ENABLE_LANE_TX_MASK; + *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT) + & TX_POLARITY_INVERSION_MASK; + *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT) + & RX_POLARITY_INVERSION_MASK; + *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK; + return ret; +} + +static int write_tx_settings(struct hfi1_devdata *dd, + u8 enable_lane_tx, + u8 tx_polarity_inversion, + u8 rx_polarity_inversion, + u8 max_rate) +{ + u32 frame; + + /* no need to mask, all variable sizes match field widths */ + frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT + | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT + | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT + | max_rate << MAX_RATE_SHIFT; + return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame); +} + +/* + * Read an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out) +{ + int ret; + + ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "read idle message: type %d, err %d\n", + (u32)type, ret); + return -EINVAL; + } + dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out); + /* return only the payload as we already know the type */ + *data_out >>= IDLE_PAYLOAD_SHIFT; + return 0; +} + +/* + * Read an idle SMA message. To be done in response to a notification from + * the 8051. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data) +{ + return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, + data); +} + +/* + * Send an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int send_idle_message(struct hfi1_devdata *dd, u64 data) +{ + int ret; + + dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data); + ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n", + data, ret); + return -EINVAL; + } + return 0; +} + +/* + * Send an idle SMA message. + * + * Returns 0 on success, -EINVAL on error + */ +int send_idle_sma(struct hfi1_devdata *dd, u64 message) +{ + u64 data; + + data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) | + ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT); + return send_idle_message(dd, data); +} + +/* + * Initialize the LCB then do a quick link up. This may or may not be + * in loopback. + * + * return 0 on success, -errno on error + */ +static int do_quick_linkup(struct hfi1_devdata *dd) +{ + int ret; + + lcb_shutdown(dd, 0); + + if (loopback) { + /* LCB_CFG_LOOPBACK.VAL = 2 */ + /* LCB_CFG_LANE_WIDTH.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_LOOPBACK, + IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); + } + + /* start the LCBs */ + /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* simulator only loopback steps */ + if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + /* LCB_CFG_RUN.EN = 1 */ + write_csr(dd, DC_LCB_CFG_RUN, + 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + + ret = wait_link_transfer_active(dd, 10); + if (ret) + return ret; + + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, + 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT); + } + + if (!loopback) { + /* + * When doing quick linkup and not in loopback, both + * sides must be done with LCB set-up before either + * starts the quick linkup. Put a delay here so that + * both sides can be started and have a chance to be + * done with LCB set up before resuming. + */ + dd_dev_err(dd, + "Pausing for peer to be finished with LCB set up\n"); + msleep(5000); + dd_dev_err(dd, "Continuing with quick linkup\n"); + } + + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + /* + * State "quick" LinkUp request sets the physical link state to + * LinkUp without a verify capability sequence. + * This state is in simulator v37 and later. + */ + ret = set_physical_link_state(dd, PLS_QUICK_LINKUP); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "%s: set physical link state to quick LinkUp failed with return %d\n", + __func__, ret); + + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + if (ret >= 0) + ret = -EINVAL; + return ret; + } + + return 0; /* success */ +} + +/* + * Do all special steps to set up loopback. + */ +static int init_loopback(struct hfi1_devdata *dd) +{ + dd_dev_info(dd, "Entering loopback mode\n"); + + /* all loopbacks should disable self GUID check */ + write_csr(dd, DC_DC8051_CFG_MODE, + (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK)); + + /* + * The simulator has only one loopback option - LCB. Switch + * to that option, which includes quick link up. + * + * Accept all valid loopback values. + */ + if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) && + (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB || + loopback == LOOPBACK_CABLE)) { + loopback = LOOPBACK_LCB; + quick_linkup = 1; + return 0; + } + + /* + * SerDes loopback init sequence is handled in set_local_link_attributes + */ + if (loopback == LOOPBACK_SERDES) + return 0; + + /* LCB loopback - handled at poll time */ + if (loopback == LOOPBACK_LCB) { + quick_linkup = 1; /* LCB is always quick linkup */ + + /* not supported in emulation due to emulation RTL changes */ + if (dd->icode == ICODE_FPGA_EMULATION) { + dd_dev_err(dd, + "LCB loopback not supported in emulation\n"); + return -EINVAL; + } + return 0; + } + + /* external cable loopback requires no extra steps */ + if (loopback == LOOPBACK_CABLE) + return 0; + + dd_dev_err(dd, "Invalid loopback mode %d\n", loopback); + return -EINVAL; +} + +/* + * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits + * used in the Verify Capability link width attribute. + */ +static u16 opa_to_vc_link_widths(u16 opa_widths) +{ + int i; + u16 result = 0; + + static const struct link_bits { + u16 from; + u16 to; + } opa_link_xlate[] = { + { OPA_LINK_WIDTH_1X, 1 << (1 - 1) }, + { OPA_LINK_WIDTH_2X, 1 << (2 - 1) }, + { OPA_LINK_WIDTH_3X, 1 << (3 - 1) }, + { OPA_LINK_WIDTH_4X, 1 << (4 - 1) }, + }; + + for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) { + if (opa_widths & opa_link_xlate[i].from) + result |= opa_link_xlate[i].to; + } + return result; +} + +/* + * Set link attributes before moving to polling. + */ +static int set_local_link_attributes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + int ret; + u32 misc_bits = 0; + /* reset our fabric serdes to clear any lingering problems */ + fabric_serdes_reset(dd); + + /* set the local tx rate - need to read-modify-write */ + ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &ppd->local_tx_rate); + if (ret) + goto set_local_link_attributes_fail; + + if (dd->dc8051_ver < dc8051_ver(0, 20, 0)) { + /* set the tx rate to the fastest enabled */ + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate = 1; + else + ppd->local_tx_rate = 0; + } else { + /* set the tx rate to all enabled */ + ppd->local_tx_rate = 0; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate |= 2; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G) + ppd->local_tx_rate |= 1; + } + + enable_lane_tx = 0xF; /* enable all four lanes */ + ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion, + rx_polarity_inversion, ppd->local_tx_rate); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + goto set_local_link_attributes_fail; + } + + /* + * DC supports continuous updates. + */ + ret = write_vc_local_phy(dd, + 0 /* no power management */, + 1 /* continuous updates */); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* z=1 in the next call: AU of 0 is not supported by the hardware */ + ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init, + ppd->port_crc_mode_enabled); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* + * SerDes loopback init sequence requires + * setting bit 0 of MISC_CONFIG_BITS + */ + if (loopback == LOOPBACK_SERDES) + misc_bits |= 1 << LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT; + + /* + * An external device configuration request is used to reset the LCB + * to retry to obtain operational lanes when the first attempt is + * unsuccesful. + */ + if (dd->dc8051_ver >= dc8051_ver(1, 25, 0)) + misc_bits |= 1 << EXT_CFG_LCB_RESET_SUPPORTED_SHIFT; + + ret = write_vc_local_link_mode(dd, misc_bits, 0, + opa_to_vc_link_widths( + ppd->link_width_enabled)); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* let peer know who we are */ + ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev); + if (ret == HCMD_SUCCESS) + return 0; + +set_local_link_attributes_fail: + dd_dev_err(dd, + "Failed to set local link attributes, return 0x%x\n", + ret); + return ret; +} + +/* + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. + */ +int start_link(struct hfi1_pportdata *ppd) +{ + /* + * Tune the SerDes to a ballpark setting for optimal signal and bit + * error rate. Needs to be done before starting the link. + */ + tune_serdes(ppd); + + if (!ppd->driver_link_ready) { + dd_dev_info(ppd->dd, + "%s: stopping link start because driver is not ready\n", + __func__); + return 0; + } + + /* + * FULL_MGMT_P_KEY is cleared from the pkey table, so that the + * pkey table can be configured properly if the HFI unit is connected + * to switch port with MgmtAllowed=NO + */ + clear_full_mgmt_pkey(ppd); + + return set_link_state(ppd, HLS_DN_POLL); +} + +static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + unsigned long timeout; + + /* + * Some QSFP cables have a quirk that asserts the IntN line as a side + * effect of power up on plug-in. We ignore this false positive + * interrupt until the module has finished powering up by waiting for + * a minimum timeout of the module inrush initialization time of + * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the + * module have stabilized. + */ + msleep(500); + + /* + * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1) + */ + timeout = jiffies + msecs_to_jiffies(2000); + while (1) { + mask = read_csr(dd, dd->hfi1_id ? + ASIC_QSFP2_IN : ASIC_QSFP1_IN); + if (!(mask & QSFP_HFI0_INT_N)) + break; + if (time_after(jiffies, timeout)) { + dd_dev_info(dd, "%s: No IntN detected, reset complete\n", + __func__); + break; + } + udelay(2); + } +} + +static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + + mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK); + if (enable) { + /* + * Clear the status register to avoid an immediate interrupt + * when we re-enable the IntN pin + */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + QSFP_HFI0_INT_N); + mask |= (u64)QSFP_HFI0_INT_N; + } else { + mask &= ~(u64)QSFP_HFI0_INT_N; + } + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); +} + +int reset_qsfp(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask, qsfp_mask; + + /* Disable INT_N from triggering QSFP interrupts */ + set_qsfp_int_n(ppd, 0); + + /* Reset the QSFP */ + mask = (u64)QSFP_HFI0_RESET_N; + + qsfp_mask = read_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); + qsfp_mask &= ~mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + udelay(10); + + qsfp_mask |= mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); + + /* + * After the reset, AOC transmitters are enabled by default. They need + * to be turned off to complete the QSFP setup before they can be + * enabled again. + */ + return set_qsfp_tx(ppd, 0); +} + +static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, + u8 *qsfp_interrupt_status) +{ + struct hfi1_devdata *dd = ppd->dd; + + if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) + dd_dev_err(dd, "%s: QSFP cable temperature too high\n", + __func__); + + if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) + dd_dev_err(dd, "%s: QSFP cable temperature too low\n", + __func__); + + /* + * The remaining alarms/warnings don't matter if the link is down. + */ + if (ppd->host_link_state & HLS_DOWN) + return 0; + + if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) + dd_dev_err(dd, "%s: QSFP supply voltage too high\n", + __func__); + + if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) + dd_dev_err(dd, "%s: QSFP supply voltage too low\n", + __func__); + + /* Byte 2 is vendor specific */ + + if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 3/4 power too low\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 power too low\n", + __func__); + + /* Bytes 9-10 and 11-12 are reserved */ + /* Bytes 13-15 are vendor specific */ + + return 0; +} + +/* This routine will only be scheduled if the QSFP module present is asserted */ +void qsfp_event(struct work_struct *work) +{ + struct qsfp_data *qd; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + + qd = container_of(work, struct qsfp_data, qsfp_work); + ppd = qd->ppd; + dd = ppd->dd; + + /* Sanity check */ + if (!qsfp_mod_present(ppd)) + return; + + if (ppd->host_link_state == HLS_DN_DISABLE) { + dd_dev_info(ppd->dd, + "%s: stopping link start because link is disabled\n", + __func__); + return; + } + + /* + * Turn DC back on after cable has been re-inserted. Up until + * now, the DC has been in reset to save power. + */ + dc_start(dd); + + if (qd->cache_refresh_required) { + set_qsfp_int_n(ppd, 0); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); + + start_link(ppd); + } + + if (qd->check_interrupt_flags) { + u8 qsfp_interrupt_status[16] = {0,}; + + if (one_qsfp_read(ppd, dd->hfi1_id, 6, + &qsfp_interrupt_status[0], 16) != 16) { + dd_dev_info(dd, + "%s: Failed to read status of QSFP module\n", + __func__); + } else { + unsigned long flags; + + handle_qsfp_error_conditions( + ppd, qsfp_interrupt_status); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + } + } +} + +void init_qsfp_int(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 qsfp_mask; + + qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + /* Clear current status to avoid spurious interrupts */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + qsfp_mask); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, + qsfp_mask); + + set_qsfp_int_n(ppd, 0); + + /* Handle active low nature of INT_N and MODPRST_N pins */ + if (qsfp_mod_present(ppd)) + qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, + qsfp_mask); + + /* Enable the appropriate QSFP IRQ source */ + if (!dd->hfi1_id) + set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true); + else + set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true); +} + +/* + * Do a one-time initialize of the LCB block. + */ +static void init_lcb(struct hfi1_devdata *dd) +{ + /* simulator does not correctly handle LCB cclk loopback, skip */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return; + + /* the DC has been reset earlier in the driver load */ + + /* set LCB for cclk loopback on the port */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00); + write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00); + write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110); + write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08); + write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00); +} + +/* + * Perform a test read on the QSFP. Return 0 on success, -ERRNO + * on error. + */ +static int test_qsfp_read(struct hfi1_pportdata *ppd) +{ + int ret; + u8 status; + + /* + * Report success if not a QSFP or, if it is a QSFP, but the cable is + * not present + */ + if (ppd->port_type != PORT_TYPE_QSFP || !qsfp_mod_present(ppd)) + return 0; + + /* read byte 2, the status byte */ + ret = one_qsfp_read(ppd, ppd->dd->hfi1_id, 2, &status, 1); + if (ret < 0) + return ret; + if (ret != 1) + return -EIO; + + return 0; /* success */ +} + +/* + * Values for QSFP retry. + * + * Give up after 10s (20 x 500ms). The overall timeout was empirically + * arrived at from experience on a large cluster. + */ +#define MAX_QSFP_RETRIES 20 +#define QSFP_RETRY_WAIT 500 /* msec */ + +/* + * Try a QSFP read. If it fails, schedule a retry for later. + * Called on first link activation after driver load. + */ +static void try_start_link(struct hfi1_pportdata *ppd) +{ + if (test_qsfp_read(ppd)) { + /* read failed */ + if (ppd->qsfp_retry_count >= MAX_QSFP_RETRIES) { + dd_dev_err(ppd->dd, "QSFP not responding, giving up\n"); + return; + } + dd_dev_info(ppd->dd, + "QSFP not responding, waiting and retrying %d\n", + (int)ppd->qsfp_retry_count); + ppd->qsfp_retry_count++; + queue_delayed_work(ppd->link_wq, &ppd->start_link_work, + msecs_to_jiffies(QSFP_RETRY_WAIT)); + return; + } + ppd->qsfp_retry_count = 0; + + start_link(ppd); +} + +/* + * Workqueue function to start the link after a delay. + */ +void handle_start_link(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + start_link_work.work); + try_start_link(ppd); +} + +int bringup_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 guid; + int ret; + + if (HFI1_CAP_IS_KSET(EXTENDED_PSN)) + add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK); + + guid = ppd->guids[HFI1_PORT_GUID_INDEX]; + if (!guid) { + if (dd->base_guid) + guid = dd->base_guid + ppd->port - 1; + ppd->guids[HFI1_PORT_GUID_INDEX] = guid; + } + + /* Set linkinit_reason on power up per OPA spec */ + ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP; + + /* one-time init of the LCB */ + init_lcb(dd); + + if (loopback) { + ret = init_loopback(dd); + if (ret < 0) + return ret; + } + + get_port_type(ppd); + if (ppd->port_type == PORT_TYPE_QSFP) { + set_qsfp_int_n(ppd, 0); + wait_for_qsfp_init(ppd); + set_qsfp_int_n(ppd, 1); + } + + try_start_link(ppd); + return 0; +} + +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * Shut down the link and keep it down. First turn off that the + * driver wants to allow the link to be up (driver_link_ready). + * Then make sure the link is not automatically restarted + * (link_enabled). Cancel any pending restart. And finally + * go offline. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + + ppd->qsfp_retry_count = MAX_QSFP_RETRIES; /* prevent more retries */ + flush_delayed_work(&ppd->start_link_work); + cancel_delayed_work_sync(&ppd->start_link_work); + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_REBOOT); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_REBOOT, 0, + OPA_LINKDOWN_REASON_REBOOT); + set_link_state(ppd, HLS_DN_OFFLINE); + + /* disable the port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + cancel_work_sync(&ppd->freeze_work); +} + +static inline int init_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); + if (!ppd->ibport_data.rvp.rc_acks || + !ppd->ibport_data.rvp.rc_delayed_comp || + !ppd->ibport_data.rvp.rc_qacks) + return -ENOMEM; + } + + return 0; +} + +/* + * index is the index into the receive array + */ +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order) +{ + u64 reg; + + if (!(dd->flags & HFI1_PRESENT)) + goto done; + + if (type == PT_INVALID || type == PT_INVALID_FLUSH) { + pa = 0; + order = 0; + } else if (type > PT_INVALID) { + dd_dev_err(dd, + "unexpected receive array type %u for index %u, not handled\n", + type, index); + goto done; + } + trace_hfi1_put_tid(dd, index, type, pa, order); + +#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */ + reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK + | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT + | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK) + << RCV_ARRAY_RT_ADDR_SHIFT; + trace_hfi1_write_rcvarray(dd->rcvarray_wc + (index * 8), reg); + writeq(reg, dd->rcvarray_wc + (index * 8)); + + if (type == PT_EAGER || type == PT_INVALID_FLUSH || (index & 3) == 3) + /* + * Eager entries are written and flushed + * + * Expected entries are flushed every 4 writes + */ + flush_wc(); +done: + return; +} + +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 i; + + /* this could be optimized */ + for (i = rcd->eager_base; i < rcd->eager_base + + rcd->egrbufs.alloced; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); + + for (i = rcd->expected_base; + i < rcd->expected_base + rcd->expected_count; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); +} + +static const char * const ib_cfg_name_strings[] = { + "HFI1_IB_CFG_LIDLMC", + "HFI1_IB_CFG_LWID_DG_ENB", + "HFI1_IB_CFG_LWID_ENB", + "HFI1_IB_CFG_LWID", + "HFI1_IB_CFG_SPD_ENB", + "HFI1_IB_CFG_SPD", + "HFI1_IB_CFG_RXPOL_ENB", + "HFI1_IB_CFG_LREV_ENB", + "HFI1_IB_CFG_LINKLATENCY", + "HFI1_IB_CFG_HRTBT", + "HFI1_IB_CFG_OP_VLS", + "HFI1_IB_CFG_VL_HIGH_CAP", + "HFI1_IB_CFG_VL_LOW_CAP", + "HFI1_IB_CFG_OVERRUN_THRESH", + "HFI1_IB_CFG_PHYERR_THRESH", + "HFI1_IB_CFG_LINKDEFAULT", + "HFI1_IB_CFG_PKEYS", + "HFI1_IB_CFG_MTU", + "HFI1_IB_CFG_LSTATE", + "HFI1_IB_CFG_VL_HIGH_LIMIT", + "HFI1_IB_CFG_PMA_TICKS", + "HFI1_IB_CFG_PORT" +}; + +static const char *ib_cfg_name(int which) +{ + if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings)) + return "invalid"; + return ib_cfg_name_strings[which]; +} + +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which) +{ + struct hfi1_devdata *dd = ppd->dd; + int val = 0; + + switch (which) { + case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */ + val = ppd->link_width_enabled; + break; + case HFI1_IB_CFG_LWID: /* currently active Link-width */ + val = ppd->link_width_active; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + val = ppd->link_speed_enabled; + break; + case HFI1_IB_CFG_SPD: /* current Link speed */ + val = ppd->link_speed_active; + break; + + case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */ + case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */ + case HFI1_IB_CFG_LINKLATENCY: + goto unimplemented; + + case HFI1_IB_CFG_OP_VLS: + val = ppd->actual_vls_operational; + break; + case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */ + val = VL_ARB_HIGH_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */ + val = VL_ARB_LOW_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val = ppd->overrun_threshold; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val = ppd->phy_error_threshold; + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + val = HLS_DEFAULT; + break; + + case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */ + case HFI1_IB_CFG_PMA_TICKS: + default: +unimplemented: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info( + dd, + "%s: which %s: not implemented\n", + __func__, + ib_cfg_name(which)); + break; + } + + return val; +} + +/* + * The largest MAD packet size. + */ +#define MAX_MAD_PACKET 2048 + +/* + * Return the maximum header bytes that can go on the _wire_ + * for this device. This count includes the ICRC which is + * not part of the packet held in memory but it is appended + * by the HW. + * This is dependent on the device's receive header entry size. + * HFI allows this to be set per-receive context, but the + * driver presently enforces a global value. + */ +u32 lrh_max_header_bytes(struct hfi1_devdata *dd) +{ + /* + * The maximum non-payload (MTU) bytes in LRH.PktLen are + * the Receive Header Entry Size minus the PBC (or RHF) size + * plus one DW for the ICRC appended by HW. + * + * dd->rcd[0].rcvhdrqentsize is in DW. + * We use rcd[0] as all context will have the same value. Also, + * the first kernel context would have been allocated by now so + * we are guaranteed a valid value. + */ + return (get_hdrqentsize(dd->rcd[0]) - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2; +} + +/* + * Set Send Length + * @ppd: per port data + * + * Set the MTU by limiting how many DWs may be sent. The SendLenCheck* + * registers compare against LRH.PktLen, so use the max bytes included + * in the LRH. + * + * This routine changes all VL values except VL15, which it maintains at + * the same value. + */ +static void set_send_length(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 max_hb = lrh_max_header_bytes(dd), dcmtu; + u32 maxvlmtu = dd->vld[15].mtu; + u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL15_MASK) << + SEND_LEN_CHECK1_LEN_VL15_SHIFT; + int i, j; + u32 thres; + + for (i = 0; i < ppd->vls_supported; i++) { + if (dd->vld[i].mtu > maxvlmtu) + maxvlmtu = dd->vld[i].mtu; + if (i <= 3) + len1 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK0_LEN_VL0_MASK) << + ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT); + else + len2 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL4_MASK) << + ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT); + } + write_csr(dd, SEND_LEN_CHECK0, len1); + write_csr(dd, SEND_LEN_CHECK1, len2); + /* adjust kernel credit return thresholds based on new MTUs */ + /* all kernel receive contexts have the same hdrqentsize */ + for (i = 0; i < ppd->vls_supported; i++) { + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, + get_hdrqentsize(dd->rcd[0]))); + for (j = 0; j < INIT_SC_PER_VL; j++) + sc_set_cr_threshold( + pio_select_send_context_vl(dd, j, i), + thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); + + /* Adjust maximum MTU for the port in DC */ + dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : + (ilog2(maxvlmtu >> 8) + 1); + len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG); + len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK; + len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) << + DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT; + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1); +} + +static void set_lidlmc(struct hfi1_pportdata *ppd) +{ + int i; + u64 sreg = 0; + struct hfi1_devdata *dd = ppd->dd; + u32 mask = ~((1U << ppd->lmc) - 1); + u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + u32 lid; + + /* + * Program 0 in CSR if port lid is extended. This prevents + * 9B packets being sent out for large lids. + */ + lid = (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) ? 0 : ppd->lid; + c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK + | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); + c1 |= ((lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | + ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) + << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1); + + /* + * Iterate over all the send contexts and set their SLID check + */ + sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << + SEND_CTXT_CHECK_SLID_MASK_SHIFT) | + (((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + SEND_CTXT_CHECK_SLID_VALUE_SHIFT); + + for (i = 0; i < chip_send_contexts(dd); i++) { + hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg); + } + + /* Now we have to do the same thing for the sdma engines */ + sdma_update_lmc(dd, mask, lid); +} + +static const char *state_completed_string(u32 completed) +{ + static const char * const state_completed[] = { + "EstablishComm", + "OptimizeEQ", + "VerifyCap" + }; + + if (completed < ARRAY_SIZE(state_completed)) + return state_completed[completed]; + + return "unknown"; +} + +static const char all_lanes_dead_timeout_expired[] = + "All lanes were inactive – was the interconnect media removed?"; +static const char tx_out_of_policy[] = + "Passing lanes on local port do not meet the local link width policy"; +static const char no_state_complete[] = + "State timeout occurred before link partner completed the state"; +static const char * const state_complete_reasons[] = { + [0x00] = "Reason unknown", + [0x01] = "Link was halted by driver, refer to LinkDownReason", + [0x02] = "Link partner reported failure", + [0x10] = "Unable to achieve frame sync on any lane", + [0x11] = + "Unable to find a common bit rate with the link partner", + [0x12] = + "Unable to achieve frame sync on sufficient lanes to meet the local link width policy", + [0x13] = + "Unable to identify preset equalization on sufficient lanes to meet the local link width policy", + [0x14] = no_state_complete, + [0x15] = + "State timeout occurred before link partner identified equalization presets", + [0x16] = + "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy", + [0x17] = tx_out_of_policy, + [0x20] = all_lanes_dead_timeout_expired, + [0x21] = + "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy", + [0x22] = no_state_complete, + [0x23] = + "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy", + [0x24] = tx_out_of_policy, + [0x30] = all_lanes_dead_timeout_expired, + [0x31] = + "State timeout occurred waiting for host to process received frames", + [0x32] = no_state_complete, + [0x33] = + "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy", + [0x34] = tx_out_of_policy, + [0x35] = "Negotiated link width is mutually exclusive", + [0x36] = + "Timed out before receiving verifycap frames in VerifyCap.Exchange", + [0x37] = "Unable to resolve secure data exchange", +}; + +static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd, + u32 code) +{ + const char *str = NULL; + + if (code < ARRAY_SIZE(state_complete_reasons)) + str = state_complete_reasons[code]; + + if (str) + return str; + return "Reserved"; +} + +/* describe the given last state complete frame */ +static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame, + const char *prefix) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 success; + u32 state; + u32 reason; + u32 lanes; + + /* + * Decode frame: + * [ 0: 0] - success + * [ 3: 1] - state + * [ 7: 4] - next state timeout + * [15: 8] - reason code + * [31:16] - lanes + */ + success = frame & 0x1; + state = (frame >> 1) & 0x7; + reason = (frame >> 8) & 0xff; + lanes = (frame >> 16) & 0xffff; + + dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n", + prefix, frame); + dd_dev_err(dd, " last reported state state: %s (0x%x)\n", + state_completed_string(state), state); + dd_dev_err(dd, " state successfully completed: %s\n", + success ? "yes" : "no"); + dd_dev_err(dd, " fail reason 0x%x: %s\n", + reason, state_complete_reason_code_string(ppd, reason)); + dd_dev_err(dd, " passing lane mask: 0x%x", lanes); +} + +/* + * Read the last state complete frames and explain them. This routine + * expects to be called if the link went down during link negotiation + * and initialization (LNI). That is, anywhere between polling and link up. + */ +static void check_lni_states(struct hfi1_pportdata *ppd) +{ + u32 last_local_state; + u32 last_remote_state; + + read_last_local_state(ppd->dd, &last_local_state); + read_last_remote_state(ppd->dd, &last_remote_state); + + /* + * Don't report anything if there is nothing to report. A value of + * 0 means the link was taken down while polling and there was no + * training in-process. + */ + if (last_local_state == 0 && last_remote_state == 0) + return; + + decode_state_complete(ppd, last_local_state, "transmitted"); + decode_state_complete(ppd, last_remote_state, "received"); +} + +/* wait for wait_ms for LINK_TRANSFER_ACTIVE to go to 1 */ +static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms) +{ + u64 reg; + unsigned long timeout; + + /* watch LCB_STS_LINK_TRANSFER_ACTIVE */ + timeout = jiffies + msecs_to_jiffies(wait_ms); + while (1) { + reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE); + if (reg) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "timeout waiting for LINK_TRANSFER_ACTIVE\n"); + return -ETIMEDOUT; + } + udelay(2); + } + return 0; +} + +/* called when the logical link state is not down as it should be */ +static void force_logical_link_state_down(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * Bring link up in LCB loopback + */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 1); + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); + write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0); + write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110); + write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x2); + + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + (void)read_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET); + udelay(3); + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, 1); + write_csr(dd, DC_LCB_CFG_RUN, 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + + wait_link_transfer_active(dd, 100); + + /* + * Bring the link down again. + */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 1); + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, 0); + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, 0); + + dd_dev_info(ppd->dd, "logical state forced to LINK_DOWN\n"); +} + +/* + * Helper for set_link_state(). Do not call except from that routine. + * Expects ppd->hls_mutex to be held. + * + * @rem_reason value to be sent to the neighbor + * + * LinkDownReasons only set if transition succeeds. + */ +static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 previous_state; + int offline_state_ret; + int ret; + + update_lcb_cache(dd); + + previous_state = ppd->host_link_state; + ppd->host_link_state = HLS_GOING_OFFLINE; + + /* start offline transition */ + ret = set_physical_link_state(dd, (rem_reason << 8) | PLS_OFFLINE); + + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Offline link state, return %d\n", + ret); + return -EINVAL; + } + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + + offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); + if (offline_state_ret < 0) + return offline_state_ret; + + /* Disabling AOC transmitters */ + if (ppd->port_type == PORT_TYPE_QSFP && + ppd->qsfp_info.limiting_active && + qsfp_mod_present(ppd)) { + int ret; + + ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT); + if (ret == 0) { + set_qsfp_tx(ppd, 0); + release_chip_resource(dd, qsfp_resource(dd)); + } else { + /* not fatal, but should warn */ + dd_dev_err(dd, + "Unable to acquire lock to turn off QSFP TX\n"); + } + } + + /* + * Wait for the offline.Quiet transition if it hasn't happened yet. It + * can take a while for the link to go down. + */ + if (offline_state_ret != PLS_OFFLINE_QUIET) { + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); + if (ret < 0) + return ret; + } + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + /* make sure the logical state is also down */ + ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + if (ret) + force_logical_link_state_down(ppd); + + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + update_statusp(ppd, IB_PORT_DOWN); + + /* + * The LNI has a mandatory wait time after the physical state + * moves to Offline.Quiet. The wait time may be different + * depending on how the link went down. The 8051 firmware + * will observe the needed wait time and only move to ready + * when that is completed. The largest of the quiet timeouts + * is 6s, so wait that long and then at least 0.5s more for + * other transitions, and another 0.5s for a buffer. + */ + ret = wait_fm_ready(dd, 7000); + if (ret) { + dd_dev_err(dd, + "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); + /* state is really offline, so make it so */ + ppd->host_link_state = HLS_DN_OFFLINE; + return ret; + } + + /* + * The state is now offline and the 8051 is ready to accept host + * requests. + * - change our state + * - notify others if we were previously in a linkup state + */ + ppd->host_link_state = HLS_DN_OFFLINE; + if (previous_state & HLS_UP) { + /* went down while link was up */ + handle_linkup_change(dd, 0); + } else if (previous_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + /* went down while attempting link up */ + check_lni_states(ppd); + + /* The QSFP doesn't need to be reset on LNI failure */ + ppd->qsfp_info.reset_needed = 0; + } + + /* the active link width (downgrade) is 0 on link down */ + ppd->link_width_active = 0; + ppd->link_width_downgrade_tx_active = 0; + ppd->link_width_downgrade_rx_active = 0; + ppd->current_egress_rate = 0; + return 0; +} + +/* return the link state name */ +static const char *link_state_name(u32 state) +{ + const char *name; + int n = ilog2(state); + static const char * const names[] = { + [__HLS_UP_INIT_BP] = "INIT", + [__HLS_UP_ARMED_BP] = "ARMED", + [__HLS_UP_ACTIVE_BP] = "ACTIVE", + [__HLS_DN_DOWNDEF_BP] = "DOWNDEF", + [__HLS_DN_POLL_BP] = "POLL", + [__HLS_DN_DISABLE_BP] = "DISABLE", + [__HLS_DN_OFFLINE_BP] = "OFFLINE", + [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP", + [__HLS_GOING_UP_BP] = "GOING_UP", + [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE", + [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN" + }; + + name = n < ARRAY_SIZE(names) ? names[n] : NULL; + return name ? name : "unknown"; +} + +/* return the link state reason name */ +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state) +{ + if (state == HLS_UP_INIT) { + switch (ppd->linkinit_reason) { + case OPA_LINKINIT_REASON_LINKUP: + return "(LINKUP)"; + case OPA_LINKINIT_REASON_FLAPPING: + return "(FLAPPING)"; + case OPA_LINKINIT_OUTSIDE_POLICY: + return "(OUTSIDE_POLICY)"; + case OPA_LINKINIT_QUARANTINED: + return "(QUARANTINED)"; + case OPA_LINKINIT_INSUFIC_CAPABILITY: + return "(INSUFIC_CAPABILITY)"; + default: + break; + } + } + return ""; +} + +/* + * driver_pstate - convert the driver's notion of a port's + * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*). + * Return -1 (converted to a u32) to indicate error. + */ +u32 driver_pstate(struct hfi1_pportdata *ppd) +{ + switch (ppd->host_link_state) { + case HLS_UP_INIT: + case HLS_UP_ARMED: + case HLS_UP_ACTIVE: + return IB_PORTPHYSSTATE_LINKUP; + case HLS_DN_POLL: + return IB_PORTPHYSSTATE_POLLING; + case HLS_DN_DISABLE: + return IB_PORTPHYSSTATE_DISABLED; + case HLS_DN_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_VERIFY_CAP: + return IB_PORTPHYSSTATE_TRAINING; + case HLS_GOING_UP: + return IB_PORTPHYSSTATE_TRAINING; + case HLS_GOING_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_LINK_COOLDOWN: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_DN_DOWNDEF: + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +/* + * driver_lstate - convert the driver's notion of a port's + * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1 + * (converted to a u32) to indicate error. + */ +u32 driver_lstate(struct hfi1_pportdata *ppd) +{ + if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN)) + return IB_PORT_DOWN; + + switch (ppd->host_link_state & HLS_UP) { + case HLS_UP_INIT: + return IB_PORT_INIT; + case HLS_UP_ARMED: + return IB_PORT_ARMED; + case HLS_UP_ACTIVE: + return IB_PORT_ACTIVE; + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason) +{ + if (ppd->local_link_down_reason.latest == 0 && + ppd->neigh_link_down_reason.latest == 0) { + ppd->local_link_down_reason.latest = lcl_reason; + ppd->neigh_link_down_reason.latest = neigh_reason; + ppd->remote_link_down_reason = rem_reason; + } +} + +/** + * data_vls_operational() - Verify if data VL BCT credits and MTU + * are both set. + * @ppd: pointer to hfi1_pportdata structure + * + * Return: true - Ok, false -otherwise. + */ +static inline bool data_vls_operational(struct hfi1_pportdata *ppd) +{ + int i; + u64 reg; + + if (!ppd->actual_vls_operational) + return false; + + for (i = 0; i < ppd->vls_supported; i++) { + reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i)); + if ((reg && !ppd->dd->vld[i].mtu) || + (!reg && ppd->dd->vld[i].mtu)) + return false; + } + + return true; +} + +/* + * Change the physical and/or logical link state. + * + * Do not call this routine while inside an interrupt. It contains + * calls to routines that can take multiple seconds to finish. + * + * Returns 0 on success, -errno on failure. + */ +int set_link_state(struct hfi1_pportdata *ppd, u32 state) +{ + struct hfi1_devdata *dd = ppd->dd; + struct ib_event event = {.device = NULL}; + int ret1, ret = 0; + int orig_new_state, poll_bounce; + + mutex_lock(&ppd->hls_lock); + + orig_new_state = state; + if (state == HLS_DN_DOWNDEF) + state = HLS_DEFAULT; + + /* interpret poll -> poll as a link bounce */ + poll_bounce = ppd->host_link_state == HLS_DN_POLL && + state == HLS_DN_POLL; + + dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__, + link_state_name(ppd->host_link_state), + link_state_name(orig_new_state), + poll_bounce ? "(bounce) " : "", + link_state_reason_name(ppd, state)); + + /* + * If we're going to a (HLS_*) link state that implies the logical + * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then + * reset is_sm_config_started to 0. + */ + if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE))) + ppd->is_sm_config_started = 0; + + /* + * Do nothing if the states match. Let a poll to poll link bounce + * go through. + */ + if (ppd->host_link_state == state && !poll_bounce) + goto done; + + switch (state) { + case HLS_UP_INIT: + if (ppd->host_link_state == HLS_DN_POLL && + (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { + /* + * Quick link up jumps from polling to here. + * + * Whether in normal or loopback mode, the + * simulator jumps from polling to link up. + * Accept that here. + */ + /* OK */ + } else if (ppd->host_link_state != HLS_GOING_UP) { + goto unexpected; + } + + /* + * Wait for Link_Up physical state. + * Physical and Logical states should already be + * be transitioned to LinkUp and LinkInit respectively. + */ + ret = wait_physical_linkstate(ppd, PLS_LINKUP, 1000); + if (ret) { + dd_dev_err(dd, + "%s: physical state did not change to LINK-UP\n", + __func__); + break; + } + + ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000); + if (ret) { + dd_dev_err(dd, + "%s: logical state did not change to INIT\n", + __func__); + break; + } + + /* clear old transient LINKINIT_REASON code */ + if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) + ppd->linkinit_reason = + OPA_LINKINIT_REASON_LINKUP; + + /* enable the port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + handle_linkup_change(dd, 1); + pio_kernel_linkup(dd); + + /* + * After link up, a new link width will have been set. + * Update the xmit counters with regards to the new + * link width. + */ + update_xmit_counters(ppd, ppd->link_width_active); + + ppd->host_link_state = HLS_UP_INIT; + update_statusp(ppd, IB_PORT_INIT); + break; + case HLS_UP_ARMED: + if (ppd->host_link_state != HLS_UP_INIT) + goto unexpected; + + if (!data_vls_operational(ppd)) { + dd_dev_err(dd, + "%s: Invalid data VL credits or mtu\n", + __func__); + ret = -EINVAL; + break; + } + + set_logical_state(dd, LSTATE_ARMED); + ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000); + if (ret) { + dd_dev_err(dd, + "%s: logical state did not change to ARMED\n", + __func__); + break; + } + ppd->host_link_state = HLS_UP_ARMED; + update_statusp(ppd, IB_PORT_ARMED); + /* + * The simulator does not currently implement SMA messages, + * so neighbor_normal is not set. Set it here when we first + * move to Armed. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + ppd->neighbor_normal = 1; + break; + case HLS_UP_ACTIVE: + if (ppd->host_link_state != HLS_UP_ARMED) + goto unexpected; + + set_logical_state(dd, LSTATE_ACTIVE); + ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000); + if (ret) { + dd_dev_err(dd, + "%s: logical state did not change to ACTIVE\n", + __func__); + } else { + /* tell all engines to go running */ + sdma_all_running(dd); + ppd->host_link_state = HLS_UP_ACTIVE; + update_statusp(ppd, IB_PORT_ACTIVE); + + /* Signal the IB layer that the port has went active */ + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = IB_EVENT_PORT_ACTIVE; + } + break; + case HLS_DN_POLL: + if ((ppd->host_link_state == HLS_DN_DISABLE || + ppd->host_link_state == HLS_DN_OFFLINE) && + dd->dc_shutdown) + dc_start(dd); + /* Hand LED control to the DC */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); + + if (ppd->host_link_state != HLS_DN_OFFLINE) { + u8 tmp = ppd->link_enabled; + + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) { + ppd->link_enabled = tmp; + break; + } + ppd->remote_link_down_reason = 0; + + if (ppd->driver_link_ready) + ppd->link_enabled = 1; + } + + set_all_slowpath(ppd->dd); + ret = set_local_link_attributes(ppd); + if (ret) + break; + + ppd->port_error_action = 0; + + if (quick_linkup) { + /* quick linkup does not go into polling */ + ret = do_quick_linkup(dd); + } else { + ret1 = set_physical_link_state(dd, PLS_POLLING); + if (!ret1) + ret1 = wait_phys_link_out_of_offline(ppd, + 3000); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Polling link state, return 0x%x\n", + ret1); + ret = -EINVAL; + } + } + + /* + * Change the host link state after requesting DC8051 to + * change its physical state so that we can ignore any + * interrupt with stale LNI(XX) error, which will not be + * cleared until DC8051 transitions to Polling state. + */ + ppd->host_link_state = HLS_DN_POLL; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + /* + * If an error occurred above, go back to offline. The + * caller may reschedule another attempt. + */ + if (ret) + goto_offline(ppd, 0); + else + log_physical_state(ppd, PLS_POLLING); + break; + case HLS_DN_DISABLE: + /* link is disabled */ + ppd->link_enabled = 0; + + /* allow any state to transition to disabled */ + + /* must transition to offline first */ + if (ppd->host_link_state != HLS_DN_OFFLINE) { + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) + break; + ppd->remote_link_down_reason = 0; + } + + if (!dd->dc_shutdown) { + ret1 = set_physical_link_state(dd, PLS_DISABLED); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Disabled link state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ret = wait_physical_linkstate(ppd, PLS_DISABLED, 10000); + if (ret) { + dd_dev_err(dd, + "%s: physical state did not change to DISABLED\n", + __func__); + break; + } + dc_shutdown(dd); + } + ppd->host_link_state = HLS_DN_DISABLE; + break; + case HLS_DN_OFFLINE: + if (ppd->host_link_state == HLS_DN_DISABLE) + dc_start(dd); + + /* allow any state to transition to offline */ + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (!ret) + ppd->remote_link_down_reason = 0; + break; + case HLS_VERIFY_CAP: + if (ppd->host_link_state != HLS_DN_POLL) + goto unexpected; + ppd->host_link_state = HLS_VERIFY_CAP; + log_physical_state(ppd, PLS_CONFIGPHY_VERIFYCAP); + break; + case HLS_GOING_UP: + if (ppd->host_link_state != HLS_VERIFY_CAP) + goto unexpected; + + ret1 = set_physical_link_state(dd, PLS_LINKUP); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to link up state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ppd->host_link_state = HLS_GOING_UP; + break; + + case HLS_GOING_OFFLINE: /* transient within goto_offline() */ + case HLS_LINK_COOLDOWN: /* transient within goto_offline() */ + default: + dd_dev_info(dd, "%s: state 0x%x: not supported\n", + __func__, state); + ret = -EINVAL; + break; + } + + goto done; + +unexpected: + dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n", + __func__, link_state_name(ppd->host_link_state), + link_state_name(state)); + ret = -EINVAL; + +done: + mutex_unlock(&ppd->hls_lock); + + if (event.device) + ib_dispatch_event(&event); + + return ret; +} + +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) +{ + u64 reg; + int ret = 0; + + switch (which) { + case HFI1_IB_CFG_LIDLMC: + set_lidlmc(ppd); + break; + case HFI1_IB_CFG_VL_HIGH_LIMIT: + /* + * The VL Arbitrator high limit is sent in units of 4k + * bytes, while HFI stores it in units of 64 bytes. + */ + val *= 4096 / 64; + reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK) + << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT; + write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg); + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* HFI only supports POLL as the default link down state */ + if (val != HLS_DN_POLL) + ret = -EINVAL; + break; + case HFI1_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + if (!ppd->port) + ret = -EINVAL; + } + break; + /* + * For link width, link width downgrade, and speed enable, always AND + * the setting with what is actually supported. This has two benefits. + * First, enabled can't have unsupported values, no matter what the + * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean + * "fill in with your supported value" have all the bits in the + * field set, so simply ANDing with supported has the desired result. + */ + case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val & ppd->link_width_supported; + break; + case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */ + ppd->link_width_downgrade_enabled = + val & ppd->link_width_downgrade_supported; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + ppd->link_speed_enabled = val & ppd->link_speed_supported; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->overrun_threshold = val; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->phy_error_threshold = val; + break; + + case HFI1_IB_CFG_MTU: + set_send_length(ppd); + break; + + case HFI1_IB_CFG_PKEYS: + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + set_partition_keys(ppd); + break; + + default: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(ppd->dd, + "%s: which %s, val 0x%x: not implemented\n", + __func__, ib_cfg_name(which), val); + break; + } + return ret; +} + +/* begin functions related to vl arbitration table caching */ +static void init_vl_arb_caches(struct hfi1_pportdata *ppd) +{ + int i; + + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_LOW_PRIO_TABLE_SIZE); + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_HIGH_PRIO_TABLE_SIZE); + + /* + * Note that we always return values directly from the + * 'vl_arb_cache' (and do no CSR reads) in response to a + * 'Get(VLArbTable)'. This is obviously correct after a + * 'Set(VLArbTable)', since the cache will then be up to + * date. But it's also correct prior to any 'Set(VLArbTable)' + * since then both the cache, and the relevant h/w registers + * will be zeroed. + */ + + for (i = 0; i < MAX_PRIO_TABLE; i++) + spin_lock_init(&ppd->vl_arb_cache[i].lock); +} + +/* + * vl_arb_lock_cache + * + * All other vl_arb_* functions should be called only after locking + * the cache. + */ +static inline struct vl_arb_cache * +vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx) +{ + if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE) + return NULL; + spin_lock(&ppd->vl_arb_cache[idx].lock); + return &ppd->vl_arb_cache[idx]; +} + +static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx) +{ + spin_unlock(&ppd->vl_arb_cache[idx].lock); +} + +static void vl_arb_get_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static void vl_arb_set_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static int vl_arb_match_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +/* end functions related to vl arbitration table caching */ + +static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target, + u32 size, struct ib_vl_weight_elem *vl) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + unsigned int i, is_up = 0; + int drain, ret = 0; + + mutex_lock(&ppd->hls_lock); + + if (ppd->host_link_state & HLS_UP) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * Before adjusting VL arbitration weights, empty per-VL + * FIFOs, otherwise a packet whose VL weight is being + * set to 0 could get stuck in a FIFO with no chance to + * egress. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err( + dd, + "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n", + __func__); + goto err; + } + + for (i = 0; i < size; i++, vl++) { + /* + * NOTE: The low priority shift and mask are used here, but + * they are the same for both the low and high registers. + */ + reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK) + << SEND_LOW_PRIORITY_LIST_VL_SHIFT) + | (((u64)vl->weight + & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK) + << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT); + write_csr(dd, target + (i * 8), reg); + } + pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +/* + * Read one credit merge VL register. + */ +static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr, + struct vl_limit *vll) +{ + u64 reg = read_csr(dd, csr); + + vll->dedicated = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK); + vll->shared = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK); +} + +/* + * Read the current credit merge limits. + */ +static int get_buffer_control(struct hfi1_devdata *dd, + struct buffer_control *bc, u16 *overall_limit) +{ + u64 reg; + int i; + + /* not all entries are filled in */ + memset(bc, 0, sizeof(*bc)); + + /* OPA and HFI have a 1-1 mapping */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]); + + /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */ + read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]); + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + bc->overall_shared_limit = cpu_to_be16( + (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK); + if (overall_limit) + *overall_limit = (reg + >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK; + return sizeof(struct buffer_control); +} + +static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + u64 reg; + int i; + + /* each register contains 16 SC->VLnt mappings, 4 bits each */ + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[2 * i] = byte & 0xf; + dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4; + } + + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[16 + (2 * i)] = byte & 0xf; + dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4; + } + return sizeof(struct sc2vlnt); +} + +static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems, + struct ib_vl_weight_elem *vl) +{ + unsigned int i; + + for (i = 0; i < nelems; i++, vl++) { + vl->vl = 0xf; + vl->weight = 0; + } +} + +static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, + DC_SC_VL_VAL(15_0, + 0, dp->vlnt[0] & 0xf, + 1, dp->vlnt[1] & 0xf, + 2, dp->vlnt[2] & 0xf, + 3, dp->vlnt[3] & 0xf, + 4, dp->vlnt[4] & 0xf, + 5, dp->vlnt[5] & 0xf, + 6, dp->vlnt[6] & 0xf, + 7, dp->vlnt[7] & 0xf, + 8, dp->vlnt[8] & 0xf, + 9, dp->vlnt[9] & 0xf, + 10, dp->vlnt[10] & 0xf, + 11, dp->vlnt[11] & 0xf, + 12, dp->vlnt[12] & 0xf, + 13, dp->vlnt[13] & 0xf, + 14, dp->vlnt[14] & 0xf, + 15, dp->vlnt[15] & 0xf)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, + DC_SC_VL_VAL(31_16, + 16, dp->vlnt[16] & 0xf, + 17, dp->vlnt[17] & 0xf, + 18, dp->vlnt[18] & 0xf, + 19, dp->vlnt[19] & 0xf, + 20, dp->vlnt[20] & 0xf, + 21, dp->vlnt[21] & 0xf, + 22, dp->vlnt[22] & 0xf, + 23, dp->vlnt[23] & 0xf, + 24, dp->vlnt[24] & 0xf, + 25, dp->vlnt[25] & 0xf, + 26, dp->vlnt[26] & 0xf, + 27, dp->vlnt[27] & 0xf, + 28, dp->vlnt[28] & 0xf, + 29, dp->vlnt[29] & 0xf, + 30, dp->vlnt[30] & 0xf, + 31, dp->vlnt[31] & 0xf)); +} + +static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what, + u16 limit) +{ + if (limit != 0) + dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n", + what, (int)limit, idx); +} + +/* change only the shared limit portion of SendCmGLobalCredit */ +static void set_global_shared(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* change only the total credit limit portion of SendCmGLobalCredit */ +static void set_global_limit(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* set the given per-VL shared limit */ +static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* set the given per-VL dedicated limit */ +static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* spin until the given per-VL status mask bits clear */ +static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask, + const char *which) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT); + while (1) { + reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask; + + if (reg == 0) + return; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + udelay(1); + } + + dd_dev_err(dd, + "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n", + which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); + /* + * If this occurs, it is likely there was a credit loss on the link. + * The only recovery from that is a link bounce. + */ + dd_dev_err(dd, + "Continuing anyway. A credit loss may occur. Suggest a link bounce\n"); +} + +/* + * The number of credits on the VLs may be changed while everything + * is "live", but the following algorithm must be followed due to + * how the hardware is actually implemented. In particular, + * Return_Credit_Status[] is the only correct status check. + * + * if (reducing Global_Shared_Credit_Limit or any shared limit changing) + * set Global_Shared_Credit_Limit = 0 + * use_all_vl = 1 + * mask0 = all VLs that are changing either dedicated or shared limits + * set Shared_Limit[mask0] = 0 + * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0 + * if (changing any dedicated limit) + * mask1 = all VLs that are lowering dedicated limits + * lower Dedicated_Limit[mask1] + * spin until Return_Credit_Status[mask1] == 0 + * raise Dedicated_Limits + * raise Shared_Limits + * raise Global_Shared_Credit_Limit + * + * lower = if the new limit is lower, set the limit to the new value + * raise = if the new limit is higher than the current value (may be changed + * earlier in the algorithm), set the new limit to the new value + */ +int set_buffer_control(struct hfi1_pportdata *ppd, + struct buffer_control *new_bc) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 changing_mask, ld_mask, stat_mask; + int change_count; + int i, use_all_mask; + int this_shared_changing; + int vl_count = 0, ret; + /* + * A0: add the variable any_shared_limit_changing below and in the + * algorithm above. If removing A0 support, it can be removed. + */ + int any_shared_limit_changing; + struct buffer_control cur_bc; + u8 changing[OPA_MAX_VLS]; + u8 lowering_dedicated[OPA_MAX_VLS]; + u16 cur_total; + u32 new_total = 0; + const u64 all_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK; + +#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15) +#define NUM_USABLE_VLS 16 /* look at VL15 and less */ + + /* find the new total credits, do sanity check on unused VLs */ + for (i = 0; i < OPA_MAX_VLS; i++) { + if (valid_vl(i)) { + new_total += be16_to_cpu(new_bc->vl[i].dedicated); + continue; + } + nonzero_msg(dd, i, "dedicated", + be16_to_cpu(new_bc->vl[i].dedicated)); + nonzero_msg(dd, i, "shared", + be16_to_cpu(new_bc->vl[i].shared)); + new_bc->vl[i].dedicated = 0; + new_bc->vl[i].shared = 0; + } + new_total += be16_to_cpu(new_bc->overall_shared_limit); + + /* fetch the current values */ + get_buffer_control(dd, &cur_bc, &cur_total); + + /* + * Create the masks we will use. + */ + memset(changing, 0, sizeof(changing)); + memset(lowering_dedicated, 0, sizeof(lowering_dedicated)); + /* + * NOTE: Assumes that the individual VL bits are adjacent and in + * increasing order + */ + stat_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK; + changing_mask = 0; + ld_mask = 0; + change_count = 0; + any_shared_limit_changing = 0; + for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) { + if (!valid_vl(i)) + continue; + this_shared_changing = new_bc->vl[i].shared + != cur_bc.vl[i].shared; + if (this_shared_changing) + any_shared_limit_changing = 1; + if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated || + this_shared_changing) { + changing[i] = 1; + changing_mask |= stat_mask; + change_count++; + } + if (be16_to_cpu(new_bc->vl[i].dedicated) < + be16_to_cpu(cur_bc.vl[i].dedicated)) { + lowering_dedicated[i] = 1; + ld_mask |= stat_mask; + } + } + + /* bracket the credit change with a total adjustment */ + if (new_total > cur_total) + set_global_limit(dd, new_total); + + /* + * Start the credit change algorithm. + */ + use_all_mask = 0; + if ((be16_to_cpu(new_bc->overall_shared_limit) < + be16_to_cpu(cur_bc.overall_shared_limit)) || + (is_ax(dd) && any_shared_limit_changing)) { + set_global_shared(dd, 0); + cur_bc.overall_shared_limit = 0; + use_all_mask = 1; + } + + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (changing[i]) { + set_vl_shared(dd, i, 0); + cur_bc.vl[i].shared = 0; + } + } + + wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask, + "shared"); + + if (change_count > 0) { + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (lowering_dedicated[i]) { + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc-> + vl[i].dedicated)); + cur_bc.vl[i].dedicated = + new_bc->vl[i].dedicated; + } + } + + wait_for_vl_status_clear(dd, ld_mask, "dedicated"); + + /* now raise all dedicated that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].dedicated) > + be16_to_cpu(cur_bc.vl[i].dedicated)) + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc-> + vl[i].dedicated)); + } + } + + /* next raise all shared that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].shared) > + be16_to_cpu(cur_bc.vl[i].shared)) + set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared)); + } + + /* finally raise the global shared */ + if (be16_to_cpu(new_bc->overall_shared_limit) > + be16_to_cpu(cur_bc.overall_shared_limit)) + set_global_shared(dd, + be16_to_cpu(new_bc->overall_shared_limit)); + + /* bracket the credit change with a total adjustment */ + if (new_total < cur_total) + set_global_limit(dd, new_total); + + /* + * Determine the actual number of operational VLS using the number of + * dedicated and shared credits for each VL. + */ + if (change_count > 0) { + for (i = 0; i < TXE_NUM_DATA_VL; i++) + if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 || + be16_to_cpu(new_bc->vl[i].shared) > 0) + vl_count++; + ppd->actual_vls_operational = vl_count; + ret = sdma_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, + NULL); + if (ret == 0) + ret = pio_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, NULL); + if (ret) + return ret; + } + return 0; +} + +/* + * Read the given fabric manager table. Return the size of the + * table (in bytes) on success, and a negative error code on + * failure. + */ +int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t) + +{ + int size; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + case FM_TBL_VL_LOW_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + case FM_TBL_BUFFER_CONTROL: + size = get_buffer_control(ppd->dd, t, NULL); + break; + case FM_TBL_SC2VLNT: + size = get_sc2vlnt(ppd->dd, t); + break; + case FM_TBL_VL_PREEMPT_ELEMS: + size = 256; + /* OPA specifies 128 elements, of 2 bytes each */ + get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t); + break; + case FM_TBL_VL_PREEMPT_MATRIX: + size = 256; + /* + * OPA specifies that this is the same size as the VL + * arbitration tables (i.e., 256 bytes). + */ + break; + default: + return -EINVAL; + } + return size; +} + +/* + * Write the given fabric manager table. + */ +int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t) +{ + int ret = 0; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST, + VL_ARB_HIGH_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_VL_LOW_ARB: + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST, + VL_ARB_LOW_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_BUFFER_CONTROL: + ret = set_buffer_control(ppd, t); + break; + case FM_TBL_SC2VLNT: + set_sc2vlnt(ppd->dd, t); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/* + * Disable all data VLs. + * + * Return 0 if disabled, non-zero if the VLs cannot be disabled. + */ +static int disable_data_vls(struct hfi1_devdata *dd) +{ + if (is_ax(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_DISABLE); + + return 0; +} + +/* + * open_fill_data_vls() - the counterpart to stop_drain_data_vls(). + * Just re-enables all data VLs (the "fill" part happens + * automatically - the name was chosen for symmetry with + * stop_drain_data_vls()). + * + * Return 0 if successful, non-zero if the VLs cannot be enabled. + */ +int open_fill_data_vls(struct hfi1_devdata *dd) +{ + if (is_ax(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_ENABLE); + + return 0; +} + +/* + * drain_data_vls() - assumes that disable_data_vls() has been called, + * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA + * engines to drop to 0. + */ +static void drain_data_vls(struct hfi1_devdata *dd) +{ + sc_wait(dd); + sdma_wait(dd); + pause_for_credit_return(dd); +} + +/* + * stop_drain_data_vls() - disable, then drain all per-VL fifos. + * + * Use open_fill_data_vls() to resume using data VLs. This pair is + * meant to be used like this: + * + * stop_drain_data_vls(dd); + * // do things with per-VL resources + * open_fill_data_vls(dd); + */ +int stop_drain_data_vls(struct hfi1_devdata *dd) +{ + int ret; + + ret = disable_data_vls(dd); + if (ret == 0) + drain_data_vls(dd); + + return ret; +} + +/* + * Convert a nanosecond time to a cclock count. No matter how slow + * the cclock, a non-zero ns will always have a non-zero result. + */ +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns) +{ + u32 cclocks; + + if (dd->icode == ICODE_FPGA_EMULATION) + cclocks = (ns * 1000) / FPGA_CCLOCK_PS; + else /* simulation pretends to be ASIC */ + cclocks = (ns * 1000) / ASIC_CCLOCK_PS; + if (ns && !cclocks) /* if ns nonzero, must be at least 1 */ + cclocks = 1; + return cclocks; +} + +/* + * Convert a cclock count to nanoseconds. Not matter how slow + * the cclock, a non-zero cclocks will always have a non-zero result. + */ +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks) +{ + u32 ns; + + if (dd->icode == ICODE_FPGA_EMULATION) + ns = (cclocks * FPGA_CCLOCK_PS) / 1000; + else /* simulation pretends to be ASIC */ + ns = (cclocks * ASIC_CCLOCK_PS) / 1000; + if (cclocks && !ns) + ns = 1; + return ns; +} + +/* + * Dynamically adjust the receive interrupt timeout for a context based on + * incoming packet rate. + * + * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero. + */ +static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 timeout = rcd->rcvavail_timeout; + + /* + * This algorithm doubles or halves the timeout depending on whether + * the number of packets received in this interrupt were less than or + * greater equal the interrupt count. + * + * The calculations below do not allow a steady state to be achieved. + * Only at the endpoints it is possible to have an unchanging + * timeout. + */ + if (npkts < rcv_intr_count) { + /* + * Not enough packets arrived before the timeout, adjust + * timeout downward. + */ + if (timeout < 2) /* already at minimum? */ + return; + timeout >>= 1; + } else { + /* + * More than enough packets arrived before the timeout, adjust + * timeout upward. + */ + if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */ + return; + timeout = min(timeout << 1, dd->rcv_intr_timeout_csr); + } + + rcd->rcvavail_timeout = timeout; + /* + * timeout cannot be larger than rcv_intr_timeout_csr which has already + * been verified to be in range + */ + write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT, + (u64)timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); +} + +void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, + u32 intr_adjust, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u64 reg; + u32 ctxt = rcd->ctxt; + + /* + * Need to write timeout register before updating RcvHdrHead to ensure + * that a new value is used when the HW decides to restart counting. + */ + if (intr_adjust) + adjust_rcv_timeout(rcd, npkts); + if (updegr) { + reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK) + << RCV_EGR_INDEX_HEAD_HEAD_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg); + } + reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) | + (((u64)hd & RCV_HDR_HEAD_HEAD_MASK) + << RCV_HDR_HEAD_HEAD_SHIFT); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); +} + +u32 hdrqempty(struct hfi1_ctxtdata *rcd) +{ + u32 head, tail; + + head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) + & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT; + + if (hfi1_rcvhdrtail_kvaddr(rcd)) + tail = get_rcvhdrtail(rcd); + else + tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + + return head == tail; +} + +/* + * Context Control and Receive Array encoding for buffer size: + * 0x0 invalid + * 0x1 4 KB + * 0x2 8 KB + * 0x3 16 KB + * 0x4 32 KB + * 0x5 64 KB + * 0x6 128 KB + * 0x7 256 KB + * 0x8 512 KB (Receive Array only) + * 0x9 1 MB (Receive Array only) + * 0xa 2 MB (Receive Array only) + * + * 0xB-0xF - reserved (Receive Array only) + * + * + * This routine assumes that the value has already been sanity checked. + */ +static u32 encoded_size(u32 size) +{ + switch (size) { + case 4 * 1024: return 0x1; + case 8 * 1024: return 0x2; + case 16 * 1024: return 0x3; + case 32 * 1024: return 0x4; + case 64 * 1024: return 0x5; + case 128 * 1024: return 0x6; + case 256 * 1024: return 0x7; + case 512 * 1024: return 0x8; + case 1 * 1024 * 1024: return 0x9; + case 2 * 1024 * 1024: return 0xa; + } + return 0x1; /* if invalid, go with the minimum size */ +} + +/** + * encode_rcv_header_entry_size - return chip specific encoding for size + * @size: size in dwords + * + * Convert a receive header entry size that to the encoding used in the CSR. + * + * Return a zero if the given size is invalid, otherwise the encoding. + */ +u8 encode_rcv_header_entry_size(u8 size) +{ + /* there are only 3 valid receive header entry sizes */ + if (size == 2) + return 1; + if (size == 16) + return 2; + if (size == 32) + return 4; + return 0; /* invalid */ +} + +/** + * hfi1_validate_rcvhdrcnt - validate hdrcnt + * @dd: the device data + * @thecnt: the header count + */ +int hfi1_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt) +{ + if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { + dd_dev_err(dd, "Receive header queue count too small\n"); + return -EINVAL; + } + + if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { + dd_dev_err(dd, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); + return -EINVAL; + } + + if (thecnt % HDRQ_INCREMENT) { + dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n", + thecnt, HDRQ_INCREMENT); + return -EINVAL; + } + + return 0; +} + +/** + * set_hdrq_regs - set header queue registers for context + * @dd: the device data + * @ctxt: the context + * @entsize: the dword entry size + * @hdrcnt: the number of header entries + */ +void set_hdrq_regs(struct hfi1_devdata *dd, u8 ctxt, u8 entsize, u16 hdrcnt) +{ + u64 reg; + + reg = (((u64)hdrcnt >> HDRQ_SIZE_SHIFT) & RCV_HDR_CNT_CNT_MASK) << + RCV_HDR_CNT_CNT_SHIFT; + write_kctxt_csr(dd, ctxt, RCV_HDR_CNT, reg); + reg = ((u64)encode_rcv_header_entry_size(entsize) & + RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) << + RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; + write_kctxt_csr(dd, ctxt, RCV_HDR_ENT_SIZE, reg); + reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK) << + RCV_HDR_SIZE_HDR_SIZE_SHIFT; + write_kctxt_csr(dd, ctxt, RCV_HDR_SIZE, reg); + + /* + * Program dummy tail address for every receive context + * before enabling any receive context + */ + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_dma); +} + +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, + struct hfi1_ctxtdata *rcd) +{ + u64 rcvctrl, reg; + int did_enable = 0; + u16 ctxt; + + if (!rcd) + return; + + ctxt = rcd->ctxt; + + hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op); + + rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); + /* if the context already enabled, don't do the extra steps */ + if ((op & HFI1_RCVCTRL_CTXT_ENB) && + !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { + /* reset the tail and hdr addresses, and sequence count */ + write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, + rcd->rcvhdrq_dma); + if (hfi1_rcvhdrtail_kvaddr(rcd)) + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + rcd->rcvhdrqtailaddr_dma); + hfi1_set_seq_cnt(rcd, 1); + + /* reset the cached receive header queue head value */ + hfi1_set_rcd_head(rcd, 0); + + /* + * Zero the receive header queue so we don't get false + * positives when checking the sequence number. The + * sequence numbers could land exactly on the same spot. + * E.g. a rcd restart before the receive header wrapped. + */ + memset(rcd->rcvhdrq, 0, rcvhdrq_size(rcd)); + + /* starting timeout */ + rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr; + + /* enable the context */ + rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK; + + /* clean the egr buffer size first */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size) + & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK) + << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT; + + /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */ + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0); + did_enable = 1; + + /* zero RcvEgrIndexHead */ + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0); + + /* set eager count and base index */ + reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_CNT_MASK) + << RCV_EGR_CTRL_EGR_CNT_SHIFT) | + (((rcd->eager_base >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK) + << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg); + + /* + * Set TID (expected) count and base index. + * rcd->expected_count is set to individual RcvArray entries, + * not pairs, and the CSR takes a pair-count in groups of + * four, so divide by 8. + */ + reg = (((rcd->expected_count >> RCV_SHIFT) + & RCV_TID_CTRL_TID_PAIR_CNT_MASK) + << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) | + (((rcd->expected_base >> RCV_SHIFT) + & RCV_TID_CTRL_TID_BASE_INDEX_MASK) + << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg); + if (ctxt == HFI1_CTRL_CTXT) + write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT); + } + if (op & HFI1_RCVCTRL_CTXT_DIS) { + write_csr(dd, RCV_VL15, 0); + /* + * When receive context is being disabled turn on tail + * update with a dummy tail address and then disable + * receive context. + */ + if (dd->rcvhdrtail_dummy_dma) { + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_dma); + /* Enabling RcvCtxtCtrl.TailUpd is intentional. */ + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } + + rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; + } + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, true); + rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + } + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, false); + rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + } + if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && hfi1_rcvhdrtail_kvaddr(rcd)) + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_DIS) { + /* See comment on RcvCtxtCtrl.TailUpd above */ + if (!(op & HFI1_RCVCTRL_CTXT_DIS)) + rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } + if (op & HFI1_RCVCTRL_TIDFLOW_ENB) + rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_TIDFLOW_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) { + /* + * In one-packet-per-eager mode, the size comes from + * the RcvArray entry. + */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + } + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_URGENT_ENB) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, true); + if (op & HFI1_RCVCTRL_URGENT_DIS) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, false); + + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx", ctxt, rcvctrl); + write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl); + + /* work around sticky RcvCtxtStatus.BlockedRHQFull */ + if (did_enable && + (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) { + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + if (reg != 0) { + dd_dev_info(dd, "ctxt %d status %lld (blocked)\n", + ctxt, reg); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n", + ctxt, reg, reg == 0 ? "not" : "still"); + } + } + + if (did_enable) { + /* + * The interrupt timeout and count must be set after + * the context is enabled to take effect. + */ + /* set interrupt timeout */ + write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT, + (u64)rcd->rcvavail_timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); + + /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */ + reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + } + + if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS)) + /* + * If the context has been disabled and the Tail Update has + * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address + * so it doesn't contain an address that is invalid. + */ + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_dma); +} + +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = dd->cntrnameslen; + *namep = dd->cntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = (dd->ndevcntrs) * sizeof(u64); + + /* Get the start of the block of counters */ + *cntrp = dd->cntrs; + + /* + * Now go and fill in each counter in the block. + */ + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled"); + } else { + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, + dd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else if (entry->flags & CNTR_SDMA) { + hfi1_cdbg(CNTR, + "\t Per SDMA Engine"); + for (j = 0; j < chip_sdma_engines(dd); + j++) { + val = + entry->rw_cntr(entry, dd, j, + CNTR_MODE_R, 0); + hfi1_cdbg(CNTR, + "\t\tRead 0x%llx for %d", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else { + val = entry->rw_cntr(entry, dd, + CNTR_INVALID_VL, + CNTR_MODE_R, 0); + dd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + } + return ret; +} + +/* + * Used by sysfs to create files for hfi stats to read + */ +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = ppd->dd->portcntrnameslen; + *namep = ppd->dd->portcntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = ppd->dd->nportcntrs * sizeof(u64); + *cntrp = ppd->cntrs; + + for (i = 0; i < PORT_CNTR_LAST; i++) { + entry = &port_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled"); + continue; + } + + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, ppd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d", + val, j); + ppd->cntrs[entry->offset + j] = val; + } + } else { + val = entry->rw_cntr(entry, ppd, + CNTR_INVALID_VL, + CNTR_MODE_R, + 0); + ppd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + return ret; +} + +static void free_cntrs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + if (dd->synth_stats_timer.function) + del_timer_sync(&dd->synth_stats_timer); + cancel_work_sync(&dd->update_cntr_work); + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + kfree(ppd->cntrs); + kfree(ppd->scntrs); + free_percpu(ppd->ibport_data.rvp.rc_acks); + free_percpu(ppd->ibport_data.rvp.rc_qacks); + free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); + ppd->cntrs = NULL; + ppd->scntrs = NULL; + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_delayed_comp = NULL; + } + kfree(dd->portcntrnames); + dd->portcntrnames = NULL; + kfree(dd->cntrs); + dd->cntrs = NULL; + kfree(dd->scntrs); + dd->scntrs = NULL; + kfree(dd->cntrnames); + dd->cntrnames = NULL; + if (dd->update_cntr_wq) { + destroy_workqueue(dd->update_cntr_wq); + dd->update_cntr_wq = NULL; + } +} + +static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry, + u64 *psval, void *context, int vl) +{ + u64 val; + u64 sval = *psval; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0); + + /* If its a synthetic counter there is more work we need to do */ + if (entry->flags & CNTR_SYNTH) { + if (sval == CNTR_MAX) { + /* No need to read already saturated */ + return CNTR_MAX; + } + + if (entry->flags & CNTR_32BIT) { + /* 32bit counters can wrap multiple times */ + u64 upper = sval >> 32; + u64 lower = (sval << 32) >> 32; + + if (lower > val) { /* hw wrapped */ + if (upper == CNTR_32BIT_MAX) + val = CNTR_MAX; + else + upper++; + } + + if (val != CNTR_MAX) + val = (upper << 32) | val; + + } else { + /* If we rolled we are saturated */ + if ((val < sval) || (val > CNTR_MAX)) + val = CNTR_MAX; + } + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +static u64 write_dev_port_cntr(struct hfi1_devdata *dd, + struct cntr_entry *entry, + u64 *psval, void *context, int vl, u64 data) +{ + u64 val; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + if (entry->flags & CNTR_SYNTH) { + *psval = data; + if (entry->flags & CNTR_32BIT) { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + (data << 32) >> 32); + val = data; /* return the full 64bit value */ + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + data); + } + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data); + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return read_dev_port_cntr(dd, entry, sval, dd, vl); +} + +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return write_dev_port_cntr(dd, entry, sval, dd, vl, data); +} + +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl); +} + +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data); +} + +static void do_update_synth_timer(struct work_struct *work) +{ + u64 cur_tx; + u64 cur_rx; + u64 total_flits; + u8 update = 0; + int i, j, vl; + struct hfi1_pportdata *ppd; + struct cntr_entry *entry; + struct hfi1_devdata *dd = container_of(work, struct hfi1_devdata, + update_cntr_work); + + /* + * Rather than keep beating on the CSRs pick a minimal set that we can + * check to watch for potential roll over. We can do this by looking at + * the number of flits sent/recv. If the total flits exceeds 32bits then + * we have to iterate all the counters and update. + */ + entry = &dev_cntrs[C_DC_RCV_FLITS]; + cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + hfi1_cdbg( + CNTR, + "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx", + dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx); + + if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) { + /* + * May not be strictly necessary to update but it won't hurt and + * simplifies the logic here. + */ + update = 1; + hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating", + dd->unit); + } else { + total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx); + hfi1_cdbg(CNTR, + "[%d] total flits 0x%llx limit 0x%llx", dd->unit, + total_flits, (u64)CNTR_32BIT_MAX); + if (total_flits >= CNTR_32BIT_MAX) { + hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating", + dd->unit); + update = 1; + } + } + + if (update) { + hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit); + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_dev_cntr(dd, i, vl); + } else { + read_dev_cntr(dd, i, CNTR_INVALID_VL); + } + } + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + for (j = 0; j < PORT_CNTR_LAST; j++) { + entry = &port_cntrs[j]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_port_cntr(ppd, j, vl); + } else { + read_port_cntr(ppd, j, CNTR_INVALID_VL); + } + } + } + + /* + * We want the value in the register. The goal is to keep track + * of the number of "ticks" not the counter value. In other + * words if the register rolls we want to notice it and go ahead + * and force an update. + */ + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_RCV_FLITS]; + dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx", + dd->unit, dd->last_tx, dd->last_rx); + + } else { + hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit); + } +} + +static void update_synth_timer(struct timer_list *t) +{ + struct hfi1_devdata *dd = from_timer(dd, t, synth_stats_timer); + + queue_work(dd->update_cntr_wq, &dd->update_cntr_work); + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); +} + +#define C_MAX_NAME 16 /* 15 chars + one for /0 */ +static int init_cntrs(struct hfi1_devdata *dd) +{ + int i, rcv_ctxts, j; + size_t sz; + char *p; + char name[C_MAX_NAME]; + struct hfi1_pportdata *ppd; + const char *bit_type_32 = ",32"; + const int bit_type_32_sz = strlen(bit_type_32); + u32 sdma_engines = chip_sdma_engines(dd); + + /* set up the stats timer; the add_timer is done at the end */ + timer_setup(&dd->synth_stats_timer, update_synth_timer, 0); + + /***********************/ + /* per device counters */ + /***********************/ + + /* size names and determine how many we have*/ + dd->ndevcntrs = 0; + sz = 0; + + for (i = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name); + continue; + } + + if (dev_cntrs[i].flags & CNTR_VL) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else { + /* +1 for newline. */ + sz += strlen(dev_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + dev_cntrs[i].offset = dd->ndevcntrs; + dd->ndevcntrs++; + } + } + + /* allocate space for the counter values */ + dd->cntrs = kcalloc(dd->ndevcntrs + num_driver_cntrs, sizeof(u64), + GFP_KERNEL); + if (!dd->cntrs) + goto bail; + + dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); + if (!dd->scntrs) + goto bail; + + /* allocate space for the counter names */ + dd->cntrnameslen = sz; + dd->cntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->cntrnames) + goto bail; + + /* fill in the names */ + for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + /* Nothing */ + } else if (dev_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, + vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + for (j = 0; j < sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else { + memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name)); + p += strlen(dev_cntrs[i].name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } + + /*********************/ + /* per port counters */ + /*********************/ + + /* + * Go through the counters for the overflows and disable the ones we + * don't need. This varies based on platform so we need to do it + * dynamically here. + */ + rcv_ctxts = dd->num_rcv_contexts; + for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts; + i <= C_RCV_HDR_OVF_LAST; i++) { + port_cntrs[i].flags |= CNTR_DISABLED; + } + + /* size port counter names and determine how many we have*/ + sz = 0; + dd->nportcntrs = 0; + for (i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name); + continue; + } + + if (port_cntrs[i].flags & CNTR_VL) { + port_cntrs[i].offset = dd->nportcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->nportcntrs++; + } + } else { + /* +1 for newline */ + sz += strlen(port_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + port_cntrs[i].offset = dd->nportcntrs; + dd->nportcntrs++; + } + } + + /* allocate space for the counter names */ + dd->portcntrnameslen = sz; + dd->portcntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->portcntrnames) + goto bail; + + /* fill in port cntr names */ + for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) + continue; + + if (port_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else { + memcpy(p, port_cntrs[i].name, + strlen(port_cntrs[i].name)); + p += strlen(port_cntrs[i].name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } + + /* allocate per port storage for counter values */ + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->cntrs) + goto bail; + + ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->scntrs) + goto bail; + } + + /* CPU counters need to be allocated and zeroed */ + if (init_cpu_counters(dd)) + goto bail; + + dd->update_cntr_wq = alloc_ordered_workqueue("hfi1_update_cntr_%d", + WQ_MEM_RECLAIM, dd->unit); + if (!dd->update_cntr_wq) + goto bail; + + INIT_WORK(&dd->update_cntr_work, do_update_synth_timer); + + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); + return 0; +bail: + free_cntrs(dd); + return -ENOMEM; +} + +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate) +{ + switch (chip_lstate) { + case LSTATE_DOWN: + return IB_PORT_DOWN; + case LSTATE_INIT: + return IB_PORT_INIT; + case LSTATE_ARMED: + return IB_PORT_ARMED; + case LSTATE_ACTIVE: + return IB_PORT_ACTIVE; + default: + dd_dev_err(dd, + "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n", + chip_lstate); + return IB_PORT_DOWN; + } +} + +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) +{ + /* look at the HFI meta-states only */ + switch (chip_pstate & 0xf0) { + case PLS_DISABLED: + return IB_PORTPHYSSTATE_DISABLED; + case PLS_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case PLS_POLLING: + return IB_PORTPHYSSTATE_POLLING; + case PLS_CONFIGPHY: + return IB_PORTPHYSSTATE_TRAINING; + case PLS_LINKUP: + return IB_PORTPHYSSTATE_LINKUP; + case PLS_PHYTEST: + return IB_PORTPHYSSTATE_PHY_TEST; + default: + dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n", + chip_pstate); + return IB_PORTPHYSSTATE_DISABLED; + } +} + +/* return the OPA port logical state name */ +const char *opa_lstate_name(u32 lstate) +{ + static const char * const port_logical_names[] = { + "PORT_NOP", + "PORT_DOWN", + "PORT_INIT", + "PORT_ARMED", + "PORT_ACTIVE", + "PORT_ACTIVE_DEFER", + }; + if (lstate < ARRAY_SIZE(port_logical_names)) + return port_logical_names[lstate]; + return "unknown"; +} + +/* return the OPA port physical state name */ +const char *opa_pstate_name(u32 pstate) +{ + static const char * const port_physical_names[] = { + "PHYS_NOP", + "reserved1", + "PHYS_POLL", + "PHYS_DISABLED", + "PHYS_TRAINING", + "PHYS_LINKUP", + "PHYS_LINK_ERR_RECOVER", + "PHYS_PHY_TEST", + "reserved8", + "PHYS_OFFLINE", + "PHYS_GANGED", + "PHYS_TEST", + }; + if (pstate < ARRAY_SIZE(port_physical_names)) + return port_physical_names[pstate]; + return "unknown"; +} + +/** + * update_statusp - Update userspace status flag + * @ppd: Port data structure + * @state: port state information + * + * Actual port status is determined by the host_link_state value + * in the ppd. + * + * host_link_state MUST be updated before updating the user space + * statusp. + */ +static void update_statusp(struct hfi1_pportdata *ppd, u32 state) +{ + /* + * Set port status flags in the page mapped into userspace + * memory. Do it here to ensure a reliable state - this is + * the only function called by all state handling code. + * Always set the flags due to the fact that the cache value + * might have been changed explicitly outside of this + * function. + */ + if (ppd->statusp) { + switch (state) { + case IB_PORT_DOWN: + case IB_PORT_INIT: + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + break; + case IB_PORT_ARMED: + *ppd->statusp |= HFI1_STATUS_IB_CONF; + break; + case IB_PORT_ACTIVE: + *ppd->statusp |= HFI1_STATUS_IB_READY; + break; + } + } + dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", + opa_lstate_name(state), state); +} + +/** + * wait_logical_linkstate - wait for an IB link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for IB link state change to occur. + * For now, take the easy polling route. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + unsigned long timeout; + u32 new_state; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + new_state = chip_to_opa_lstate(ppd->dd, + read_logical_state(ppd->dd)); + if (new_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for link state 0x%x\n", + state); + return -ETIMEDOUT; + } + msleep(20); + } + + return 0; +} + +static void log_state_transition(struct hfi1_pportdata *ppd, u32 state) +{ + u32 ib_pstate = chip_to_opa_pstate(ppd->dd, state); + + dd_dev_info(ppd->dd, + "physical state changed to %s (0x%x), phy 0x%x\n", + opa_pstate_name(ib_pstate), ib_pstate, state); +} + +/* + * Read the physical hardware link state and check if it matches host + * drivers anticipated state. + */ +static void log_physical_state(struct hfi1_pportdata *ppd, u32 state) +{ + u32 read_state = read_physical_state(ppd->dd); + + if (read_state == state) { + log_state_transition(ppd, state); + } else { + dd_dev_err(ppd->dd, + "anticipated phy link state 0x%x, read 0x%x\n", + state, read_state); + } +} + +/* + * wait_physical_linkstate - wait for an physical link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for physical link state change to occur. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if (read_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link state 0x%x\n", + state); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, state); + return 0; +} + +/* + * wait_phys_link_offline_quiet_substates - wait for any offline substate + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) == PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + +/* + * wait_phys_link_out_of_offline - wait for any out of offline state + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any out of offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) != PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + +#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ +(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +#define SET_STATIC_RATE_CONTROL_SMASK(r) \ +(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +void hfi1_init_ctxt(struct send_context *sc) +{ + if (sc) { + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u8 set = (sc->type == SC_USER ? + HFI1_CAP_IS_USET(STATIC_RATE_CTRL) : + HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)); + reg = read_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE); + if (set) + CLEAR_STATIC_RATE_CONTROL_SMASK(reg); + else + SET_STATIC_RATE_CONTROL_SMASK(reg); + write_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE, reg); + } +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) +{ + int ret = 0; + u64 reg; + + if (dd->icode != ICODE_RTL_SILICON) { + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(dd, "%s: tempsense not supported by HW\n", + __func__); + return -EINVAL; + } + reg = read_csr(dd, ASIC_STS_THERM); + temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) & + ASIC_STS_THERM_CURR_TEMP_MASK); + temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) & + ASIC_STS_THERM_LO_TEMP_MASK); + temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) & + ASIC_STS_THERM_HI_TEMP_MASK); + temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) & + ASIC_STS_THERM_CRIT_TEMP_MASK); + /* triggers is a 3-bit value - 1 bit per trigger. */ + temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7); + + return ret; +} + +/* ========================================================================= */ + +/** + * read_mod_write() - Calculate the IRQ register index and set/clear the bits + * @dd: valid devdata + * @src: IRQ source to determine register index from + * @bits: the bits to set or clear + * @set: true == set the bits, false == clear the bits + * + */ +static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits, + bool set) +{ + u64 reg; + u16 idx = src / BITS_PER_REGISTER; + + spin_lock(&dd->irq_src_lock); + reg = read_csr(dd, CCE_INT_MASK + (8 * idx)); + if (set) + reg |= bits; + else + reg &= ~bits; + write_csr(dd, CCE_INT_MASK + (8 * idx), reg); + spin_unlock(&dd->irq_src_lock); +} + +/** + * set_intr_bits() - Enable/disable a range (one or more) IRQ sources + * @dd: valid devdata + * @first: first IRQ source to set/clear + * @last: last IRQ source (inclusive) to set/clear + * @set: true == set the bits, false == clear the bits + * + * If first == last, set the exact source. + */ +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) +{ + u64 bits = 0; + u64 bit; + u16 src; + + if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES) + return -EINVAL; + + if (last < first) + return -ERANGE; + + for (src = first; src <= last; src++) { + bit = src % BITS_PER_REGISTER; + /* wrapped to next register? */ + if (!bit && bits) { + read_mod_write(dd, src - 1, bits, set); + bits = 0; + } + bits |= BIT_ULL(bit); + } + read_mod_write(dd, last, bits, set); + + return 0; +} + +/* + * Clear all interrupt sources on the chip. + */ +void clear_all_interrupts(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0); + + write_csr(dd, CCE_ERR_CLEAR, ~(u64)0); + write_csr(dd, MISC_ERR_CLEAR, ~(u64)0); + write_csr(dd, RCV_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0); + for (i = 0; i < chip_send_contexts(dd); i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0); + for (i = 0; i < chip_sdma_engines(dd); i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0); + + write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0); + write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0); + write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); +} + +/* + * Remap the interrupt source from the general handler to the given MSI-X + * interrupt. + */ +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +{ + u64 reg; + int m, n; + + /* clear from the handled mask of the general interrupt */ + m = isrc / 64; + n = isrc % 64; + if (likely(m < CCE_NUM_INT_CSRS)) { + dd->gi_mask[m] &= ~((u64)1 << n); + } else { + dd_dev_err(dd, "remap interrupt err\n"); + return; + } + + /* direct the chip source to the given MSI-X interrupt */ + m = isrc / 8; + n = isrc % 8; + reg = read_csr(dd, CCE_INT_MAP + (8 * m)); + reg &= ~((u64)0xff << (8 * n)); + reg |= ((u64)msix_intr & 0xff) << (8 * n); + write_csr(dd, CCE_INT_MAP + (8 * m), reg); +} + +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr) +{ + /* + * SDMA engine interrupt sources grouped by type, rather than + * engine. Per-engine interrupts are as follows: + * SDMA + * SDMAProgress + * SDMAIdle + */ + remap_intr(dd, IS_SDMA_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr); +} + +/* + * Set the general handler to accept all interrupts, remap all + * chip interrupts back to MSI-X 0. + */ +void reset_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* all interrupts handled by the general handler */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + dd->gi_mask[i] = ~(u64)0; + + /* all chip interrupts map to MSI-X 0 */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8 * i), 0); +} + +/** + * set_up_interrupts() - Initialize the IRQ resources and state + * @dd: valid devdata + * + */ +static int set_up_interrupts(struct hfi1_devdata *dd) +{ + int ret; + + /* mask all interrupts */ + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + + /* clear all pending interrupts */ + clear_all_interrupts(dd); + + /* reset general handler mask, chip MSI-X mappings */ + reset_interrupts(dd); + + /* ask for MSI-X interrupts */ + ret = msix_initialize(dd); + if (ret) + return ret; + + ret = msix_request_irqs(dd); + if (ret) + msix_clean_up_interrupts(dd); + + return ret; +} + +/* + * Set up context values in dd. Sets: + * + * num_rcv_contexts - number of contexts being used + * n_krcv_queues - number of kernel contexts + * first_dyn_alloc_ctxt - first dynamically allocated context + * in array of contexts + * freectxts - number of free user contexts + * num_send_contexts - number of PIO send contexts being used + * num_netdev_contexts - number of contexts reserved for netdev + */ +static int set_up_context_variables(struct hfi1_devdata *dd) +{ + unsigned long num_kernel_contexts; + u16 num_netdev_contexts; + int ret; + unsigned ngroups; + int rmt_count; + u32 n_usr_ctxts; + u32 send_contexts = chip_send_contexts(dd); + u32 rcv_contexts = chip_rcv_contexts(dd); + + /* + * Kernel receive contexts: + * - Context 0 - control context (VL15/multicast/error) + * - Context 1 - first kernel context + * - Context 2 - second kernel context + * ... + */ + if (n_krcvqs) + /* + * n_krcvqs is the sum of module parameter kernel receive + * contexts, krcvqs[]. It does not include the control + * context, so add that. + */ + num_kernel_contexts = n_krcvqs + 1; + else + num_kernel_contexts = DEFAULT_KRCVQS + 1; + /* + * Every kernel receive context needs an ACK send context. + * one send context is allocated for each VL{0-7} and VL15 + */ + if (num_kernel_contexts > (send_contexts - num_vls - 1)) { + dd_dev_err(dd, + "Reducing # kernel rcv contexts to: %d, from %lu\n", + send_contexts - num_vls - 1, + num_kernel_contexts); + num_kernel_contexts = send_contexts - num_vls - 1; + } + + /* + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative + */ + if (num_user_contexts < 0) + n_usr_ctxts = cpumask_weight(&node_affinity.real_cpu_mask); + else + n_usr_ctxts = num_user_contexts; + /* + * Adjust the counts given a global max. + */ + if (num_kernel_contexts + n_usr_ctxts > rcv_contexts) { + dd_dev_err(dd, + "Reducing # user receive contexts to: %u, from %u\n", + (u32)(rcv_contexts - num_kernel_contexts), + n_usr_ctxts); + /* recalculate */ + n_usr_ctxts = rcv_contexts - num_kernel_contexts; + } + + num_netdev_contexts = + hfi1_num_netdev_contexts(dd, rcv_contexts - + (num_kernel_contexts + n_usr_ctxts), + &node_affinity.real_cpu_mask); + /* + * RMT entries are allocated as follows: + * 1. QOS (0 to 128 entries) + * 2. FECN (num_kernel_context - 1 [a] + num_user_contexts + + * num_netdev_contexts [b]) + * 3. netdev (NUM_NETDEV_MAP_ENTRIES) + * + * Notes: + * [a] Kernel contexts (except control) are included in FECN if kernel + * TID_RDMA is active. + * [b] Netdev and user contexts are randomly allocated from the same + * context pool, so FECN must cover all contexts in the pool. + */ + rmt_count = qos_rmt_entries(num_kernel_contexts - 1, NULL, NULL) + + (HFI1_CAP_IS_KSET(TID_RDMA) ? num_kernel_contexts - 1 + : 0) + + n_usr_ctxts + + num_netdev_contexts + + NUM_NETDEV_MAP_ENTRIES; + if (rmt_count > NUM_MAP_ENTRIES) { + int over = rmt_count - NUM_MAP_ENTRIES; + /* try to squish user contexts, minimum of 1 */ + if (over >= n_usr_ctxts) { + dd_dev_err(dd, "RMT overflow: reduce the requested number of contexts\n"); + return -EINVAL; + } + dd_dev_err(dd, "RMT overflow: reducing # user contexts from %u to %u\n", + n_usr_ctxts, n_usr_ctxts - over); + n_usr_ctxts -= over; + } + + /* the first N are kernel contexts, the rest are user/netdev contexts */ + dd->num_rcv_contexts = + num_kernel_contexts + n_usr_ctxts + num_netdev_contexts; + dd->n_krcv_queues = num_kernel_contexts; + dd->first_dyn_alloc_ctxt = num_kernel_contexts; + dd->num_netdev_contexts = num_netdev_contexts; + dd->num_user_contexts = n_usr_ctxts; + dd->freectxts = n_usr_ctxts; + dd_dev_info(dd, + "rcv contexts: chip %d, used %d (kernel %d, netdev %u, user %u)\n", + rcv_contexts, + (int)dd->num_rcv_contexts, + (int)dd->n_krcv_queues, + dd->num_netdev_contexts, + dd->num_user_contexts); + + /* + * Receive array allocation: + * All RcvArray entries are divided into groups of 8. This + * is required by the hardware and will speed up writes to + * consecutive entries by using write-combining of the entire + * cacheline. + * + * The number of groups are evenly divided among all contexts. + * any left over groups will be given to the first N user + * contexts. + */ + dd->rcv_entries.group_size = RCV_INCREMENT; + ngroups = chip_rcv_array_count(dd) / dd->rcv_entries.group_size; + dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts; + dd->rcv_entries.nctxt_extra = ngroups - + (dd->num_rcv_contexts * dd->rcv_entries.ngroups); + dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n", + dd->rcv_entries.ngroups, + dd->rcv_entries.nctxt_extra); + if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size > + MAX_EAGER_ENTRIES * 2) { + dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) / + dd->rcv_entries.group_size; + dd_dev_info(dd, + "RcvArray group count too high, change to %u\n", + dd->rcv_entries.ngroups); + dd->rcv_entries.nctxt_extra = 0; + } + /* + * PIO send contexts + */ + ret = init_sc_pools_and_sizes(dd); + if (ret >= 0) { /* success */ + dd->num_send_contexts = ret; + dd_dev_info( + dd, + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", + send_contexts, + dd->num_send_contexts, + dd->sc_sizes[SC_KERNEL].count, + dd->sc_sizes[SC_ACK].count, + dd->sc_sizes[SC_USER].count, + dd->sc_sizes[SC_VL15].count); + ret = 0; /* success */ + } + + return ret; +} + +/* + * Set the device/port partition key table. The MAD code + * will ensure that, at least, the partial management + * partition key is present in the table. + */ +static void set_partition_keys(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg = 0; + int i; + + dd_dev_info(dd, "Setting partition keys\n"); + for (i = 0; i < hfi1_get_npkeys(dd); i++) { + reg |= (ppd->pkeys[i] & + RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) << + ((i % 4) * + RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT); + /* Each register holds 4 PKey values. */ + if ((i % 4) == 3) { + write_csr(dd, RCV_PARTITION_KEY + + ((i - 3) * 2), reg); + reg = 0; + } + } + + /* Always enable HW pkeys check when pkeys table is set */ + add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK); +} + +/* + * These CSRs and memories are uninitialized on reset and must be + * written before reading to set the ECC/parity bits. + * + * NOTE: All user context CSRs that are not mmaped write-only + * (e.g. the TID flows) must be initialized even if the driver never + * reads them. + */ +static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) +{ + int i, j; + + /* CceIntMap */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8 * i), 0); + + /* SendCtxtCreditReturnAddr */ + for (i = 0; i < chip_send_contexts(dd); i++) + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + + /* PIO Send buffers */ + /* SDMA Send buffers */ + /* + * These are not normally read, and (presently) have no method + * to be read, so are not pre-initialized + */ + + /* RcvHdrAddr */ + /* RcvHdrTailAddr */ + /* RcvTidFlowTable */ + for (i = 0; i < chip_rcv_contexts(dd); i++) { + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) + write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0); + } + + /* RcvArray */ + for (i = 0; i < chip_rcv_array_count(dd); i++) + hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0); + + /* RcvQPMapTable */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); +} + +/* + * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus. + */ +static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits, + u64 ctrl_bits) +{ + unsigned long timeout; + u64 reg; + + /* is the condition present? */ + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + + /* clear the condition */ + write_csr(dd, CCE_CTRL, ctrl_bits); + + /* wait for the condition to clear */ + timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n", + status_bits, reg & status_bits); + return; + } + udelay(1); + } +} + +/* set CCE CSRs to chip reset defaults */ +static void reset_cce_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* CCE_REVISION read-only */ + /* CCE_REVISION2 read-only */ + /* CCE_CTRL - bits clear automatically */ + /* CCE_STATUS read-only, use CceCtrl to clear */ + clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK); + clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK); + clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK); + for (i = 0; i < CCE_NUM_SCRATCH; i++) + write_csr(dd, CCE_SCRATCH + (8 * i), 0); + /* CCE_ERR_STATUS read-only */ + write_csr(dd, CCE_ERR_MASK, 0); + write_csr(dd, CCE_ERR_CLEAR, ~0ull); + /* CCE_ERR_FORCE leave alone */ + for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0); + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR); + /* CCE_PCIE_CTRL leave alone */ + for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) { + write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0); + write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i), + CCE_MSIX_TABLE_UPPER_RESETCSR); + } + for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) { + /* CCE_MSIX_PBA read-only */ + write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull); + write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull); + } + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP, 0); + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + /* CCE_INT_STATUS read-only */ + write_csr(dd, CCE_INT_MASK + (8 * i), 0); + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull); + /* CCE_INT_FORCE leave alone */ + /* CCE_INT_BLOCKED read-only */ + } + for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++) + write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0); +} + +/* set MISC CSRs to chip reset defaults */ +static void reset_misc_csrs(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < 32; i++) { + write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0); + } + /* + * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can + * only be written 128-byte chunks + */ + /* init RSA engine to clear lingering errors */ + write_csr(dd, MISC_CFG_RSA_CMD, 1); + write_csr(dd, MISC_CFG_RSA_MU, 0); + write_csr(dd, MISC_CFG_FW_CTRL, 0); + /* MISC_STS_8051_DIGEST read-only */ + /* MISC_STS_SBM_DIGEST read-only */ + /* MISC_STS_PCIE_DIGEST read-only */ + /* MISC_STS_FAB_DIGEST read-only */ + /* MISC_ERR_STATUS read-only */ + write_csr(dd, MISC_ERR_MASK, 0); + write_csr(dd, MISC_ERR_CLEAR, ~0ull); + /* MISC_ERR_FORCE leave alone */ +} + +/* set TXE CSRs to chip reset defaults */ +static void reset_txe_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* + * TXE Kernel CSRs + */ + write_csr(dd, SEND_CTRL, 0); + __cm_reset(dd, 0); /* reset CM internal state */ + /* SEND_CONTEXTS read-only */ + /* SEND_DMA_ENGINES read-only */ + /* SEND_PIO_MEM_SIZE read-only */ + /* SEND_DMA_MEM_SIZE read-only */ + write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0); + pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */ + /* SEND_PIO_ERR_STATUS read-only */ + write_csr(dd, SEND_PIO_ERR_MASK, 0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull); + /* SEND_PIO_ERR_FORCE leave alone */ + /* SEND_DMA_ERR_STATUS read-only */ + write_csr(dd, SEND_DMA_ERR_MASK, 0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull); + /* SEND_DMA_ERR_FORCE leave alone */ + /* SEND_EGRESS_ERR_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_MASK, 0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull); + /* SEND_EGRESS_ERR_FORCE leave alone */ + write_csr(dd, SEND_BTH_QP, 0); + write_csr(dd, SEND_STATIC_RATE_CONTROL, 0); + write_csr(dd, SEND_SC2VLT0, 0); + write_csr(dd, SEND_SC2VLT1, 0); + write_csr(dd, SEND_SC2VLT2, 0); + write_csr(dd, SEND_SC2VLT3, 0); + write_csr(dd, SEND_LEN_CHECK0, 0); + write_csr(dd, SEND_LEN_CHECK1, 0); + /* SEND_ERR_STATUS read-only */ + write_csr(dd, SEND_ERR_MASK, 0); + write_csr(dd, SEND_ERR_CLEAR, ~0ull); + /* SEND_ERR_FORCE read-only */ + for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < chip_send_contexts(dd) / NUM_CONTEXTS_PER_SET; i++) + write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0); + for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0); + write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR); + /* SEND_CM_CREDIT_USED_STATUS read-only */ + write_csr(dd, SEND_CM_TIMER_CTRL, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0); + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + /* SEND_CM_CREDIT_USED_VL read-only */ + /* SEND_CM_CREDIT_USED_VL15 read-only */ + /* SEND_EGRESS_CTXT_STATUS read-only */ + /* SEND_EGRESS_SEND_DMA_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull); + /* SEND_EGRESS_ERR_INFO read-only */ + /* SEND_EGRESS_ERR_SOURCE read-only */ + + /* + * TXE Per-Context CSRs + */ + for (i = 0; i < chip_send_contexts(dd); i++) { + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0); + } + + /* + * TXE Per-SDMA CSRs + */ + for (i = 0; i < chip_sdma_engines(dd); i++) { + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* SEND_DMA_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0); + write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0); + /* SEND_DMA_HEAD read-only */ + write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0); + /* SEND_DMA_IDLE_CNT read-only */ + write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0); + write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0); + /* SEND_DMA_DESC_FETCHED_CNT read-only */ + /* SEND_DMA_ENG_ERR_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull); + /* SEND_DMA_ENG_ERR_FORCE leave alone */ + write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0); + write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0); + } +} + +/* + * Expect on entry: + * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0 + */ +static void init_rbufs(struct hfi1_devdata *dd) +{ + u64 reg; + int count; + + /* + * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are + * clear. + */ + count = 0; + while (1) { + reg = read_csr(dd, RCV_STATUS); + if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK + | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0) + break; + /* + * Give up after 1ms - maximum wait time. + * + * RBuf size is 136KiB. Slowest possible is PCIe Gen1 x1 at + * 250MB/s bandwidth. Lower rate to 66% for overhead to get: + * 136 KB / (66% * 250MB/s) = 844us + */ + if (count++ > 500) { + dd_dev_err(dd, + "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n", + __func__, reg); + break; + } + udelay(2); /* do not busy-wait the CSR */ + } + + /* start the init - expect RcvCtrl to be 0 */ + write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK); + + /* + * Read to force the write of Rcvtrl.RxRbufInit. There is a brief + * period after the write before RcvStatus.RxRbufInitDone is valid. + * The delay in the first run through the loop below is sufficient and + * required before the first read of RcvStatus.RxRbufInintDone. + */ + read_csr(dd, RCV_CTRL); + + /* wait for the init to finish */ + count = 0; + while (1) { + /* delay is required first time through - see above */ + udelay(2); /* do not busy-wait the CSR */ + reg = read_csr(dd, RCV_STATUS); + if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK)) + break; + + /* give up after 100us - slowest possible at 33MHz is 73us */ + if (count++ > 50) { + dd_dev_err(dd, + "%s: RcvStatus.RxRbufInit not set, continuing\n", + __func__); + break; + } + } +} + +/* set RXE CSRs to chip reset defaults */ +static void reset_rxe_csrs(struct hfi1_devdata *dd) +{ + int i, j; + + /* + * RXE Kernel CSRs + */ + write_csr(dd, RCV_CTRL, 0); + init_rbufs(dd); + /* RCV_STATUS read-only */ + /* RCV_CONTEXTS read-only */ + /* RCV_ARRAY_CNT read-only */ + /* RCV_BUF_SIZE read-only */ + write_csr(dd, RCV_BTH_QP, 0); + write_csr(dd, RCV_MULTICAST, 0); + write_csr(dd, RCV_BYPASS, 0); + write_csr(dd, RCV_VL15, 0); + /* this is a clear-down */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + /* RCV_ERR_STATUS read-only */ + write_csr(dd, RCV_ERR_MASK, 0); + write_csr(dd, RCV_ERR_CLEAR, ~0ull); + /* RCV_ERR_FORCE leave alone */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); + for (i = 0; i < 4; i++) + write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0); + for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0); + for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) + clear_rsm_rule(dd, i); + for (i = 0; i < 32; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0); + + /* + * RXE Kernel and User Per-Context CSRs + */ + for (i = 0; i < chip_rcv_contexts(dd); i++) { + /* kernel */ + write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0); + /* RCV_CTXT_STATUS read-only */ + write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0); + write_kctxt_csr(dd, i, RCV_TID_CTRL, 0); + write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0); + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_CNT, 0); + write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0); + write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0); + + /* user */ + /* RCV_HDR_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0); + /* RCV_EGR_INDEX_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0); + /* RCV_EGR_OFFSET_TAIL read-only */ + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) { + write_uctxt_csr(dd, i, + RCV_TID_FLOW_TABLE + (8 * j), 0); + } + } +} + +/* + * Set sc2vl tables. + * + * They power on to zeros, so to avoid send context errors + * they need to be set: + * + * SC 0-7 -> VL 0-7 (respectively) + * SC 15 -> VL 15 + * otherwise + * -> VL 0 + */ +static void init_sc2vl_tables(struct hfi1_devdata *dd) +{ + int i; + /* init per architecture spec, constrained by hardware capability */ + + /* HFI maps sent packets */ + write_csr(dd, SEND_SC2VLT0, SC2VL_VAL( + 0, + 0, 0, 1, 1, + 2, 2, 3, 3, + 4, 4, 5, 5, + 6, 6, 7, 7)); + write_csr(dd, SEND_SC2VLT1, SC2VL_VAL( + 1, + 8, 0, 9, 0, + 10, 0, 11, 0, + 12, 0, 13, 0, + 14, 0, 15, 15)); + write_csr(dd, SEND_SC2VLT2, SC2VL_VAL( + 2, + 16, 0, 17, 0, + 18, 0, 19, 0, + 20, 0, 21, 0, + 22, 0, 23, 0)); + write_csr(dd, SEND_SC2VLT3, SC2VL_VAL( + 3, + 24, 0, 25, 0, + 26, 0, 27, 0, + 28, 0, 29, 0, + 30, 0, 31, 0)); + + /* DC maps received packets */ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL( + 15_0, + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL( + 31_16, + 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0)); + + /* initialize the cached sc2vl values consistently with h/w */ + for (i = 0; i < 32; i++) { + if (i < 8 || i == 15) + *((u8 *)(dd->sc2vl) + i) = (u8)i; + else + *((u8 *)(dd->sc2vl) + i) = 0; + } +} + +/* + * Read chip sizes and then reset parts to sane, disabled, values. We cannot + * depend on the chip going through a power-on reset - a driver may be loaded + * and unloaded many times. + * + * Do not write any CSR values to the chip in this routine - there may be + * a reset following the (possible) FLR in this routine. + * + */ +static int init_chip(struct hfi1_devdata *dd) +{ + int i; + int ret = 0; + + /* + * Put the HFI CSRs in a known state. + * Combine this with a DC reset. + * + * Stop the device from doing anything while we do a + * reset. We know there are no other active users of + * the device since we are now in charge. Turn off + * off all outbound and inbound traffic and make sure + * the device does not generate any interrupts. + */ + + /* disable send contexts and SDMA engines */ + write_csr(dd, SEND_CTRL, 0); + for (i = 0; i < chip_send_contexts(dd); i++) + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + for (i = 0; i < chip_sdma_engines(dd); i++) + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* disable port (turn off RXE inbound traffic) and contexts */ + write_csr(dd, RCV_CTRL, 0); + for (i = 0; i < chip_rcv_contexts(dd); i++) + write_csr(dd, RCV_CTXT_CTRL, 0); + /* mask all interrupt sources */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + + /* + * DC Reset: do a full DC reset before the register clear. + * A recommended length of time to hold is one CSR read, + * so reread the CceDcCtrl. Then, hold the DC in reset + * across the clear. + */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void)read_csr(dd, CCE_DC_CTRL); + + if (use_flr) { + /* + * A FLR will reset the SPC core and part of the PCIe. + * The parts that need to be restored have already been + * saved. + */ + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + + /* do the FLR, the DC reset will remain */ + pcie_flr(dd->pcidev); + + /* restore command and BARs */ + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return ret; + } + + if (is_ax(dd)) { + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + pcie_flr(dd->pcidev); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return ret; + } + } + } else { + dd_dev_info(dd, "Resetting CSRs with writes\n"); + reset_cce_csrs(dd); + reset_txe_csrs(dd); + reset_rxe_csrs(dd); + reset_misc_csrs(dd); + } + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + + /* Set the LED off */ + setextled(dd, 0); + + /* + * Clear the QSFP reset. + * An FLR enforces a 0 on all out pins. The driver does not touch + * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and + * anything plugged constantly in reset, if it pays attention + * to RESET_N. + * Prime examples of this are optical cables. Set all pins high. + * I2CCLK and I2CDAT will change per direction, and INT_N and + * MODPRS_N are input only and their value is ignored. + */ + write_csr(dd, ASIC_QSFP1_OUT, 0x1f); + write_csr(dd, ASIC_QSFP2_OUT, 0x1f); + init_chip_resources(dd); + return ret; +} + +static void init_early_variables(struct hfi1_devdata *dd) +{ + int i; + + /* assign link credit variables */ + dd->vau = CM_VAU; + dd->link_credits = CM_GLOBAL_CREDITS; + if (is_ax(dd)) + dd->link_credits--; + dd->vcu = cu_to_vcu(hfi1_cu); + /* enough room for 8 MAD packets plus header - 17K */ + dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau); + if (dd->vl15_init > dd->link_credits) + dd->vl15_init = dd->link_credits; + + write_uninitialized_csrs_and_memories(dd); + + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_pportdata *ppd = &dd->pport[i]; + + set_partition_keys(ppd); + } + init_sc2vl_tables(dd); +} + +static void init_kdeth_qp(struct hfi1_devdata *dd) +{ + write_csr(dd, SEND_BTH_QP, + (RVT_KDETH_QP_PREFIX & SEND_BTH_QP_KDETH_QP_MASK) << + SEND_BTH_QP_KDETH_QP_SHIFT); + + write_csr(dd, RCV_BTH_QP, + (RVT_KDETH_QP_PREFIX & RCV_BTH_QP_KDETH_QP_MASK) << + RCV_BTH_QP_KDETH_QP_SHIFT); +} + +/** + * hfi1_get_qp_map - get qp map + * @dd: device data + * @idx: index to read + */ +u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx) +{ + u64 reg = read_csr(dd, RCV_QP_MAP_TABLE + (idx / 8) * 8); + + reg >>= (idx % 8) * 8; + return reg; +} + +/** + * init_qpmap_table - init qp map + * @dd: device data + * @first_ctxt: first context + * @last_ctxt: first context + * + * This return sets the qpn mapping table that + * is indexed by qpn[8:1]. + * + * The routine will round robin the 256 settings + * from first_ctxt to last_ctxt. + * + * The first/last looks ahead to having specialized + * receive contexts for mgmt and bypass. Normal + * verbs traffic will assumed to be on a range + * of receive contexts. + */ +static void init_qpmap_table(struct hfi1_devdata *dd, + u32 first_ctxt, + u32 last_ctxt) +{ + u64 reg = 0; + u64 regno = RCV_QP_MAP_TABLE; + int i; + u64 ctxt = first_ctxt; + + for (i = 0; i < 256; i++) { + reg |= ctxt << (8 * (i % 8)); + ctxt++; + if (ctxt > last_ctxt) + ctxt = first_ctxt; + if (i % 8 == 7) { + write_csr(dd, regno, reg); + reg = 0; + regno += 8; + } + } + + add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK + | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); +} + +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. + */ +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + +/* Is a receive side mapping rule */ +static bool has_rsm_rule(struct hfi1_devdata *dd, u8 rule_index) +{ + return read_csr(dd, RCV_RSM_CFG + (8 * rule_index)) != 0; +} + +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + +/* + * Clear a receive side mapping rule. + */ +static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), 0); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), 0); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), 0); +} + +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; + uint max_by_vl = 0; + + /* is QOS active at all? */ + if (n_krcv_queues < MIN_KERNEL_KCTXTS || + num_vls == 1 || + krcvqsset <= 1) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) + if (krcvqs[i] > max_by_vl) + max_by_vl = krcvqs[i]; + if (max_by_vl > 32) + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ + if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + +/** + * init_qos - init RX qos + * @dd: device data + * @rmt: RSM map table + * + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). + * + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. + */ +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + unsigned int rmt_entries; + u64 reg; + + if (!rmt) + goto bail; + rmt_entries = qos_rmt_entries(dd->n_krcv_queues - 1, &m, &n); + if (rmt_entries == 0) + goto bail; + qpns_per_vl = 1 << m; + + /* enough room in the map table? */ + rmt_entries = 1 << (m + n); + if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) + goto bail; + + /* add qos entries to the RSM map table */ + for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { + unsigned tctxt; + + for (qpn = 0, tctxt = ctxt; + krcvqs[i] && qpn < qpns_per_vl; qpn++) { + unsigned idx, regoff, regidx; + + /* generate the index the hardware will produce */ + idx = rmt->used + ((qpn << n) ^ i); + regoff = (idx % 8) * 8; + regidx = idx / 8; + /* replace default with context number */ + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK + << regoff); + reg |= (u64)(tctxt++) << regoff; + rmt->map[regidx] = reg; + if (tctxt == ctxt + krcvqs[i]) + tctxt = ctxt; + } + ctxt += krcvqs[i]; + } + + rrd.offset = rmt->used; + rrd.pkt_type = 2; + rrd.field1_off = LRH_BTH_MATCH_OFFSET; + rrd.field2_off = LRH_SC_MATCH_OFFSET; + rrd.index1_off = LRH_SC_SELECT_OFFSET; + rrd.index1_width = n; + rrd.index2_off = QPN_SELECT_OFFSET; + rrd.index2_width = m + n; + rrd.mask1 = LRH_BTH_MASK; + rrd.value1 = LRH_BTH_VALUE; + rrd.mask2 = LRH_SC_MASK; + rrd.value2 = LRH_SC_VALUE; + + /* add rule 0 */ + add_rsm_rule(dd, RSM_INS_VERBS, &rrd); + + /* mark RSM map entries as used */ + rmt->used += rmt_entries; + /* map everything else to the mcast/err/vl15 context */ + init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); + dd->qos_shift = n + 1; + return; +bail: + dd->qos_shift = 1; + init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); +} + +static void init_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + u64 reg; + int i, idx, regoff, regidx, start; + u8 offset; + u32 total_cnt; + + if (HFI1_CAP_IS_KSET(TID_RDMA)) + /* Exclude context 0 */ + start = 1; + else + start = dd->first_dyn_alloc_ctxt; + + total_cnt = dd->num_rcv_contexts - start; + + /* there needs to be enough room in the map table */ + if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n"); + return; + } + + /* + * RSM will extract the destination context as an index into the + * map table. The destination contexts are a sequential block + * in the range start...num_rcv_contexts-1 (inclusive). + * Map entries are accessed as offset + extracted value. Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start); + + for (i = start, idx = rmt->used; i < dd->num_rcv_contexts; + i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, RSM_INS_FECN, &rrd); + + rmt->used += total_cnt; +} + +static inline bool hfi1_is_rmt_full(int start, int spare) +{ + return (start + spare) > NUM_MAP_ENTRIES; +} + +static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) +{ + u8 i, j; + u8 ctx_id = 0; + u64 reg; + u32 regoff; + int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); + int ctxt_count = hfi1_netdev_ctxt_count(dd); + + /* We already have contexts mapped in RMT */ + if (has_rsm_rule(dd, RSM_INS_VNIC) || has_rsm_rule(dd, RSM_INS_AIP)) { + dd_dev_info(dd, "Contexts are already mapped in RMT\n"); + return true; + } + + if (hfi1_is_rmt_full(rmt_start, NUM_NETDEV_MAP_ENTRIES)) { + dd_dev_err(dd, "Not enough RMT entries used = %d\n", + rmt_start); + return false; + } + + dev_dbg(&(dd)->pcidev->dev, "RMT start = %d, end %d\n", + rmt_start, + rmt_start + NUM_NETDEV_MAP_ENTRIES); + + /* Update RSM mapping table, 32 regs, 256 entries - 1 ctx per byte */ + regoff = RCV_RSM_MAP_TABLE + (rmt_start / 8) * 8; + reg = read_csr(dd, regoff); + for (i = 0; i < NUM_NETDEV_MAP_ENTRIES; i++) { + /* Update map register with netdev context */ + j = (rmt_start + i) % 8; + reg &= ~(0xffllu << (j * 8)); + reg |= (u64)hfi1_netdev_get_ctxt(dd, ctx_id++)->ctxt << (j * 8); + /* Wrap up netdev ctx index */ + ctx_id %= ctxt_count; + /* Write back map register */ + if (j == 7 || ((i + 1) == NUM_NETDEV_MAP_ENTRIES)) { + dev_dbg(&(dd)->pcidev->dev, + "RMT[%d] =0x%llx\n", + regoff - RCV_RSM_MAP_TABLE, reg); + + write_csr(dd, regoff, reg); + regoff += 8; + if (i < (NUM_NETDEV_MAP_ENTRIES - 1)) + reg = read_csr(dd, regoff); + } + } + + return true; +} + +static void hfi1_enable_rsm_rule(struct hfi1_devdata *dd, + int rule, struct rsm_rule_data *rrd) +{ + if (!hfi1_netdev_update_rmt(dd)) { + dd_dev_err(dd, "Failed to update RMT for RSM%d rule\n", rule); + return; + } + + add_rsm_rule(dd, rule, rrd); + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); +} + +void hfi1_init_aip_rsm(struct hfi1_devdata *dd) +{ + /* + * go through with the initialisation only if this rule actually doesn't + * exist yet + */ + if (atomic_fetch_inc(&dd->ipoib_rsm_usr_num) == 0) { + int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); + struct rsm_rule_data rrd = { + .offset = rmt_start, + .pkt_type = IB_PACKET_TYPE, + .field1_off = LRH_BTH_MATCH_OFFSET, + .mask1 = LRH_BTH_MASK, + .value1 = LRH_BTH_VALUE, + .field2_off = BTH_DESTQP_MATCH_OFFSET, + .mask2 = BTH_DESTQP_MASK, + .value2 = BTH_DESTQP_VALUE, + .index1_off = DETH_AIP_SQPN_SELECT_OFFSET + + ilog2(NUM_NETDEV_MAP_ENTRIES), + .index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES), + .index2_off = DETH_AIP_SQPN_SELECT_OFFSET, + .index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES) + }; + + hfi1_enable_rsm_rule(dd, RSM_INS_AIP, &rrd); + } +} + +/* Initialize RSM for VNIC */ +void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) +{ + int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); + struct rsm_rule_data rrd = { + /* Add rule for vnic */ + .offset = rmt_start, + .pkt_type = 4, + /* Match 16B packets */ + .field1_off = L2_TYPE_MATCH_OFFSET, + .mask1 = L2_TYPE_MASK, + .value1 = L2_16B_VALUE, + /* Match ETH L4 packets */ + .field2_off = L4_TYPE_MATCH_OFFSET, + .mask2 = L4_16B_TYPE_MASK, + .value2 = L4_16B_ETH_VALUE, + /* Calc context from veswid and entropy */ + .index1_off = L4_16B_HDR_VESWID_OFFSET, + .index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES), + .index2_off = L2_16B_ENTROPY_OFFSET, + .index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES) + }; + + hfi1_enable_rsm_rule(dd, RSM_INS_VNIC, &rrd); +} + +void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) +{ + clear_rsm_rule(dd, RSM_INS_VNIC); +} + +void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd) +{ + /* only actually clear the rule if it's the last user asking to do so */ + if (atomic_fetch_add_unless(&dd->ipoib_rsm_usr_num, -1, 0) == 1) + clear_rsm_rule(dd, RSM_INS_AIP); +} + +static int init_rxe(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + u64 val; + + /* enable all receive errors */ + write_csr(dd, RCV_ERR_MASK, ~0ull); + + rmt = alloc_rsm_map_table(dd); + if (!rmt) + return -ENOMEM; + + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + init_fecn_handling(dd, rmt); + complete_rsm_map_table(dd, rmt); + /* record number of used rsm map entries for netdev */ + hfi1_netdev_set_free_rmt_idx(dd, rmt->used); + kfree(rmt); + + /* + * make sure RcvCtrl.RcvWcb <= PCIe Device Control + * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config + * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one + * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and + * Max_PayLoad_Size set to its minimum of 128. + * + * Presently, RcvCtrl.RcvWcb is not modified from its default of 0 + * (64 bytes). Max_Payload_Size is possibly modified upward in + * tune_pcie_caps() which is called after this routine. + */ + + /* Have 16 bytes (4DW) of bypass header available in header queue */ + val = read_csr(dd, RCV_BYPASS); + val &= ~RCV_BYPASS_HDR_SIZE_SMASK; + val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) << + RCV_BYPASS_HDR_SIZE_SHIFT); + write_csr(dd, RCV_BYPASS, val); + return 0; +} + +static void init_other(struct hfi1_devdata *dd) +{ + /* enable all CCE errors */ + write_csr(dd, CCE_ERR_MASK, ~0ull); + /* enable *some* Misc errors */ + write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK); + /* enable all DC errors, except LCB */ + write_csr(dd, DCC_ERR_FLG_EN, ~0ull); + write_csr(dd, DC_DC8051_ERR_EN, ~0ull); +} + +/* + * Fill out the given AU table using the given CU. A CU is defined in terms + * AUs. The table is a an encoding: given the index, how many AUs does that + * represent? + * + * NOTE: Assumes that the register layout is the same for the + * local and remote tables. + */ +static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu, + u32 csr0to3, u32 csr4to7) +{ + write_csr(dd, csr0to3, + 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT | + 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT | + 2ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT | + 4ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT); + write_csr(dd, csr4to7, + 8ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT | + 16ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT | + 32ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT | + 64ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT); +} + +static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3, + SEND_CM_LOCAL_AU_TABLE4_TO7); +} + +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3, + SEND_CM_REMOTE_AU_TABLE4_TO7); +} + +static void init_txe(struct hfi1_devdata *dd) +{ + int i; + + /* enable all PIO, SDMA, general, and Egress errors */ + write_csr(dd, SEND_PIO_ERR_MASK, ~0ull); + write_csr(dd, SEND_DMA_ERR_MASK, ~0ull); + write_csr(dd, SEND_ERR_MASK, ~0ull); + write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull); + + /* enable all per-context and per-SDMA engine errors */ + for (i = 0; i < chip_send_contexts(dd); i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull); + for (i = 0; i < chip_sdma_engines(dd); i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull); + + /* set the local CU to AU mapping */ + assign_local_cm_au_table(dd, dd->vcu); + + /* + * Set reasonable default for Credit Return Timer + * Don't set on Simulator - causes it to choke. + */ + if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE); +} + +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 jkey) +{ + u8 hw_ctxt; + u64 reg; + + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */ + ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT); + /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */ + if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY)) + reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, reg); + /* + * Enable send-side J_KEY integrity check, unless this is A0 h/w + */ + if (!is_ax(dd)) { + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + + /* Enable J_KEY check on receive context. */ + reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK | + ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) << + RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT); + write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, reg); + + return 0; +} + +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + u8 hw_ctxt; + u64 reg; + + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, 0); + /* + * Disable send-side J_KEY integrity check, unless this is A0 h/w. + * This check would not have been enabled for A0 h/w, see + * set_ctxt_jkey(). + */ + if (!is_ax(dd)) { + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + /* Turn off the J_KEY on the receive side */ + write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, 0); + + return 0; +} + +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 pkey) +{ + u8 hw_ctxt; + u64 reg; + + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + + return 0; +} + +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt) +{ + u8 hw_ctxt; + u64 reg; + + if (!ctxt || !ctxt->sc) + return -EINVAL; + + hw_ctxt = ctxt->sc->hw_context; + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0); + + return 0; +} + +/* + * Start doing the clean up the chip. Our clean up happens in multiple + * stages and this is just the first. + */ +void hfi1_start_cleanup(struct hfi1_devdata *dd) +{ + aspm_exit(dd); + free_cntrs(dd); + free_rcverr(dd); + finish_chip_resources(dd); +} + +#define HFI_BASE_GUID(dev) \ + ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT)) + +/* + * Information can be shared between the two HFIs on the same ASIC + * in the same OS. This function finds the peer device and sets + * up a shared structure. + */ +static int init_asic_data(struct hfi1_devdata *dd) +{ + unsigned long index; + struct hfi1_devdata *peer; + struct hfi1_asic_data *asic_data; + int ret = 0; + + /* pre-allocate the asic structure in case we are the first device */ + asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL); + if (!asic_data) + return -ENOMEM; + + xa_lock_irq(&hfi1_dev_table); + /* Find our peer device */ + xa_for_each(&hfi1_dev_table, index, peer) { + if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(peer)) && + dd->unit != peer->unit) + break; + } + + if (peer) { + /* use already allocated structure */ + dd->asic_data = peer->asic_data; + kfree(asic_data); + } else { + dd->asic_data = asic_data; + mutex_init(&dd->asic_data->asic_resource_mutex); + } + dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */ + xa_unlock_irq(&hfi1_dev_table); + + /* first one through - set up i2c devices */ + if (!peer) + ret = set_up_i2c(dd, dd->asic_data); + + return ret; +} + +/* + * Set dd->boardname. Use a generic name if a name is not returned from + * EFI variable space. + * + * Return 0 on success, -ENOMEM if space could not be allocated. + */ +static int obtain_boardname(struct hfi1_devdata *dd) +{ + /* generic board description */ + const char generic[] = + "Cornelis Omni-Path Host Fabric Interface Adapter 100 Series"; + unsigned long size; + int ret; + + ret = read_hfi1_efi_var(dd, "description", &size, + (void **)&dd->boardname); + if (ret) { + dd_dev_info(dd, "Board description not found\n"); + /* use generic description */ + dd->boardname = kstrdup(generic, GFP_KERNEL); + if (!dd->boardname) + return -ENOMEM; + } + return 0; +} + +/* + * Check the interrupt registers to make sure that they are mapped correctly. + * It is intended to help user identify any mismapping by VMM when the driver + * is running in a VM. This function should only be called before interrupt + * is set up properly. + * + * Return 0 on success, -EINVAL on failure. + */ +static int check_int_registers(struct hfi1_devdata *dd) +{ + u64 reg; + u64 all_bits = ~(u64)0; + u64 mask; + + /* Clear CceIntMask[0] to avoid raising any interrupts */ + mask = read_csr(dd, CCE_INT_MASK); + write_csr(dd, CCE_INT_MASK, 0ull); + reg = read_csr(dd, CCE_INT_MASK); + if (reg) + goto err_exit; + + /* Clear all interrupt status bits */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg) + goto err_exit; + + /* Set all interrupt status bits */ + write_csr(dd, CCE_INT_FORCE, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg != all_bits) + goto err_exit; + + /* Restore the interrupt mask */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + write_csr(dd, CCE_INT_MASK, mask); + + return 0; +err_exit: + write_csr(dd, CCE_INT_MASK, mask); + dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n"); + return -EINVAL; +} + +/** + * hfi1_init_dd() - Initialize most of the dd structure. + * @dd: the dd device + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +int hfi1_init_dd(struct hfi1_devdata *dd) +{ + struct pci_dev *pdev = dd->pcidev; + struct hfi1_pportdata *ppd; + u64 reg; + int i, ret; + static const char * const inames[] = { /* implementation names */ + "RTL silicon", + "RTL VCS simulation", + "RTL FPGA emulation", + "Functional simulator" + }; + struct pci_dev *parent = pdev->bus->self; + u32 sdma_engines = chip_sdma_engines(dd); + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) { + int vl; + /* init common fields */ + hfi1_init_pportdata(pdev, ppd, dd, 0, 1); + /* DC supports 4 link widths */ + ppd->link_width_supported = + OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X | + OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_supported = + ppd->link_width_supported; + /* start out enabling only 4X */ + ppd->link_width_enabled = OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_enabled = + ppd->link_width_downgrade_supported; + /* link width active is 0 when link is down */ + /* link width downgrade active is 0 when link is down */ + + if (num_vls < HFI1_MIN_VLS_SUPPORTED || + num_vls > HFI1_MAX_VLS_SUPPORTED) { + dd_dev_err(dd, "Invalid num_vls %u, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); + num_vls = HFI1_MAX_VLS_SUPPORTED; + } + ppd->vls_supported = num_vls; + ppd->vls_operational = ppd->vls_supported; + /* Set the default MTU. */ + for (vl = 0; vl < num_vls; vl++) + dd->vld[vl].mtu = hfi1_max_mtu; + dd->vld[15].mtu = MAX_MAD_PACKET; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->overrun_threshold = 0x4; + ppd->phy_error_threshold = 0xf; + ppd->port_crc_mode_enabled = link_crc_mask; + /* initialize supported LTP CRC mode */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* initialize enabled LTP CRC mode */ + ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4; + /* start in offline */ + ppd->host_link_state = HLS_DN_OFFLINE; + init_vl_arb_caches(ppd); + } + + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped. + */ + ret = hfi1_pcie_ddinit(dd, pdev); + if (ret < 0) + goto bail_free; + + /* Save PCI space registers to rewrite after device reset */ + ret = save_pci_variables(dd); + if (ret < 0) + goto bail_cleanup; + + dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT) + & CCE_REVISION_CHIP_REV_MAJOR_MASK; + dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) + & CCE_REVISION_CHIP_REV_MINOR_MASK; + + /* + * Check interrupt registers mapping if the driver has no access to + * the upstream component. In this case, it is likely that the driver + * is running in a VM. + */ + if (!parent) { + ret = check_int_registers(dd); + if (ret) + goto bail_cleanup; + } + + /* + * obtain the hardware ID - NOT related to unit, which is a + * software enumeration + */ + reg = read_csr(dd, CCE_REVISION2); + dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT) + & CCE_REVISION2_HFI_ID_MASK; + /* the variable size will remove unwanted bits */ + dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT; + dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT; + dd_dev_info(dd, "Implementation: %s, revision 0x%x\n", + dd->icode < ARRAY_SIZE(inames) ? + inames[dd->icode] : "unknown", (int)dd->irev); + + /* speeds the hardware can support */ + dd->pport->link_speed_supported = OPA_LINK_SPEED_25G; + /* speeds allowed to run at */ + dd->pport->link_speed_enabled = dd->pport->link_speed_supported; + /* give a reasonable active value, will be set on link up */ + dd->pport->link_speed_active = OPA_LINK_SPEED_25G; + + /* fix up link widths for emulation _p */ + ppd = dd->pport; + if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) { + ppd->link_width_supported = + ppd->link_width_enabled = + ppd->link_width_downgrade_supported = + ppd->link_width_downgrade_enabled = + OPA_LINK_WIDTH_1X; + } + /* insure num_vls isn't larger than number of sdma engines */ + if (HFI1_CAP_IS_KSET(SDMA) && num_vls > sdma_engines) { + dd_dev_err(dd, "num_vls %u too large, using %u VLs\n", + num_vls, sdma_engines); + num_vls = sdma_engines; + ppd->vls_supported = sdma_engines; + ppd->vls_operational = ppd->vls_supported; + } + + /* + * Convert the ns parameter to the 64 * cclocks used in the CSR. + * Limit the max if larger than the field holds. If timeout is + * non-zero, then the calculated field will be at least 1. + * + * Must be after icode is set up - the cclock rate depends + * on knowing the hardware being used. + */ + dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64; + if (dd->rcv_intr_timeout_csr > + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK) + dd->rcv_intr_timeout_csr = + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK; + else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout) + dd->rcv_intr_timeout_csr = 1; + + /* needs to be done before we look for the peer device */ + read_guid(dd); + + /* set up shared ASIC data with peer device */ + ret = init_asic_data(dd); + if (ret) + goto bail_cleanup; + + /* obtain chip sizes, reset chip CSRs */ + ret = init_chip(dd); + if (ret) + goto bail_cleanup; + + /* read in the PCIe link speed information */ + ret = pcie_speeds(dd); + if (ret) + goto bail_cleanup; + + /* call before get_platform_config(), after init_chip_resources() */ + ret = eprom_init(dd); + if (ret) + goto bail_free_rcverr; + + /* Needs to be called before hfi1_firmware_init */ + get_platform_config(dd); + + /* read in firmware */ + ret = hfi1_firmware_init(dd); + if (ret) + goto bail_cleanup; + + /* + * In general, the PCIe Gen3 transition must occur after the + * chip has been idled (so it won't initiate any PCIe transactions + * e.g. an interrupt) and before the driver changes any registers + * (the transition will reset the registers). + * + * In particular, place this call after: + * - init_chip() - the chip will not initiate any PCIe transactions + * - pcie_speeds() - reads the current link speed + * - hfi1_firmware_init() - the needed firmware is ready to be + * downloaded + */ + ret = do_pcie_gen3_transition(dd); + if (ret) + goto bail_cleanup; + + /* + * This should probably occur in hfi1_pcie_init(), but historically + * occurs after the do_pcie_gen3_transition() code. + */ + tune_pcie_caps(dd); + + /* start setting dd values and adjusting CSRs */ + init_early_variables(dd); + + parse_platform_config(dd); + + ret = obtain_boardname(dd); + if (ret) + goto bail_cleanup; + + snprintf(dd->boardversion, BOARD_VERS_MAX, + "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n", + HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN, + (u32)dd->majrev, + (u32)dd->minrev, + (dd->revision >> CCE_REVISION_SW_SHIFT) + & CCE_REVISION_SW_MASK); + + /* alloc VNIC/AIP rx data */ + ret = hfi1_alloc_rx(dd); + if (ret) + goto bail_cleanup; + + ret = set_up_context_variables(dd); + if (ret) + goto bail_cleanup; + + /* set initial RXE CSRs */ + ret = init_rxe(dd); + if (ret) + goto bail_cleanup; + + /* set initial TXE CSRs */ + init_txe(dd); + /* set initial non-RXE, non-TXE CSRs */ + init_other(dd); + /* set up KDETH QP prefix in both RX and TX CSRs */ + init_kdeth_qp(dd); + + ret = hfi1_dev_affinity_init(dd); + if (ret) + goto bail_cleanup; + + /* send contexts must be set up before receive contexts */ + ret = init_send_contexts(dd); + if (ret) + goto bail_cleanup; + + ret = hfi1_create_kctxts(dd); + if (ret) + goto bail_cleanup; + + /* + * Initialize aspm, to be done after gen3 transition and setting up + * contexts and before enabling interrupts + */ + aspm_init(dd); + + ret = init_pervl_scs(dd); + if (ret) + goto bail_cleanup; + + /* sdma init */ + for (i = 0; i < dd->num_pports; ++i) { + ret = sdma_init(dd, i); + if (ret) + goto bail_cleanup; + } + + /* use contexts created by hfi1_create_kctxts */ + ret = set_up_interrupts(dd); + if (ret) + goto bail_cleanup; + + ret = hfi1_comp_vectors_set_up(dd); + if (ret) + goto bail_clear_intr; + + /* set up LCB access - must be after set_up_interrupts() */ + init_lcb_access(dd); + + /* + * Serial number is created from the base guid: + * [27:24] = base guid [38:35] + * [23: 0] = base guid [23: 0] + */ + snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n", + (dd->base_guid & 0xFFFFFF) | + ((dd->base_guid >> 11) & 0xF000000)); + + dd->oui1 = dd->base_guid >> 56 & 0xFF; + dd->oui2 = dd->base_guid >> 48 & 0xFF; + dd->oui3 = dd->base_guid >> 40 & 0xFF; + + ret = load_firmware(dd); /* asymmetric with dispose_firmware() */ + if (ret) + goto bail_clear_intr; + + thermal_init(dd); + + ret = init_cntrs(dd); + if (ret) + goto bail_clear_intr; + + ret = init_rcverr(dd); + if (ret) + goto bail_free_cntrs; + + init_completion(&dd->user_comp); + + /* The user refcount starts with one to inidicate an active device */ + refcount_set(&dd->user_refcount, 1); + + goto bail; + +bail_free_rcverr: + free_rcverr(dd); +bail_free_cntrs: + free_cntrs(dd); +bail_clear_intr: + hfi1_comp_vectors_clean_up(dd); + msix_clean_up_interrupts(dd); +bail_cleanup: + hfi1_free_rx(dd); + hfi1_pcie_ddcleanup(dd); +bail_free: + hfi1_free_devdata(dd); +bail: + return ret; +} + +static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, + u32 dw_len) +{ + u32 delta_cycles; + u32 current_egress_rate = ppd->current_egress_rate; + /* rates here are in units of 10^6 bits/sec */ + + if (desired_egress_rate == -1) + return 0; /* shouldn't happen */ + + if (desired_egress_rate >= current_egress_rate) + return 0; /* we can't help go faster, only slower */ + + delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) - + egress_cycles(dw_len * 4, current_egress_rate); + + return (u16)delta_cycles; +} + +/** + * create_pbc - build a pbc for transmission + * @ppd: info of physical Hfi port + * @flags: special case flags or-ed in built pbc + * @srate_mbs: static rate + * @vl: vl + * @dw_len: dword length (header words + data words + pbc words) + * + * Create a PBC with the given flags, rate, VL, and length. + * + * NOTE: The PBC created will not insert any HCRC - all callers but one are + * for verbs, which does not use this PSM feature. The lone other caller + * is for the diagnostic interface which calls this if the user does not + * supply their own PBC. + */ +u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, + u32 dw_len) +{ + u64 pbc, delay = 0; + + if (unlikely(srate_mbs)) + delay = delay_cycles(ppd, srate_mbs, dw_len); + + pbc = flags + | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) + | (vl & PBC_VL_MASK) << PBC_VL_SHIFT + | (dw_len & PBC_LENGTH_DWS_MASK) + << PBC_LENGTH_DWS_SHIFT; + + return pbc; +} + +#define SBUS_THERMAL 0x4f +#define SBUS_THERM_MONITOR_MODE 0x1 + +#define THERM_FAILURE(dev, ret, reason) \ + dd_dev_err((dd), \ + "Thermal sensor initialization failed: %s (%d)\n", \ + (reason), (ret)) + +/* + * Initialize the thermal sensor. + * + * After initialization, enable polling of thermal sensor through + * SBus interface. In order for this to work, the SBus Master + * firmware has to be loaded due to the fact that the HW polling + * logic uses SBus interrupts, which are not supported with + * default firmware. Otherwise, no data will be returned through + * the ASIC_STS_THERM CSR. + */ +static int thermal_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + if (dd->icode != ICODE_RTL_SILICON || + check_chip_resource(dd, CR_THERM_INIT, NULL)) + return ret; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + THERM_FAILURE(dd, ret, "Acquire SBus"); + return ret; + } + + dd_dev_info(dd, "Initializing thermal sensor\n"); + /* Disable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + /* Thermal Sensor Initialization */ + /* Step 1: Reset the Thermal SBus Receiver */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + RESET_SBUS_RECEIVER, 0); + if (ret) { + THERM_FAILURE(dd, ret, "Bus Reset"); + goto done; + } + /* Step 2: Set Reset bit in Thermal block */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x1); + if (ret) { + THERM_FAILURE(dd, ret, "Therm Block Reset"); + goto done; + } + /* Step 3: Write clock divider value (100MHz -> 2MHz) */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1, + WRITE_SBUS_RECEIVER, 0x32); + if (ret) { + THERM_FAILURE(dd, ret, "Write Clock Div"); + goto done; + } + /* Step 4: Select temperature mode */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3, + WRITE_SBUS_RECEIVER, + SBUS_THERM_MONITOR_MODE); + if (ret) { + THERM_FAILURE(dd, ret, "Write Mode Sel"); + goto done; + } + /* Step 5: De-assert block reset and start conversion */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x2); + if (ret) { + THERM_FAILURE(dd, ret, "Write Reset Deassert"); + goto done; + } + /* Step 5.1: Wait for first conversion (21.5ms per spec) */ + msleep(22); + + /* Enable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + + /* Set initialized flag */ + ret = acquire_chip_resource(dd, CR_THERM_INIT, 0); + if (ret) + THERM_FAILURE(dd, ret, "Unable to set thermal init flag"); + +done: + release_chip_resource(dd, CR_SBUS); + return ret; +} + +static void handle_temp_err(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + /* + * Thermal Critical Interrupt + * Put the device into forced freeze mode, take link down to + * offline, and put DC into reset. + */ + dd_dev_emerg(dd, + "Critical temperature reached! Forcing device into freeze mode!\n"); + dd->flags |= HFI1_FORCED_FREEZE; + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT); + /* + * Shut DC down as much and as quickly as possible. + * + * Step 1: Take the link down to OFFLINE. This will cause the + * 8051 to put the Serdes in reset. However, we don't want to + * go through the entire link state machine since we want to + * shutdown ASAP. Furthermore, this is not a graceful shutdown + * but rather an attempt to save the chip. + * Code below is almost the same as quiet_serdes() but avoids + * all the extra work and the sleeps. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) | + PLS_OFFLINE); + /* + * Step 2: Shutdown LCB and 8051 + * After shutdown, do not restore DC_CFG_RESET value. + */ + dc_shutdown(dd); +} diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h new file mode 100644 index 0000000000..b2d53713da --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -0,0 +1,1430 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ + +#ifndef _CHIP_H +#define _CHIP_H +/* + * This file contains all of the defines that is specific to the HFI chip + */ + +/* sizes */ +#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64)) +#define NUM_INTERRUPT_SOURCES 768 +#define RXE_NUM_CONTEXTS 160 +#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ +#define RXE_NUM_TID_FLOWS 32 +#define RXE_NUM_DATA_VL 8 +#define TXE_NUM_CONTEXTS 160 +#define TXE_NUM_SDMA_ENGINES 16 +#define NUM_CONTEXTS_PER_SET 8 +#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16 +#define VL_ARB_LOW_PRIO_TABLE_SIZE 16 +#define VL_ARB_TABLE_SIZE 16 +#define TXE_NUM_32_BIT_COUNTER 7 +#define TXE_NUM_64_BIT_COUNTER 30 +#define TXE_NUM_DATA_VL 8 +#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */ +#define PIO_BLOCK_SIZE 64 /* bytes */ +#define SDMA_BLOCK_SIZE 64 /* bytes */ +#define RCV_BUF_BLOCK_SIZE 64 /* bytes */ +#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */ +#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */ +#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */ +/* + * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed + * at 64 bytes for all generation one devices + */ +#define CM_VAU 3 +/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */ +#define CM_GLOBAL_CREDITS 0x880 +/* Number of PKey entries in the HW */ +#define MAX_PKEY_VALUES 16 + +#include "chip_registers.h" + +#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET) +#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET) + +/* PBC flags */ +#define PBC_INTR BIT_ULL(31) +#define PBC_DC_INFO_SHIFT (30) +#define PBC_DC_INFO BIT_ULL(PBC_DC_INFO_SHIFT) +#define PBC_TEST_EBP BIT_ULL(29) +#define PBC_PACKET_BYPASS BIT_ULL(28) +#define PBC_CREDIT_RETURN BIT_ULL(25) +#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24) +#define PBC_TEST_BAD_ICRC BIT_ULL(23) +#define PBC_FECN BIT_ULL(22) + +/* PbcInsertHcrc field settings */ +#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */ +#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */ +#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */ + +/* PBC fields */ +#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32 +#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull +#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \ + (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \ + PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + +#define PBC_INSERT_HCRC_SHIFT 26 +#define PBC_INSERT_HCRC_MASK 0x3ull +#define PBC_INSERT_HCRC_SMASK \ + (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT) + +#define PBC_VL_SHIFT 12 +#define PBC_VL_MASK 0xfull +#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT) + +#define PBC_LENGTH_DWS_SHIFT 0 +#define PBC_LENGTH_DWS_MASK 0xfffull +#define PBC_LENGTH_DWS_SMASK \ + (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT) + +/* Credit Return Fields */ +#define CR_COUNTER_SHIFT 0 +#define CR_COUNTER_MASK 0x7ffull +#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT) + +#define CR_STATUS_SHIFT 11 +#define CR_STATUS_MASK 0x1ull +#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12 +#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \ + CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13 +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \ + CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14 +#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \ + CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15 +#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ + CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) + +/* Specific IRQ sources */ +#define CCE_ERR_INT 0 +#define RXE_ERR_INT 1 +#define MISC_ERR_INT 2 +#define PIO_ERR_INT 4 +#define SDMA_ERR_INT 5 +#define EGRESS_ERR_INT 6 +#define TXE_ERR_INT 7 +#define PBC_INT 240 +#define GPIO_ASSERT_INT 241 +#define QSFP1_INT 242 +#define QSFP2_INT 243 +#define TCRIT_INT 244 + +/* interrupt source ranges */ +#define IS_FIRST_SOURCE CCE_ERR_INT +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 +#define IS_SDMA_PROGRESS_START 208 +#define IS_SDMA_IDLE_START 224 +#define IS_VARIOUS_START 240 +#define IS_DC_START 248 +#define IS_RCVAVAIL_START 256 +#define IS_RCVURGENT_START 416 +#define IS_SENDCREDIT_START 576 +#define IS_RESERVED_START 736 +#define IS_LAST_SOURCE 767 + +/* derived interrupt source values */ +#define IS_GENERAL_ERR_END 7 +#define IS_SDMAENG_ERR_END 31 +#define IS_SENDCTXT_ERR_END 191 +#define IS_SDMA_END 207 +#define IS_SDMA_PROGRESS_END 223 +#define IS_SDMA_IDLE_END 239 +#define IS_VARIOUS_END 244 +#define IS_DC_END 255 +#define IS_RCVAVAIL_END 415 +#define IS_RCVURGENT_END 575 +#define IS_SENDCREDIT_END 735 +#define IS_RESERVED_END IS_LAST_SOURCE + +/* DCC_CFG_PORT_CONFIG logical link states */ +#define LSTATE_DOWN 0x1 +#define LSTATE_INIT 0x2 +#define LSTATE_ARMED 0x3 +#define LSTATE_ACTIVE 0x4 + +/* DCC_CFG_RESET reset states */ +#define LCB_RX_FPE_TX_FPE_INTO_RESET (DCC_CFG_RESET_RESET_LCB | \ + DCC_CFG_RESET_RESET_TX_FPE | \ + DCC_CFG_RESET_RESET_RX_FPE | \ + DCC_CFG_RESET_ENABLE_CCLK_BCC) + /* 0x17 */ + +#define LCB_RX_FPE_TX_FPE_OUT_OF_RESET DCC_CFG_RESET_ENABLE_CCLK_BCC /* 0x10 */ + +/* DC8051_STS_CUR_STATE port values (physical link states) */ +#define PLS_DISABLED 0x30 +#define PLS_OFFLINE 0x90 +#define PLS_OFFLINE_QUIET 0x90 +#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91 +#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 +#define PLS_OFFLINE_REPORT_FAILURE 0x93 +#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_OFFLINE_QUIET_DURATION 0x95 +#define PLS_POLLING 0x20 +#define PLS_POLLING_QUIET 0x20 +#define PLS_POLLING_ACTIVE 0x21 +#define PLS_CONFIGPHY 0x40 +#define PLS_CONFIGPHY_DEBOUCE 0x40 +#define PLS_CONFIGPHY_ESTCOMM 0x41 +#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42 +#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE 0x43 +#define PLS_CONFIGPHY_OPTEQ 0x44 +#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44 +#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45 +#define PLS_CONFIGPHY_VERIFYCAP 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47 +#define PLS_CONFIGLT 0x48 +#define PLS_CONFIGLT_CONFIGURE 0x48 +#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49 +#define PLS_LINKUP 0x50 +#define PLS_PHYTEST 0xB0 +#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1 +#define PLS_QUICK_LINKUP 0xe2 + +/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */ +#define HCMD_LOAD_CONFIG_DATA 0x01 +#define HCMD_READ_CONFIG_DATA 0x02 +#define HCMD_CHANGE_PHY_STATE 0x03 +#define HCMD_SEND_LCB_IDLE_MSG 0x04 +#define HCMD_MISC 0x05 +#define HCMD_READ_LCB_IDLE_MSG 0x06 +#define HCMD_READ_LCB_CSR 0x07 +#define HCMD_WRITE_LCB_CSR 0x08 +#define HCMD_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */ +#define HCMD_SUCCESS 2 + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */ +#define SPICO_ROM_FAILED BIT(0) +#define UNKNOWN_FRAME BIT(1) +#define TARGET_BER_NOT_MET BIT(2) +#define FAILED_SERDES_INTERNAL_LOOPBACK BIT(3) +#define FAILED_SERDES_INIT BIT(4) +#define FAILED_LNI_POLLING BIT(5) +#define FAILED_LNI_DEBOUNCE BIT(6) +#define FAILED_LNI_ESTBCOMM BIT(7) +#define FAILED_LNI_OPTEQ BIT(8) +#define FAILED_LNI_VERIFY_CAP1 BIT(9) +#define FAILED_LNI_VERIFY_CAP2 BIT(10) +#define FAILED_LNI_CONFIGLT BIT(11) +#define HOST_HANDSHAKE_TIMEOUT BIT(12) +#define EXTERNAL_DEVICE_REQ_TIMEOUT BIT(13) + +#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \ + | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \ + | FAILED_LNI_VERIFY_CAP1 \ + | FAILED_LNI_VERIFY_CAP2 \ + | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT \ + | EXTERNAL_DEVICE_REQ_TIMEOUT) + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */ +#define HOST_REQ_DONE BIT(0) +#define BC_PWR_MGM_MSG BIT(1) +#define BC_SMA_MSG BIT(2) +#define BC_BCC_UNKNOWN_MSG BIT(3) +#define BC_IDLE_UNKNOWN_MSG BIT(4) +#define EXT_DEVICE_CFG_REQ BIT(5) +#define VERIFY_CAP_FRAME BIT(6) +#define LINKUP_ACHIEVED BIT(7) +#define LINK_GOING_DOWN BIT(8) +#define LINK_WIDTH_DOWNGRADED BIT(9) + +/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */ +#define HREQ_LOAD_CONFIG 0x01 +#define HREQ_SAVE_CONFIG 0x02 +#define HREQ_READ_CONFIG 0x03 +#define HREQ_SET_TX_EQ_ABS 0x04 +#define HREQ_SET_TX_EQ_REL 0x05 +#define HREQ_ENABLE 0x06 +#define HREQ_LCB_RESET 0x07 +#define HREQ_CONFIG_DONE 0xfe +#define HREQ_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */ +#define HREQ_INVALID 0x01 +#define HREQ_SUCCESS 0x02 +#define HREQ_NOT_SUPPORTED 0x03 +#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */ +#define HREQ_REQUEST_REJECTED 0xfe +#define HREQ_EXECUTION_ONGOING 0xff + +/* MISC host command functions */ +#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1 +#define HCMD_MISC_GRANT_LCB_ACCESS 0x2 + +/* idle flit message types */ +#define IDLE_PHYSICAL_LINK_MGMT 0x1 +#define IDLE_CRU 0x2 +#define IDLE_SMA 0x3 +#define IDLE_POWER_MGMT 0x4 + +/* idle flit message send fields (both send and read) */ +#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */ +#define IDLE_PAYLOAD_SHIFT 8 +#define IDLE_MSG_TYPE_MASK 0xf +#define IDLE_MSG_TYPE_SHIFT 0 + +/* idle flit message read fields */ +#define READ_IDLE_MSG_TYPE_MASK 0xf +#define READ_IDLE_MSG_TYPE_SHIFT 0 + +/* SMA idle flit payload commands */ +#define SMA_IDLE_ARM 1 +#define SMA_IDLE_ACTIVE 2 + +/* DC_DC8051_CFG_MODE.GENERAL bits */ +#define DISABLE_SELF_GUID_CHECK 0x2 + +/* Bad L2 frame error code */ +#define BAD_L2_ERR 0x6 + +/* + * Eager buffer minimum and maximum sizes supported by the hardware. + * All power-of-two sizes in between are supported as well. + * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory + * allocatable for Eager buffer to a single context. All others + * are limits for the RcvArray entries. + */ +#define MIN_EAGER_BUFFER (4 * 1024) +#define MAX_EAGER_BUFFER (256 * 1024) +#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */ +#define MAX_EXPECTED_BUFFER (2048 * 1024) +#define HFI1_MIN_HDRQ_EGRBUF_CNT 32 +#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 + +/* + * Receive expected base and count and eager base and count increment - + * the CSR fields hold multiples of this value. + */ +#define RCV_SHIFT 3 +#define RCV_INCREMENT BIT(RCV_SHIFT) + +/* + * Receive header queue entry increment - the CSR holds multiples of + * this value. + */ +#define HDRQ_SIZE_SHIFT 5 +#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT) + +/* + * Freeze handling flags + */ +#define FREEZE_ABORT 0x01 /* do not do recovery */ +#define FREEZE_SELF 0x02 /* initiate the freeze */ +#define FREEZE_LINK_DOWN 0x04 /* link is down */ + +/* + * Chip implementation codes. + */ +#define ICODE_RTL_SILICON 0x00 +#define ICODE_RTL_VCS_SIMULATION 0x01 +#define ICODE_FPGA_EMULATION 0x02 +#define ICODE_FUNCTIONAL_SIMULATOR 0x03 + +/* + * 8051 data memory size. + */ +#define DC8051_DATA_MEM_SIZE 0x1000 + +/* + * 8051 firmware registers + */ +#define NUM_GENERAL_FIELDS 0x17 +#define NUM_LANE_FIELDS 0x8 + +/* 8051 general register Field IDs */ +#define LINK_OPTIMIZATION_SETTINGS 0x00 +#define LINK_TUNING_PARAMETERS 0x02 +#define DC_HOST_COMM_SETTINGS 0x03 +#define TX_SETTINGS 0x06 +#define VERIFY_CAP_LOCAL_PHY 0x07 +#define VERIFY_CAP_LOCAL_FABRIC 0x08 +#define VERIFY_CAP_LOCAL_LINK_MODE 0x09 +#define LOCAL_DEVICE_ID 0x0a +#define RESERVED_REGISTERS 0x0b +#define LOCAL_LNI_INFO 0x0c +#define REMOTE_LNI_INFO 0x0d +#define MISC_STATUS 0x0e +#define VERIFY_CAP_REMOTE_PHY 0x0f +#define VERIFY_CAP_REMOTE_FABRIC 0x10 +#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11 +#define LAST_LOCAL_STATE_COMPLETE 0x12 +#define LAST_REMOTE_STATE_COMPLETE 0x13 +#define LINK_QUALITY_INFO 0x14 +#define REMOTE_DEVICE_ID 0x15 +#define LINK_DOWN_REASON 0x16 /* first byte of offset 0x16 */ +#define VERSION_PATCH 0x16 /* last byte of offset 0x16 */ + +/* 8051 lane specific register field IDs */ +#define TX_EQ_SETTINGS 0x00 +#define CHANNEL_LOSS_SETTINGS 0x05 + +/* Lane ID for general configuration registers */ +#define GENERAL_CONFIG 4 + +/* LINK_TUNING_PARAMETERS fields */ +#define TUNING_METHOD_SHIFT 24 + +/* LINK_OPTIMIZATION_SETTINGS fields */ +#define ENABLE_EXT_DEV_CONFIG_SHIFT 24 + +/* LOAD_DATA 8051 command shifts and fields */ +#define LOAD_DATA_FIELD_ID_SHIFT 40 +#define LOAD_DATA_FIELD_ID_MASK 0xfull +#define LOAD_DATA_LANE_ID_SHIFT 32 +#define LOAD_DATA_LANE_ID_MASK 0xfull +#define LOAD_DATA_DATA_SHIFT 0x0 +#define LOAD_DATA_DATA_MASK 0xffffffffull + +/* READ_DATA 8051 command shifts and fields */ +#define READ_DATA_FIELD_ID_SHIFT 40 +#define READ_DATA_FIELD_ID_MASK 0xffull +#define READ_DATA_LANE_ID_SHIFT 32 +#define READ_DATA_LANE_ID_MASK 0xffull +#define READ_DATA_DATA_SHIFT 0x0 +#define READ_DATA_DATA_MASK 0xffffffffull + +/* TX settings fields */ +#define ENABLE_LANE_TX_SHIFT 0 +#define ENABLE_LANE_TX_MASK 0xff +#define TX_POLARITY_INVERSION_SHIFT 8 +#define TX_POLARITY_INVERSION_MASK 0xff +#define RX_POLARITY_INVERSION_SHIFT 16 +#define RX_POLARITY_INVERSION_MASK 0xff +#define MAX_RATE_SHIFT 24 +#define MAX_RATE_MASK 0xff + +/* verify capability PHY fields */ +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4 +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1 +#define POWER_MANAGEMENT_SHIFT 0x0 +#define POWER_MANAGEMENT_MASK 0xf + +/* 8051 lane register Field IDs */ +#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */ + +/* SPICO firmware version fields */ +#define SPICO_ROM_VERSION_SHIFT 0 +#define SPICO_ROM_VERSION_MASK 0xffff +#define SPICO_ROM_PROD_ID_SHIFT 16 +#define SPICO_ROM_PROD_ID_MASK 0xffff + +/* verify capability fabric fields */ +#define VAU_SHIFT 0 +#define VAU_MASK 0x0007 +#define Z_SHIFT 3 +#define Z_MASK 0x0001 +#define VCU_SHIFT 4 +#define VCU_MASK 0x0007 +#define VL15BUF_SHIFT 8 +#define VL15BUF_MASK 0x0fff +#define CRC_SIZES_SHIFT 20 +#define CRC_SIZES_MASK 0x7 + +/* verify capability local link width fields */ +#define LINK_WIDTH_SHIFT 0 /* also for remote link width */ +#define LINK_WIDTH_MASK 0xffff /* also for remote link width */ +#define LOCAL_FLAG_BITS_SHIFT 16 +#define LOCAL_FLAG_BITS_MASK 0xff +#define MISC_CONFIG_BITS_SHIFT 24 +#define MISC_CONFIG_BITS_MASK 0xff + +/* verify capability remote link width fields */ +#define REMOTE_TX_RATE_SHIFT 16 +#define REMOTE_TX_RATE_MASK 0xff + +/* LOCAL_DEVICE_ID fields */ +#define LOCAL_DEVICE_REV_SHIFT 0 +#define LOCAL_DEVICE_REV_MASK 0xff +#define LOCAL_DEVICE_ID_SHIFT 8 +#define LOCAL_DEVICE_ID_MASK 0xffff + +/* REMOTE_DEVICE_ID fields */ +#define REMOTE_DEVICE_REV_SHIFT 0 +#define REMOTE_DEVICE_REV_MASK 0xff +#define REMOTE_DEVICE_ID_SHIFT 8 +#define REMOTE_DEVICE_ID_MASK 0xffff + +/* local LNI link width fields */ +#define ENABLE_LANE_RX_SHIFT 16 +#define ENABLE_LANE_RX_MASK 0xff + +/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */ +#define MGMT_ALLOWED_SHIFT 23 +#define MGMT_ALLOWED_MASK 0x1 + +/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */ +#define LINK_QUALITY_SHIFT 24 +#define LINK_QUALITY_MASK 0x7 + +/* + * mask, shift for reading 'planned_down_remote_reason_code' + * from LINK_QUALITY_INFO field + */ +#define DOWN_REMOTE_REASON_SHIFT 16 +#define DOWN_REMOTE_REASON_MASK 0xff + +#define HOST_INTERFACE_VERSION 1 +#define HOST_INTERFACE_VERSION_SHIFT 16 +#define HOST_INTERFACE_VERSION_MASK 0xff + +/* verify capability PHY power management bits */ +#define PWRM_BER_CONTROL 0x1 +#define PWRM_BANDWIDTH_CONTROL 0x2 + +/* 8051 link down reasons */ +#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa +#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb +#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc + +/* verify capability fabric CRC size bits */ +enum { + CAP_CRC_14B = (1 << 0), /* 14b CRC */ + CAP_CRC_48B = (1 << 1), /* 48b CRC */ + CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */ +}; + +#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B) + +/* misc status version fields */ +#define STS_FM_VERSION_MINOR_SHIFT 16 +#define STS_FM_VERSION_MINOR_MASK 0xff +#define STS_FM_VERSION_MAJOR_SHIFT 24 +#define STS_FM_VERSION_MAJOR_MASK 0xff +#define STS_FM_VERSION_PATCH_SHIFT 24 +#define STS_FM_VERSION_PATCH_MASK 0xff + +/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */ +#define LCB_CRC_16B 0x0 /* 16b CRC */ +#define LCB_CRC_14B 0x1 /* 14b CRC */ +#define LCB_CRC_48B 0x2 /* 48b CRC */ +#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */ + +/* + * the following enum is (almost) a copy/paste of the definition + * in the OPA spec, section 20.2.2.6.8 (PortInfo) + */ +enum { + PORT_LTP_CRC_MODE_NONE = 0, + PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */ + PORT_LTP_CRC_MODE_48 = 4, + /* 48-bit overlapping LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_PER_LANE = 8 + /* 12 to 16 bit per lane LTP CRC mode (optional) */ +}; + +/* timeouts */ +#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */ +#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */ +#define DC8051_COMMAND_TIMEOUT 1000 /* DC8051 command timeout, in ms */ +#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */ +#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */ +#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */ + +/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */ +#define ASIC_CCLOCK_PS 1242 /* 805 MHz */ +#define FPGA_CCLOCK_PS 30300 /* 33 MHz */ + +/* + * Mask of enabled MISC errors. Do not enable the two RSA engine errors - + * see firmware.c:run_rsa() for details. + */ +#define DRIVER_MISC_MASK \ + (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \ + | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)) + +/* valid values for the loopback module parameter */ +#define LOOPBACK_NONE 0 /* no loopback - default */ +#define LOOPBACK_SERDES 1 +#define LOOPBACK_LCB 2 +#define LOOPBACK_CABLE 3 /* external cable */ + +/* set up bits in MISC_CONFIG_BITS */ +#define LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT 0 +#define EXT_CFG_LCB_RESET_SUPPORTED_SHIFT 3 + +/* read and write hardware registers */ +u64 read_csr(const struct hfi1_devdata *dd, u32 offset); +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value); + +/* + * The *_kctxt_* flavor of the CSR read/write functions are for + * per-context or per-SDMA CSRs that are not mappable to user-space. + * Their spacing is not a PAGE_SIZE multiple. + */ +static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* kernel per-context CSRs are separated by 0x100 */ + return read_csr(dd, offset0 + (0x100 * ctxt)); +} + +static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* kernel per-context CSRs are separated by 0x100 */ + write_csr(dd, offset0 + (0x100 * ctxt), value); +} + +int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data); +int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data); + +void __iomem *get_csr_addr( + const struct hfi1_devdata *dd, + u32 offset); + +static inline void __iomem *get_kctxt_csr_addr( + const struct hfi1_devdata *dd, + int ctxt, + u32 offset0) +{ + return get_csr_addr(dd, offset0 + (0x100 * ctxt)); +} + +/* + * The *_uctxt_* flavor of the CSR read/write functions are for + * per-context CSRs that are mappable to user space. All these CSRs + * are spaced by a PAGE_SIZE multiple in order to be mappable to + * different processes without exposing other contexts' CSRs + */ +static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* user per-context CSRs are separated by 0x1000 */ + return read_csr(dd, offset0 + (0x1000 * ctxt)); +} + +static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* user per-context CSRs are separated by 0x1000 */ + write_csr(dd, offset0 + (0x1000 * ctxt), value); +} + +static inline u32 chip_rcv_contexts(struct hfi1_devdata *dd) +{ + return read_csr(dd, RCV_CONTEXTS); +} + +static inline u32 chip_send_contexts(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_CONTEXTS); +} + +static inline u32 chip_sdma_engines(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_DMA_ENGINES); +} + +static inline u32 chip_pio_mem_size(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_PIO_MEM_SIZE); +} + +static inline u32 chip_sdma_mem_size(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_DMA_MEM_SIZE); +} + +static inline u32 chip_rcv_array_count(struct hfi1_devdata *dd) +{ + return read_csr(dd, RCV_ARRAY_CNT); +} + +u8 encode_rcv_header_entry_size(u8 size); +int hfi1_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt); +void set_hdrq_regs(struct hfi1_devdata *dd, u8 ctxt, u8 entsize, u16 hdrcnt); + +u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, + u32 dw_len); + +/* firmware.c */ +#define SBUS_MASTER_BROADCAST 0xfd +#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */ +extern const u8 pcie_serdes_broadcast[]; +extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES]; + +/* SBus commands */ +#define RESET_SBUS_RECEIVER 0x20 +#define WRITE_SBUS_RECEIVER 0x21 +#define READ_SBUS_RECEIVER 0x22 +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +void set_sbus_fast_mode(struct hfi1_devdata *dd); +void clear_sbus_fast_mode(struct hfi1_devdata *dd); +int hfi1_firmware_init(struct hfi1_devdata *dd); +int load_pcie_firmware(struct hfi1_devdata *dd); +int load_firmware(struct hfi1_devdata *dd); +void dispose_firmware(void); +int acquire_hw_mutex(struct hfi1_devdata *dd); +void release_hw_mutex(struct hfi1_devdata *dd); + +/* + * Bitmask of dynamic access for ASIC block chip resources. Each HFI has its + * own range of bits for the resource so it can clear its own bits on + * starting and exiting. If either HFI has the resource bit set, the + * resource is in use. The separate bit ranges are: + * HFI0 bits 7:0 + * HFI1 bits 15:8 + */ +#define CR_SBUS 0x01 /* SBUS, THERM, and PCIE registers */ +#define CR_EPROM 0x02 /* EEP, GPIO registers */ +#define CR_I2C1 0x04 /* QSFP1_OE register */ +#define CR_I2C2 0x08 /* QSFP2_OE register */ +#define CR_DYN_SHIFT 8 /* dynamic flag shift */ +#define CR_DYN_MASK ((1ull << CR_DYN_SHIFT) - 1) + +/* + * Bitmask of static ASIC states these are outside of the dynamic ASIC + * block chip resources above. These are to be set once and never cleared. + * Must be holding the SBus dynamic flag when setting. + */ +#define CR_THERM_INIT 0x010000 + +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait); +void release_chip_resource(struct hfi1_devdata *dd, u32 resource); +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func); +void init_chip_resources(struct hfi1_devdata *dd); +void finish_chip_resources(struct hfi1_devdata *dd); + +/* ms wait time for access to an SBus resoure */ +#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */ + +/* ms wait time for a qsfp (i2c) chain to become available */ +#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */ + +void fabric_serdes_reset(struct hfi1_devdata *dd); +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result); + +/* chip.c */ +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, + u8 *ver_patch); +int write_host_interface_version(struct hfi1_devdata *dd, u8 version); +void read_guid(struct hfi1_devdata *dd); +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason); +int set_link_state(struct hfi1_pportdata *, u32 state); +int port_ltp_to_cap(int port_ltp); +void handle_verify_cap(struct work_struct *work); +void handle_freeze(struct work_struct *work); +void handle_link_up(struct work_struct *work); +void handle_link_down(struct work_struct *work); +void handle_link_downgrade(struct work_struct *work); +void handle_link_bounce(struct work_struct *work); +void handle_start_link(struct work_struct *work); +void handle_sma_message(struct work_struct *work); +int reset_qsfp(struct hfi1_pportdata *ppd); +void qsfp_event(struct work_struct *work); +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); +int send_idle_sma(struct hfi1_devdata *dd, u64 message); +int load_8051_config(struct hfi1_devdata *, u8, u8, u32); +int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *); +int start_link(struct hfi1_pportdata *ppd); +int bringup_serdes(struct hfi1_pportdata *ppd); +void set_intr_state(struct hfi1_devdata *dd, u32 enable); +bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + bool refresh_widths); +void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, + u32 intr_adjust, u32 npkts); +int stop_drain_data_vls(struct hfi1_devdata *dd); +int open_fill_data_vls(struct hfi1_devdata *dd); +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns); +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock); +void get_linkup_link_widths(struct hfi1_pportdata *ppd); +void read_ltp_rtt(struct hfi1_devdata *dd); +void clear_linkup_counters(struct hfi1_devdata *dd); +u32 hdrqempty(struct hfi1_ctxtdata *rcd); +int is_ax(struct hfi1_devdata *dd); +int is_bx(struct hfi1_devdata *dd); +bool is_urg_masked(struct hfi1_ctxtdata *rcd); +u32 read_physical_state(struct hfi1_devdata *dd); +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); +const char *opa_lstate_name(u32 lstate); +const char *opa_pstate_name(u32 pstate); +u32 driver_pstate(struct hfi1_pportdata *ppd); +u32 driver_lstate(struct hfi1_pportdata *ppd); + +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +#define LCB_START DC_LCB_CSRS +#define LCB_END DC_8051_CSRS /* next block is 8051 */ +extern uint num_vls; + +extern uint disable_integrity; +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl); +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data); +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl); +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data); +u32 read_logical_state(struct hfi1_devdata *dd); +void force_recv_intr(struct hfi1_ctxtdata *rcd); + +/* Per VL indexes */ +enum { + C_VL_0 = 0, + C_VL_1, + C_VL_2, + C_VL_3, + C_VL_4, + C_VL_5, + C_VL_6, + C_VL_7, + C_VL_15, + C_VL_COUNT +}; + +static inline int vl_from_idx(int idx) +{ + return (idx == C_VL_15 ? 15 : idx); +} + +static inline int idx_from_vl(int vl) +{ + return (vl == 15 ? C_VL_15 : vl); +} + +/* Per device counter indexes */ +enum { + C_RCV_OVF = 0, + C_RX_LEN_ERR, + C_RX_SHORT_ERR, + C_RX_ICRC_ERR, + C_RX_EBP, + C_RX_TID_FULL, + C_RX_TID_INVALID, + C_RX_TID_FLGMS, + C_RX_CTX_EGRS, + C_RCV_TID_FLSMS, + C_CCE_PCI_CR_ST, + C_CCE_PCI_TR_ST, + C_CCE_PIO_WR_ST, + C_CCE_ERR_INT, + C_CCE_SDMA_INT, + C_CCE_MISC_INT, + C_CCE_RCV_AV_INT, + C_CCE_RCV_URG_INT, + C_CCE_SEND_CR_INT, + C_DC_UNC_ERR, + C_DC_RCV_ERR, + C_DC_FM_CFG_ERR, + C_DC_RMT_PHY_ERR, + C_DC_DROPPED_PKT, + C_DC_MC_XMIT_PKTS, + C_DC_MC_RCV_PKTS, + C_DC_XMIT_CERR, + C_DC_RCV_CERR, + C_DC_RCV_FCC, + C_DC_XMIT_FCC, + C_DC_XMIT_FLITS, + C_DC_RCV_FLITS, + C_DC_XMIT_PKTS, + C_DC_RCV_PKTS, + C_DC_RX_FLIT_VL, + C_DC_RX_PKT_VL, + C_DC_RCV_FCN, + C_DC_RCV_FCN_VL, + C_DC_RCV_BCN, + C_DC_RCV_BCN_VL, + C_DC_RCV_BBL, + C_DC_RCV_BBL_VL, + C_DC_MARK_FECN, + C_DC_MARK_FECN_VL, + C_DC_TOTAL_CRC, + C_DC_CRC_LN0, + C_DC_CRC_LN1, + C_DC_CRC_LN2, + C_DC_CRC_LN3, + C_DC_CRC_MULT_LN, + C_DC_TX_REPLAY, + C_DC_RX_REPLAY, + C_DC_SEQ_CRC_CNT, + C_DC_ESC0_ONLY_CNT, + C_DC_ESC0_PLUS1_CNT, + C_DC_ESC0_PLUS2_CNT, + C_DC_REINIT_FROM_PEER_CNT, + C_DC_SBE_CNT, + C_DC_MISC_FLG_CNT, + C_DC_PRF_GOOD_LTP_CNT, + C_DC_PRF_ACCEPTED_LTP_CNT, + C_DC_PRF_RX_FLIT_CNT, + C_DC_PRF_TX_FLIT_CNT, + C_DC_PRF_CLK_CNTR, + C_DC_PG_DBG_FLIT_CRDTS_CNT, + C_DC_PG_STS_PAUSE_COMPLETE_CNT, + C_DC_PG_STS_TX_SBE_CNT, + C_DC_PG_STS_TX_MBE_CNT, + C_SW_CPU_INTR, + C_SW_CPU_RCV_LIM, + C_SW_CTX0_SEQ_DROP, + C_SW_VTX_WAIT, + C_SW_PIO_WAIT, + C_SW_PIO_DRAIN, + C_SW_KMEM_WAIT, + C_SW_TID_WAIT, + C_SW_SEND_SCHED, + C_SDMA_DESC_FETCHED_CNT, + C_SDMA_INT_CNT, + C_SDMA_ERR_CNT, + C_SDMA_IDLE_INT_CNT, + C_SDMA_PROGRESS_INT_CNT, +/* MISC_ERR_STATUS */ + C_MISC_PLL_LOCK_FAIL_ERR, + C_MISC_MBIST_FAIL_ERR, + C_MISC_INVALID_EEP_CMD_ERR, + C_MISC_EFUSE_DONE_PARITY_ERR, + C_MISC_EFUSE_WRITE_ERR, + C_MISC_EFUSE_READ_BAD_ADDR_ERR, + C_MISC_EFUSE_CSR_PARITY_ERR, + C_MISC_FW_AUTH_FAILED_ERR, + C_MISC_KEY_MISMATCH_ERR, + C_MISC_SBUS_WRITE_FAILED_ERR, + C_MISC_CSR_WRITE_BAD_ADDR_ERR, + C_MISC_CSR_READ_BAD_ADDR_ERR, + C_MISC_CSR_PARITY_ERR, +/* CceErrStatus */ + /* + * A special counter that is the aggregate count + * of all the cce_err_status errors. The remainder + * are actual bits in the CceErrStatus register. + */ + C_CCE_ERR_STATUS_AGGREGATED_CNT, + C_CCE_MSIX_CSR_PARITY_ERR, + C_CCE_INT_MAP_UNC_ERR, + C_CCE_INT_MAP_COR_ERR, + C_CCE_MSIX_TABLE_UNC_ERR, + C_CCE_MSIX_TABLE_COR_ERR, + C_CCE_RXDMA_CONV_FIFO_PARITY_ERR, + C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR, + C_CCE_SEG_WRITE_BAD_ADDR_ERR, + C_CCE_SEG_READ_BAD_ADDR_ERR, + C_LA_TRIGGERED, + C_CCE_TRGT_CPL_TIMEOUT_ERR, + C_PCIC_RECEIVE_PARITY_ERR, + C_PCIC_TRANSMIT_BACK_PARITY_ERR, + C_PCIC_TRANSMIT_FRONT_PARITY_ERR, + C_PCIC_CPL_DAT_Q_UNC_ERR, + C_PCIC_CPL_HD_Q_UNC_ERR, + C_PCIC_POST_DAT_Q_UNC_ERR, + C_PCIC_POST_HD_Q_UNC_ERR, + C_PCIC_RETRY_SOT_MEM_UNC_ERR, + C_PCIC_RETRY_MEM_UNC_ERR, + C_PCIC_N_POST_DAT_Q_PARITY_ERR, + C_PCIC_N_POST_H_Q_PARITY_ERR, + C_PCIC_CPL_DAT_Q_COR_ERR, + C_PCIC_CPL_HD_Q_COR_ERR, + C_PCIC_POST_DAT_Q_COR_ERR, + C_PCIC_POST_HD_Q_COR_ERR, + C_PCIC_RETRY_SOT_MEM_COR_ERR, + C_PCIC_RETRY_MEM_COR_ERR, + C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR, + C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR, + C_CCE_CSR_CFG_BUS_PARITY_ERR, + C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR, + C_CCE_RSPD_DATA_PARITY_ERR, + C_CCE_TRGT_ACCESS_ERR, + C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR, + C_CCE_CSR_WRITE_BAD_ADDR_ERR, + C_CCE_CSR_READ_BAD_ADDR_ERR, + C_CCE_CSR_PARITY_ERR, +/* RcvErrStatus */ + C_RX_CSR_PARITY_ERR, + C_RX_CSR_WRITE_BAD_ADDR_ERR, + C_RX_CSR_READ_BAD_ADDR_ERR, + C_RX_DMA_CSR_UNC_ERR, + C_RX_DMA_DQ_FSM_ENCODING_ERR, + C_RX_DMA_EQ_FSM_ENCODING_ERR, + C_RX_DMA_CSR_PARITY_ERR, + C_RX_RBUF_DATA_COR_ERR, + C_RX_RBUF_DATA_UNC_ERR, + C_RX_DMA_DATA_FIFO_RD_COR_ERR, + C_RX_DMA_DATA_FIFO_RD_UNC_ERR, + C_RX_DMA_HDR_FIFO_RD_COR_ERR, + C_RX_DMA_HDR_FIFO_RD_UNC_ERR, + C_RX_RBUF_DESC_PART2_COR_ERR, + C_RX_RBUF_DESC_PART2_UNC_ERR, + C_RX_RBUF_DESC_PART1_COR_ERR, + C_RX_RBUF_DESC_PART1_UNC_ERR, + C_RX_HQ_INTR_FSM_ERR, + C_RX_HQ_INTR_CSR_PARITY_ERR, + C_RX_LOOKUP_CSR_PARITY_ERR, + C_RX_LOOKUP_RCV_ARRAY_COR_ERR, + C_RX_LOOKUP_RCV_ARRAY_UNC_ERR, + C_RX_LOOKUP_DES_PART2_PARITY_ERR, + C_RX_LOOKUP_DES_PART1_UNC_COR_ERR, + C_RX_LOOKUP_DES_PART1_UNC_ERR, + C_RX_RBUF_NEXT_FREE_BUF_COR_ERR, + C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR, + C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR, + C_RX_RBUF_FL_INITDONE_PARITY_ERR, + C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR, + C_RX_RBUF_FL_RD_ADDR_PARITY_ERR, + C_RX_RBUF_EMPTY_ERR, + C_RX_RBUF_FULL_ERR, + C_RX_RBUF_BAD_LOOKUP_ERR, + C_RX_RBUF_CTX_ID_PARITY_ERR, + C_RX_RBUF_CSR_QEOPDW_PARITY_ERR, + C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR, + C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR, + C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR, + C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR, + C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR, + C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR, + C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR, + C_RX_RBUF_BLOCK_LIST_READ_COR_ERR, + C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR, + C_RX_RBUF_LOOKUP_DES_COR_ERR, + C_RX_RBUF_LOOKUP_DES_UNC_ERR, + C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR, + C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR, + C_RX_RBUF_FREE_LIST_COR_ERR, + C_RX_RBUF_FREE_LIST_UNC_ERR, + C_RX_RCV_FSM_ENCODING_ERR, + C_RX_DMA_FLAG_COR_ERR, + C_RX_DMA_FLAG_UNC_ERR, + C_RX_DC_SOP_EOP_PARITY_ERR, + C_RX_RCV_CSR_PARITY_ERR, + C_RX_RCV_QP_MAP_TABLE_COR_ERR, + C_RX_RCV_QP_MAP_TABLE_UNC_ERR, + C_RX_RCV_DATA_COR_ERR, + C_RX_RCV_DATA_UNC_ERR, + C_RX_RCV_HDR_COR_ERR, + C_RX_RCV_HDR_UNC_ERR, + C_RX_DC_INTF_PARITY_ERR, + C_RX_DMA_CSR_COR_ERR, +/* SendPioErrStatus */ + C_PIO_PEC_SOP_HEAD_PARITY_ERR, + C_PIO_PCC_SOP_HEAD_PARITY_ERR, + C_PIO_LAST_RETURNED_CNT_PARITY_ERR, + C_PIO_CURRENT_FREE_CNT_PARITY_ERR, + C_PIO_RSVD_31_ERR, + C_PIO_RSVD_30_ERR, + C_PIO_PPMC_SOP_LEN_ERR, + C_PIO_PPMC_BQC_MEM_PARITY_ERR, + C_PIO_VL_FIFO_PARITY_ERR, + C_PIO_VLF_SOP_PARITY_ERR, + C_PIO_VLF_V1_LEN_PARITY_ERR, + C_PIO_BLOCK_QW_COUNT_PARITY_ERR, + C_PIO_WRITE_QW_VALID_PARITY_ERR, + C_PIO_STATE_MACHINE_ERR, + C_PIO_WRITE_DATA_PARITY_ERR, + C_PIO_HOST_ADDR_MEM_COR_ERR, + C_PIO_HOST_ADDR_MEM_UNC_ERR, + C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR, + C_PIO_INIT_SM_IN_ERR, + C_PIO_PPMC_PBL_FIFO_ERR, + C_PIO_CREDIT_RET_FIFO_PARITY_ERR, + C_PIO_V1_LEN_MEM_BANK1_COR_ERR, + C_PIO_V1_LEN_MEM_BANK0_COR_ERR, + C_PIO_V1_LEN_MEM_BANK1_UNC_ERR, + C_PIO_V1_LEN_MEM_BANK0_UNC_ERR, + C_PIO_SM_PKT_RESET_PARITY_ERR, + C_PIO_PKT_EVICT_FIFO_PARITY_ERR, + C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR, + C_PIO_SBRDCTL_CRREL_PARITY_ERR, + C_PIO_PEC_FIFO_PARITY_ERR, + C_PIO_PCC_FIFO_PARITY_ERR, + C_PIO_SB_MEM_FIFO1_ERR, + C_PIO_SB_MEM_FIFO0_ERR, + C_PIO_CSR_PARITY_ERR, + C_PIO_WRITE_ADDR_PARITY_ERR, + C_PIO_WRITE_BAD_CTXT_ERR, +/* SendDmaErrStatus */ + C_SDMA_PCIE_REQ_TRACKING_COR_ERR, + C_SDMA_PCIE_REQ_TRACKING_UNC_ERR, + C_SDMA_CSR_PARITY_ERR, + C_SDMA_RPY_TAG_ERR, +/* SendEgressErrStatus */ + C_TX_READ_PIO_MEMORY_CSR_UNC_ERR, + C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR, + C_TX_EGRESS_FIFO_COR_ERR, + C_TX_READ_PIO_MEMORY_COR_ERR, + C_TX_READ_SDMA_MEMORY_COR_ERR, + C_TX_SB_HDR_COR_ERR, + C_TX_CREDIT_OVERRUN_ERR, + C_TX_LAUNCH_FIFO8_COR_ERR, + C_TX_LAUNCH_FIFO7_COR_ERR, + C_TX_LAUNCH_FIFO6_COR_ERR, + C_TX_LAUNCH_FIFO5_COR_ERR, + C_TX_LAUNCH_FIFO4_COR_ERR, + C_TX_LAUNCH_FIFO3_COR_ERR, + C_TX_LAUNCH_FIFO2_COR_ERR, + C_TX_LAUNCH_FIFO1_COR_ERR, + C_TX_LAUNCH_FIFO0_COR_ERR, + C_TX_CREDIT_RETURN_VL_ERR, + C_TX_HCRC_INSERTION_ERR, + C_TX_EGRESS_FIFI_UNC_ERR, + C_TX_READ_PIO_MEMORY_UNC_ERR, + C_TX_READ_SDMA_MEMORY_UNC_ERR, + C_TX_SB_HDR_UNC_ERR, + C_TX_CREDIT_RETURN_PARITY_ERR, + C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR, + C_TX_SDMA15_DISALLOWED_PACKET_ERR, + C_TX_SDMA14_DISALLOWED_PACKET_ERR, + C_TX_SDMA13_DISALLOWED_PACKET_ERR, + C_TX_SDMA12_DISALLOWED_PACKET_ERR, + C_TX_SDMA11_DISALLOWED_PACKET_ERR, + C_TX_SDMA10_DISALLOWED_PACKET_ERR, + C_TX_SDMA9_DISALLOWED_PACKET_ERR, + C_TX_SDMA8_DISALLOWED_PACKET_ERR, + C_TX_SDMA7_DISALLOWED_PACKET_ERR, + C_TX_SDMA6_DISALLOWED_PACKET_ERR, + C_TX_SDMA5_DISALLOWED_PACKET_ERR, + C_TX_SDMA4_DISALLOWED_PACKET_ERR, + C_TX_SDMA3_DISALLOWED_PACKET_ERR, + C_TX_SDMA2_DISALLOWED_PACKET_ERR, + C_TX_SDMA1_DISALLOWED_PACKET_ERR, + C_TX_SDMA0_DISALLOWED_PACKET_ERR, + C_TX_CONFIG_PARITY_ERR, + C_TX_SBRD_CTL_CSR_PARITY_ERR, + C_TX_LAUNCH_CSR_PARITY_ERR, + C_TX_ILLEGAL_CL_ERR, + C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR, + C_TX_RESERVED_10, + C_TX_RESERVED_9, + C_TX_SDMA_LAUNCH_INTF_PARITY_ERR, + C_TX_PIO_LAUNCH_INTF_PARITY_ERR, + C_TX_RESERVED_6, + C_TX_INCORRECT_LINK_STATE_ERR, + C_TX_LINK_DOWN_ERR, + C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR, + C_TX_RESERVED_2, + C_TX_PKT_INTEGRITY_MEM_UNC_ERR, + C_TX_PKT_INTEGRITY_MEM_COR_ERR, +/* SendErrStatus */ + C_SEND_CSR_WRITE_BAD_ADDR_ERR, + C_SEND_CSR_READ_BAD_ADD_ERR, + C_SEND_CSR_PARITY_ERR, +/* SendCtxtErrStatus */ + C_PIO_WRITE_OUT_OF_BOUNDS_ERR, + C_PIO_WRITE_OVERFLOW_ERR, + C_PIO_WRITE_CROSSES_BOUNDARY_ERR, + C_PIO_DISALLOWED_PACKET_ERR, + C_PIO_INCONSISTENT_SOP_ERR, +/*SendDmaEngErrStatus */ + C_SDMA_HEADER_REQUEST_FIFO_COR_ERR, + C_SDMA_HEADER_STORAGE_COR_ERR, + C_SDMA_PACKET_TRACKING_COR_ERR, + C_SDMA_ASSEMBLY_COR_ERR, + C_SDMA_DESC_TABLE_COR_ERR, + C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR, + C_SDMA_HEADER_STORAGE_UNC_ERR, + C_SDMA_PACKET_TRACKING_UNC_ERR, + C_SDMA_ASSEMBLY_UNC_ERR, + C_SDMA_DESC_TABLE_UNC_ERR, + C_SDMA_TIMEOUT_ERR, + C_SDMA_HEADER_LENGTH_ERR, + C_SDMA_HEADER_ADDRESS_ERR, + C_SDMA_HEADER_SELECT_ERR, + C_SMDA_RESERVED_9, + C_SDMA_PACKET_DESC_OVERFLOW_ERR, + C_SDMA_LENGTH_MISMATCH_ERR, + C_SDMA_HALT_ERR, + C_SDMA_MEM_READ_ERR, + C_SDMA_FIRST_DESC_ERR, + C_SDMA_TAIL_OUT_OF_BOUNDS_ERR, + C_SDMA_TOO_LONG_ERR, + C_SDMA_GEN_MISMATCH_ERR, + C_SDMA_WRONG_DW_ERR, + DEV_CNTR_LAST /* Must be kept last */ +}; + +/* Per port counter indexes */ +enum { + C_TX_UNSUP_VL = 0, + C_TX_INVAL_LEN, + C_TX_MM_LEN_ERR, + C_TX_UNDERRUN, + C_TX_FLOW_STALL, + C_TX_DROPPED, + C_TX_HDR_ERR, + C_TX_PKT, + C_TX_WORDS, + C_TX_WAIT, + C_TX_FLIT_VL, + C_TX_PKT_VL, + C_TX_WAIT_VL, + C_RX_PKT, + C_RX_WORDS, + C_SW_LINK_DOWN, + C_SW_LINK_UP, + C_SW_UNKNOWN_FRAME, + C_SW_XMIT_DSCD, + C_SW_XMIT_DSCD_VL, + C_SW_XMIT_CSTR_ERR, + C_SW_RCV_CSTR_ERR, + C_SW_IBP_LOOP_PKTS, + C_SW_IBP_RC_RESENDS, + C_SW_IBP_RNR_NAKS, + C_SW_IBP_OTHER_NAKS, + C_SW_IBP_RC_TIMEOUTS, + C_SW_IBP_PKT_DROPS, + C_SW_IBP_DMA_WAIT, + C_SW_IBP_RC_SEQNAK, + C_SW_IBP_RC_DUPREQ, + C_SW_IBP_RDMA_SEQ, + C_SW_IBP_UNALIGNED, + C_SW_IBP_SEQ_NAK, + C_SW_IBP_RC_CRWAITS, + C_SW_CPU_RC_ACKS, + C_SW_CPU_RC_QACKS, + C_SW_CPU_RC_DELAYED_COMP, + C_RCV_HDR_OVF_0, + C_RCV_HDR_OVF_1, + C_RCV_HDR_OVF_2, + C_RCV_HDR_OVF_3, + C_RCV_HDR_OVF_4, + C_RCV_HDR_OVF_5, + C_RCV_HDR_OVF_6, + C_RCV_HDR_OVF_7, + C_RCV_HDR_OVF_8, + C_RCV_HDR_OVF_9, + C_RCV_HDR_OVF_10, + C_RCV_HDR_OVF_11, + C_RCV_HDR_OVF_12, + C_RCV_HDR_OVF_13, + C_RCV_HDR_OVF_14, + C_RCV_HDR_OVF_15, + C_RCV_HDR_OVF_16, + C_RCV_HDR_OVF_17, + C_RCV_HDR_OVF_18, + C_RCV_HDR_OVF_19, + C_RCV_HDR_OVF_20, + C_RCV_HDR_OVF_21, + C_RCV_HDR_OVF_22, + C_RCV_HDR_OVF_23, + C_RCV_HDR_OVF_24, + C_RCV_HDR_OVF_25, + C_RCV_HDR_OVF_26, + C_RCV_HDR_OVF_27, + C_RCV_HDR_OVF_28, + C_RCV_HDR_OVF_29, + C_RCV_HDR_OVF_30, + C_RCV_HDR_OVF_31, + C_RCV_HDR_OVF_32, + C_RCV_HDR_OVF_33, + C_RCV_HDR_OVF_34, + C_RCV_HDR_OVF_35, + C_RCV_HDR_OVF_36, + C_RCV_HDR_OVF_37, + C_RCV_HDR_OVF_38, + C_RCV_HDR_OVF_39, + C_RCV_HDR_OVF_40, + C_RCV_HDR_OVF_41, + C_RCV_HDR_OVF_42, + C_RCV_HDR_OVF_43, + C_RCV_HDR_OVF_44, + C_RCV_HDR_OVF_45, + C_RCV_HDR_OVF_46, + C_RCV_HDR_OVF_47, + C_RCV_HDR_OVF_48, + C_RCV_HDR_OVF_49, + C_RCV_HDR_OVF_50, + C_RCV_HDR_OVF_51, + C_RCV_HDR_OVF_52, + C_RCV_HDR_OVF_53, + C_RCV_HDR_OVF_54, + C_RCV_HDR_OVF_55, + C_RCV_HDR_OVF_56, + C_RCV_HDR_OVF_57, + C_RCV_HDR_OVF_58, + C_RCV_HDR_OVF_59, + C_RCV_HDR_OVF_60, + C_RCV_HDR_OVF_61, + C_RCV_HDR_OVF_62, + C_RCV_HDR_OVF_63, + C_RCV_HDR_OVF_64, + C_RCV_HDR_OVF_65, + C_RCV_HDR_OVF_66, + C_RCV_HDR_OVF_67, + C_RCV_HDR_OVF_68, + C_RCV_HDR_OVF_69, + C_RCV_HDR_OVF_70, + C_RCV_HDR_OVF_71, + C_RCV_HDR_OVF_72, + C_RCV_HDR_OVF_73, + C_RCV_HDR_OVF_74, + C_RCV_HDR_OVF_75, + C_RCV_HDR_OVF_76, + C_RCV_HDR_OVF_77, + C_RCV_HDR_OVF_78, + C_RCV_HDR_OVF_79, + C_RCV_HDR_OVF_80, + C_RCV_HDR_OVF_81, + C_RCV_HDR_OVF_82, + C_RCV_HDR_OVF_83, + C_RCV_HDR_OVF_84, + C_RCV_HDR_OVF_85, + C_RCV_HDR_OVF_86, + C_RCV_HDR_OVF_87, + C_RCV_HDR_OVF_88, + C_RCV_HDR_OVF_89, + C_RCV_HDR_OVF_90, + C_RCV_HDR_OVF_91, + C_RCV_HDR_OVF_92, + C_RCV_HDR_OVF_93, + C_RCV_HDR_OVF_94, + C_RCV_HDR_OVF_95, + C_RCV_HDR_OVF_96, + C_RCV_HDR_OVF_97, + C_RCV_HDR_OVF_98, + C_RCV_HDR_OVF_99, + C_RCV_HDR_OVF_100, + C_RCV_HDR_OVF_101, + C_RCV_HDR_OVF_102, + C_RCV_HDR_OVF_103, + C_RCV_HDR_OVF_104, + C_RCV_HDR_OVF_105, + C_RCV_HDR_OVF_106, + C_RCV_HDR_OVF_107, + C_RCV_HDR_OVF_108, + C_RCV_HDR_OVF_109, + C_RCV_HDR_OVF_110, + C_RCV_HDR_OVF_111, + C_RCV_HDR_OVF_112, + C_RCV_HDR_OVF_113, + C_RCV_HDR_OVF_114, + C_RCV_HDR_OVF_115, + C_RCV_HDR_OVF_116, + C_RCV_HDR_OVF_117, + C_RCV_HDR_OVF_118, + C_RCV_HDR_OVF_119, + C_RCV_HDR_OVF_120, + C_RCV_HDR_OVF_121, + C_RCV_HDR_OVF_122, + C_RCV_HDR_OVF_123, + C_RCV_HDR_OVF_124, + C_RCV_HDR_OVF_125, + C_RCV_HDR_OVF_126, + C_RCV_HDR_OVF_127, + C_RCV_HDR_OVF_128, + C_RCV_HDR_OVF_129, + C_RCV_HDR_OVF_130, + C_RCV_HDR_OVF_131, + C_RCV_HDR_OVF_132, + C_RCV_HDR_OVF_133, + C_RCV_HDR_OVF_134, + C_RCV_HDR_OVF_135, + C_RCV_HDR_OVF_136, + C_RCV_HDR_OVF_137, + C_RCV_HDR_OVF_138, + C_RCV_HDR_OVF_139, + C_RCV_HDR_OVF_140, + C_RCV_HDR_OVF_141, + C_RCV_HDR_OVF_142, + C_RCV_HDR_OVF_143, + C_RCV_HDR_OVF_144, + C_RCV_HDR_OVF_145, + C_RCV_HDR_OVF_146, + C_RCV_HDR_OVF_147, + C_RCV_HDR_OVF_148, + C_RCV_HDR_OVF_149, + C_RCV_HDR_OVF_150, + C_RCV_HDR_OVF_151, + C_RCV_HDR_OVF_152, + C_RCV_HDR_OVF_153, + C_RCV_HDR_OVF_154, + C_RCV_HDR_OVF_155, + C_RCV_HDR_OVF_156, + C_RCV_HDR_OVF_157, + C_RCV_HDR_OVF_158, + C_RCV_HDR_OVF_159, + PORT_CNTR_LAST /* Must be kept last */ +}; + +u64 get_all_cpu_total(u64 __percpu *cntr); +void hfi1_start_cleanup(struct hfi1_devdata *dd); +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); +void hfi1_init_ctxt(struct send_context *sc); +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order); +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd); +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, + struct hfi1_ctxtdata *rcd); +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp); +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp); +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which); +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val); +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 jkey); +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt, + u16 pkey); +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); +void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); +void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd); + +irqreturn_t general_interrupt(int irq, void *data); +irqreturn_t sdma_interrupt(int irq, void *data); +irqreturn_t receive_context_interrupt(int irq, void *data); +irqreturn_t receive_context_thread(int irq, void *data); +irqreturn_t receive_context_interrupt_napi(int irq, void *data); + +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); +void init_qsfp_int(struct hfi1_devdata *dd); +void clear_all_interrupts(struct hfi1_devdata *dd); +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); +void reset_interrupts(struct hfi1_devdata *dd); +u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx); +void hfi1_init_aip_rsm(struct hfi1_devdata *dd); +void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd); + +/* + * Interrupt source table. + * + * Each entry is an interrupt source "type". It is ordered by increasing + * number. + */ +struct is_table { + int start; /* interrupt source type start */ + int end; /* interrupt source type end */ + /* routine that returns the name of the interrupt source */ + char *(*is_name)(char *name, size_t size, unsigned int source); + /* routine to call when receiving an interrupt */ + void (*is_int)(struct hfi1_devdata *dd, unsigned int source); +}; + +#endif /* _CHIP_H */ diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h new file mode 100644 index 0000000000..95a8d530d5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -0,0 +1,1295 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#ifndef DEF_CHIP_REG +#define DEF_CHIP_REG + +#define CORE 0x000000000000 +#define CCE (CORE + 0x000000000000) +#define ASIC (CORE + 0x000000400000) +#define MISC (CORE + 0x000000500000) +#define DC_TOP_CSRS (CORE + 0x000000600000) +#define CHIP_DEBUG (CORE + 0x000000700000) +#define RXE (CORE + 0x000001000000) +#define TXE (CORE + 0x000001800000) +#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000) +#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000) +#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000) +#define PCIE 0 + +#define ASIC_NUM_SCRATCH 4 +#define CCE_ERR_INT_CNT 0 +#define CCE_MISC_INT_CNT 2 +#define CCE_NUM_32_BIT_COUNTERS 3 +#define CCE_NUM_32_BIT_INT_COUNTERS 6 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define CCE_NUM_MSIX_PBAS 4 +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_SCRATCH 4 +#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2 +#define CCE_PCIE_TRGT_STALL_CNT 0 +#define CCE_PIO_WR_STALL_CNT 1 +#define CCE_RCV_AVAIL_INT_CNT 3 +#define CCE_RCV_URGENT_INT_CNT 4 +#define CCE_SDMA_INT_CNT 1 +#define CCE_SEND_CREDIT_INT_CNT 5 +#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040) +#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0 +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull +#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008) +#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010) +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16 +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0 +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48 +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32 +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull +#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000) +#define DCC_CFG_RESET_RESET_LCB BIT_ULL(0) +#define DCC_CFG_RESET_RESET_TX_FPE BIT_ULL(1) +#define DCC_CFG_RESET_RESET_RX_FPE BIT_ULL(2) +#define DCC_CFG_RESET_RESET_8051 BIT_ULL(3) +#define DCC_CFG_RESET_ENABLE_CCLK_BCC BIT_ULL(4) +#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028) +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030) +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60 +#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120) +#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050) +#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull +#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull +#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull +#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull +#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull +#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull +#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull +#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull +#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull +#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull +#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull +#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull +#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull +#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060) +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull +#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull +#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058) +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull +#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull +#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull +#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull +#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull +#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull +#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull +#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull +#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull +#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull +#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull +#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull +#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull +#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull +#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull +#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull +#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull +#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull +#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull +#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull +#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull +#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull +#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull +#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull +#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110) +#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090) +#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078) +#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080) +#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088) +#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098) +#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108) +#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118) +#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100) +#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330) +#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290) +#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0) +#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140) +#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198) +#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240) +#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130) +#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8) +#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338) +#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298) +#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8) +#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0) +#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248) +#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8) +#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138) +#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190) +#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128) +#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0) +#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180) +#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188) +#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110) +#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull +#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118) +#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120) +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028) +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030) +#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038) +#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0 +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000) +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull +#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018) +#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010) +#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020) +#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068) +#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull +#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull +#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull +#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull +#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8) +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16 +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0 +#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8) +#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0) +#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0) +#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull +#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull +#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull +#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull +#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull +#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull +#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull +#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull +#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull +#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060) +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16 +#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0 +#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050) +#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull +#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058) +#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040) +#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048) +#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull +#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130) +#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull +#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128) +#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0 +#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058) +#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0 +#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020) +#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull +#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100) +#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120) +#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060) +#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8) +#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0 +#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000) +#define DC_LCB_CFG_RUN_EN_SHIFT 0 +#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018) +#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8 +#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4 +#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010) +#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008) +#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0 +#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030) +#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040) +#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110) +#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308) +#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310) +#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300) +#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull +#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull +#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull +#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull +#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull +#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull +#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull +#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull +#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull +#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull +#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull +#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull +#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull +#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull +#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull +#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull +#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull +#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull +#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull +#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull +#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328) +#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330) +#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338) +#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340) +#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348) +#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378) +#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390) +#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380) +#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358) +#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388) +#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360) +#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320) +#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350) +#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580) +#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8) +#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608) +#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600) +#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408) +#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420) +#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400) +#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410) +#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418) +#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468) +#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0) +#define RCV_LENGTH_ERR_CNT 0 +#define RCV_SHORT_ERR_CNT 2 +#define RCV_ICRC_ERR_CNT 6 +#define RCV_EBP_CNT 9 +#define RCV_BUF_OVFL_CNT 10 +#define RCV_CONTEXT_EGR_STALL 22 +#define RCV_DATA_PKT_CNT 0 +#define RCV_DWORD_CNT 1 +#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20 +#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23 +#define RCV_TID_FULL_ERR_CNT 18 +#define RCV_TID_VALID_ERR_CNT 19 +#define RXE_NUM_32_BIT_COUNTERS 24 +#define RXE_NUM_64_BIT_COUNTERS 2 +#define RXE_NUM_RSM_INSTANCES 4 +#define RXE_NUM_TID_FLOWS 32 +#define RXE_PER_CONTEXT_OFFSET 0x0300000 +#define SEND_DATA_PKT_CNT 0 +#define SEND_DATA_PKT_VL0_CNT 12 +#define SEND_DATA_VL0_CNT 3 +#define SEND_DROPPED_PKT_CNT 5 +#define SEND_DWORD_CNT 1 +#define SEND_FLOW_STALL_CNT 4 +#define SEND_HEADERS_ERR_CNT 6 +#define SEND_LEN_ERR_CNT 1 +#define SEND_MAX_MIN_LEN_ERR_CNT 2 +#define SEND_UNDERRUN_CNT 3 +#define SEND_UNSUP_VL_ERR_CNT 0 +#define SEND_WAIT_CNT 2 +#define SEND_WAIT_VL0_CNT 21 +#define TXE_PIO_SEND_OFFSET 0x0800000 +#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048) +#define ASIC_CFG_MUTEX (ASIC + 0x000000000040) +#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008) +#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull +#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull +#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000) +#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16 +#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8 +#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32 +#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0 +#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020) +#define ASIC_CFG_SCRATCH_1 (ASIC_CFG_SCRATCH + 0x08) +#define ASIC_CFG_SCRATCH_2 (ASIC_CFG_SCRATCH + 0x10) +#define ASIC_CFG_SCRATCH_3 (ASIC_CFG_SCRATCH + 0x18) +#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050) +#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308) +#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull +#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300) +#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull +#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8 +#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull +#define ASIC_EEP_DATA (ASIC + 0x000000000310) +#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230) +#define ASIC_GPIO_FORCE (ASIC + 0x000000000238) +#define ASIC_GPIO_IN (ASIC + 0x000000000200) +#define ASIC_GPIO_INVERT (ASIC + 0x000000000210) +#define ASIC_GPIO_MASK (ASIC + 0x000000000220) +#define ASIC_GPIO_OE (ASIC + 0x000000000208) +#define ASIC_GPIO_OUT (ASIC + 0x000000000218) +#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100) +#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0 +#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull +#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull +#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12 +#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108) +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110) +#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118) +#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180) +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16 +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128) +#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270) +#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278) +#define ASIC_QSFP1_IN (ASIC + 0x000000000240) +#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250) +#define ASIC_QSFP1_MASK (ASIC + 0x000000000260) +#define ASIC_QSFP1_OE (ASIC + 0x000000000248) +#define ASIC_QSFP1_OUT (ASIC + 0x000000000258) +#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268) +#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0) +#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8) +#define ASIC_QSFP2_IN (ASIC + 0x000000000280) +#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290) +#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0) +#define ASIC_QSFP2_OE (ASIC + 0x000000000288) +#define ASIC_QSFP2_OUT (ASIC + 0x000000000298) +#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8) +#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018) +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0 +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16 +#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010) +#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull +#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2 +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull +#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32 +#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull +#define ASIC_STS_THERM (ASIC + 0x000000000058) +#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18 +#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2 +#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_HI_TEMP_SHIFT 50 +#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_LO_TEMP_SHIFT 34 +#define ASIC_STS_THERM_LOW_SHIFT 13 +#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060) +#define CCE_CTRL (CCE + 0x000000000010) +#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull +#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull +#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull +#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull +#define CCE_DC_CTRL (CCE + 0x0000000000B8) +#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull +#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull +#define CCE_ERR_CLEAR (CCE + 0x000000000050) +#define CCE_ERR_MASK (CCE + 0x000000000048) +#define CCE_ERR_STATUS (CCE + 0x000000000040) +#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \ + 0x200ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \ + 0x800ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \ + 0x400ull +#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull +#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull +#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull +#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull +#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull +#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull +#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull +#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull +#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull +#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull +#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull +#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull +#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull +#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull +#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull +#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull +#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull +#define CCE_INT_CLEAR (CCE + 0x000000110A00) +#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00) +#define CCE_INT_FORCE (CCE + 0x000000110B00) +#define CCE_INT_MAP (CCE + 0x000000110500) +#define CCE_INT_MASK (CCE + 0x000000110900) +#define CCE_INT_STATUS (CCE + 0x000000110800) +#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200) +#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000) +#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008) +#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull +#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400) +#define CCE_PCIE_CTRL (CCE + 0x0000000000C0) +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0 +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2 +#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8 +#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13 +#define CCE_REVISION (CCE + 0x000000000000) +#define CCE_REVISION2 (CCE + 0x000000000008) +#define CCE_REVISION2_HFI_ID_MASK 0x1ull +#define CCE_REVISION2_HFI_ID_SHIFT 0 +#define CCE_REVISION2_IMPL_CODE_SHIFT 8 +#define CCE_REVISION2_IMPL_REVISION_SHIFT 16 +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32 +#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8 +#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0 +#define CCE_REVISION_SW_MASK 0xFFull +#define CCE_REVISION_SW_SHIFT 24 +#define CCE_SCRATCH (CCE + 0x000000000020) +#define CCE_STATUS (CCE + 0x000000000018) +#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull +#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull +#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull +#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull +#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull +#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull +#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull +#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull +#define MISC_CFG_FW_CTRL (MISC + 0x000000001000) +#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull +#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2 +#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull +#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08) +#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400) +#define MISC_CFG_RSA_MU (MISC + 0x000000000A10) +#define MISC_CFG_RSA_R2 (MISC + 0x000000000000) +#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200) +#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00) +#define MISC_ERR_CLEAR (MISC + 0x000000002010) +#define MISC_ERR_MASK (MISC + 0x000000002008) +#define MISC_ERR_STATUS (MISC + 0x000000002000) +#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull +#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull +#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull +#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull +#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull +#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull +#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull +#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull +#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull +#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull +#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull +#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0) +#define PCI_CFG_REG1 (PCIE + 0x000000000004) +#define PCI_CFG_REG11 (PCIE + 0x00000000002C) +#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C) +#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150) +#define PCIE_CFG_TPH2 (PCIE + 0x000000000180) +#define RCV_ARRAY (RXE + 0x000000200000) +#define RCV_ARRAY_CNT (RXE + 0x000000000018) +#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull +#define RCV_ARRAY_RT_ADDR_SHIFT 0 +#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36 +#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull +#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050) +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0 +#define RCV_BTH_QP (RXE + 0x000000000028) +#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull +#define RCV_BTH_QP_KDETH_QP_SHIFT 16 +#define RCV_BYPASS (RXE + 0x000000000038) +#define RCV_BYPASS_HDR_SIZE_SHIFT 16 +#define RCV_BYPASS_HDR_SIZE_MASK 0x1Full +#define RCV_BYPASS_HDR_SIZE_SMASK 0x1F0000ull +#define RCV_BYPASS_BYPASS_CONTEXT_SHIFT 0 +#define RCV_BYPASS_BYPASS_CONTEXT_MASK 0xFFull +#define RCV_BYPASS_BYPASS_CONTEXT_SMASK 0xFFull +#define RCV_CONTEXTS (RXE + 0x000000000010) +#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400) +#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500) +#define RCV_CTRL (RXE + 0x000000000000) +#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull +#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull +#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull +#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull +#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull +#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull +#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull +#define RCV_CTXT_CTRL (RXE + 0x000000100000) +#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull +#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8 +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull +#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull +#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull +#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull +#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull +#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull +#define RCV_CTXT_STATUS (RXE + 0x000000100008) +#define RCV_EGR_CTRL (RXE + 0x000000100010) +#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull +#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0 +#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull +#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32 +#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018) +#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull +#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0 +#define RCV_ERR_CLEAR (RXE + 0x000000000070) +#define RCV_ERR_INFO (RXE + 0x000000000050) +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull +#define RCV_ERR_MASK (RXE + 0x000000000068) +#define RCV_ERR_STATUS (RXE + 0x000000000060) +#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \ + 0x4000000000000000ull +#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull +#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull +#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull +#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull +#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \ + 0x40000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + 0x20000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + 0x800000000000000ull +#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + 0x400000000000000ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + 0x10000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + 0x20000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull +#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + 0x200000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + 0x8000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull +#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull +#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull +#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull +#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull +#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull +#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull +#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull +#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull +#define RCV_HDR_ADDR (RXE + 0x000000100028) +#define RCV_HDR_CNT (RXE + 0x000000100030) +#define RCV_HDR_CNT_CNT_MASK 0x1FFull +#define RCV_HDR_CNT_CNT_SHIFT 0 +#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038) +#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull +#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0 +#define RCV_HDR_HEAD (RXE + 0x000000300008) +#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull +#define RCV_HDR_HEAD_COUNTER_SHIFT 32 +#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull +#define RCV_HDR_HEAD_HEAD_SHIFT 0 +#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull +#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058) +#define RCV_HDR_SIZE (RXE + 0x000000100040) +#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full +#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0 +#define RCV_HDR_TAIL (RXE + 0x000000300000) +#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048) +#define RCV_KEY_CTRL (RXE + 0x000000100020) +#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0 +#define RCV_MULTICAST (RXE + 0x000000000030) +#define RCV_PARTITION_KEY (RXE + 0x000000000200) +#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull +#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16 +#define RCV_QP_MAP_TABLE (RXE + 0x000000000100) +#define RCV_RSM_CFG (RXE + 0x000000000600) +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 +#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_CFG_OFFSET_SHIFT 32 +#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) +#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull +#define RCV_RSM_MATCH (RXE + 0x000000000800) +#define RCV_RSM_MATCH_MASK1_SHIFT 0 +#define RCV_RSM_MATCH_MASK2_SHIFT 16 +#define RCV_RSM_MATCH_VALUE1_SHIFT 8 +#define RCV_RSM_MATCH_VALUE2_SHIFT 24 +#define RCV_RSM_SELECT (RXE + 0x000000000700) +#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0 +#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16 +#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32 +#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44 +#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48 +#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60 +#define RCV_STATUS (RXE + 0x000000000008) +#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull +#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull +#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull +#define RCV_TID_CTRL (RXE + 0x000000100018) +#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull +#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0 +#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull +#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32 +#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800) +#define RCV_VL15 (RXE + 0x000000000048) +#define SEND_BTH_QP (TXE + 0x0000000000A0) +#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull +#define SEND_BTH_QP_KDETH_QP_SHIFT 16 +#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510) +#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \ + 0x1000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \ + 0x2000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \ + 0x4000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \ + 0x10000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \ + 0x20000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \ + 0x40000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \ + 0x80000000000000ull +#define SEND_CM_CREDIT_VL (TXE + 0x000000000600) +#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678) +#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16 +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull +#define SEND_CM_CTRL (TXE + 0x000000000500) +#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull +#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull +#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508) +#define SEND_CM_GLOBAL_CREDIT_AU_MASK 0x7ull +#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16 +#define SEND_CM_GLOBAL_CREDIT_AU_SMASK 0x70000ull +#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0 +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32 +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull +#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520) +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48 +#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528) +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48 +#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530) +#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538) +#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518) +#define SEND_CONTEXTS (TXE + 0x000000000010) +#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200) +#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300) +#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400) +#define SEND_CTRL (TXE + 0x000000000000) +#define SEND_CTRL_CM_RESET_SMASK 0x4ull +#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) +#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) +#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \ + 0x200000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \ + 0x40000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \ + 0x4000ull +#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090) +#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull +#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8) +#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8 +#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098) +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0) +#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16 +#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088) +#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010) +#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull +#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0 +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull +#define SEND_CTXT_CREDIT_STATUS (TXE + 0x000000100018) +#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK 0x7FFull +#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT 32 +#define SEND_CTXT_CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK 0x7FFull +#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028) +#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull +#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020) +#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull +#define SEND_CTXT_CTRL (TXE + 0x000000100000) +#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull +#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32 +#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull +#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48 +#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull +#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050) +#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048) +#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040) +#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull +#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull +#define SEND_CTXT_STATUS (TXE + 0x000000100008) +#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull +#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010) +#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080) +#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull +#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090) +#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8) +#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098) +#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0) +#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16 +#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_DMA_CHECK_VL (TXE + 0x000000200088) +#define SEND_DMA_CTRL (TXE + 0x000000200000) +#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull +#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull +#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull +#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull +#define SEND_DMA_DESC_CNT (TXE + 0x000000200050) +#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull +#define SEND_DMA_DESC_CNT_CNT_SHIFT 0 +#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070) +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18 +#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068) +#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060) +#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \ + 0x40000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \ + 0x20000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \ + 0x100ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \ + 0x10000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull +#define SEND_DMA_ENGINES (TXE + 0x000000000018) +#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070) +#define SEND_DMA_ERR_MASK (TXE + 0x000000000068) +#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060) +#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull +#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull +#define SEND_DMA_HEAD (TXE + 0x000000200028) +#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030) +#define SEND_DMA_LEN_GEN (TXE + 0x000000200018) +#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16 +#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6 +#define SEND_DMA_MEMORY (TXE + 0x0000002000B0) +#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16 +#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0 +#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028) +#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038) +#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048) +#define SEND_DMA_STATUS (TXE + 0x000000200008) +#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull +#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull +#define SEND_DMA_TAIL (TXE + 0x000000200020) +#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800) +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090) +#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00) +#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull +#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull +#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull +#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull +#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull +#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull +#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull +#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull +#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull +#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull +#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088) +#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08) +#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080) +#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \ + 0x200000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \ + 0x20000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \ + 0x800000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \ + 0x2000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \ + 0x200000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \ + 0x8ull +#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \ + 0x400000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \ + 0x1000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \ + 0x2000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \ + 0x4000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \ + 0x8000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \ + 0x10000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \ + 0x20000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \ + 0x2000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \ + 0x40000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \ + 0x4000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \ + 0x80000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \ + 0x8000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \ + 0x100000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \ + 0x10000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \ + 0x1000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \ + 0x8000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \ + 0x100000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \ + 0x800000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \ + 0x4000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \ + 0x80000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \ + 0x800ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \ + 0x4000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \ + 0x8000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \ + 0x100000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \ + 0x200000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \ + 0x400000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \ + 0x800000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \ + 0x1000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \ + 0x2000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00) +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_ERR_CLEAR (TXE + 0x0000000000F0) +#define SEND_ERR_MASK (TXE + 0x0000000000E8) +#define SEND_ERR_STATUS (TXE + 0x0000000000E0) +#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull +#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030) +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0 +#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180) +#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0) +#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull +#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12 +#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8) +#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48 +#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12 +#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100) +#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull +#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16 +#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull +#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0 +#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050) +#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_MASK (TXE + 0x000000000048) +#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040) +#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + 0x1000000ull +#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull +#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull +#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull +#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull +#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull +#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038) +#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8 +#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull +#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull +#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull +#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020) +#define SEND_SC2VLT0 (TXE + 0x0000000000B0) +#define SEND_SC2VLT0_SC0_SHIFT 0 +#define SEND_SC2VLT0_SC1_SHIFT 8 +#define SEND_SC2VLT0_SC2_SHIFT 16 +#define SEND_SC2VLT0_SC3_SHIFT 24 +#define SEND_SC2VLT0_SC4_SHIFT 32 +#define SEND_SC2VLT0_SC5_SHIFT 40 +#define SEND_SC2VLT0_SC6_SHIFT 48 +#define SEND_SC2VLT0_SC7_SHIFT 56 +#define SEND_SC2VLT1 (TXE + 0x0000000000B8) +#define SEND_SC2VLT1_SC10_SHIFT 16 +#define SEND_SC2VLT1_SC11_SHIFT 24 +#define SEND_SC2VLT1_SC12_SHIFT 32 +#define SEND_SC2VLT1_SC13_SHIFT 40 +#define SEND_SC2VLT1_SC14_SHIFT 48 +#define SEND_SC2VLT1_SC15_SHIFT 56 +#define SEND_SC2VLT1_SC8_SHIFT 0 +#define SEND_SC2VLT1_SC9_SHIFT 8 +#define SEND_SC2VLT2 (TXE + 0x0000000000C0) +#define SEND_SC2VLT2_SC16_SHIFT 0 +#define SEND_SC2VLT2_SC17_SHIFT 8 +#define SEND_SC2VLT2_SC18_SHIFT 16 +#define SEND_SC2VLT2_SC19_SHIFT 24 +#define SEND_SC2VLT2_SC20_SHIFT 32 +#define SEND_SC2VLT2_SC21_SHIFT 40 +#define SEND_SC2VLT2_SC22_SHIFT 48 +#define SEND_SC2VLT2_SC23_SHIFT 56 +#define SEND_SC2VLT3 (TXE + 0x0000000000C8) +#define SEND_SC2VLT3_SC24_SHIFT 0 +#define SEND_SC2VLT3_SC25_SHIFT 8 +#define SEND_SC2VLT3_SC26_SHIFT 16 +#define SEND_SC2VLT3_SC27_SHIFT 24 +#define SEND_SC2VLT3_SC28_SHIFT 32 +#define SEND_SC2VLT3_SC29_SHIFT 40 +#define SEND_SC2VLT3_SC30_SHIFT 48 +#define SEND_SC2VLT3_SC31_SHIFT 56 +#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8) +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0 +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull +#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708) +#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C) +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27 +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000 +#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898) +#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12 +#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6 +#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0 +#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C) +#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4) +#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull +#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24 +#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890) +#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull +#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894) +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6 +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0 +#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8) +#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8 +#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull +#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull +#define CCE_INT_BLOCKED (CCE + 0x000000110C00) +#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040) +#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058) +#define CCE_MSIX_PBA_OFFSET 0X0110000 + +#endif /* DEF_CHIP_REG */ diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h new file mode 100644 index 0000000000..166ad6b828 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/common.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ + +#ifndef _COMMON_H +#define _COMMON_H + +#include <rdma/hfi/hfi1_user.h> + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in + * fast path code + * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in fast path code + * _HFI1_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* driver/hw feature set bitmask */ +#define HFI1_CAP_USER_SHIFT 24 +#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1) +/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */ +#define HFI1_CAP_LOCKED_SHIFT 63 +#define HFI1_CAP_LOCKED_MASK 0x1ULL +#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT) +/* extra bits used between kernel and user processes */ +#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2) +#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \ + HFI1_CAP_MISC_SHIFT)) - 1) + +#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; }) +#define HFI1_CAP_KCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~HFI1_CAP_##cap; \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_USET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_UCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_SET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \ + HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_CLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap | \ + (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_LOCK() \ + ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; }) +#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK)) +/* + * The set of capability bits that can be changed after initial load + * This set is the same for kernel and user contexts. However, for + * user contexts, the set can be further filtered by using the + * HFI1_CAP_RESERVED_MASK bits. + */ +#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \ + HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_ALLOW_PERM_JKEY | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_TID_UNMAP | \ + HFI1_CAP_OPFN) +/* + * A set of capability bits that are "global" and are not allowed to be + * set in the user bitmask. + */ +#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \ + HFI1_CAP_USE_SDMA_HEAD | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_NO_INTEGRITY | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_TID_RDMA | \ + HFI1_CAP_OPFN | \ + HFI1_CAP_AIP) << \ + HFI1_CAP_USER_SHIFT) +/* + * Set of capabilities that need to be enabled for kernel context in + * order to be allowed for user contexts, as well. + */ +#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL) +/* Default enabled capabilities (both kernel and user) */ +#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_SDMA | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_AIP | \ + ((HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_EARLY_CREDIT_RETURN) << \ + HFI1_CAP_USER_SHIFT)) +/* + * A bitmask of kernel/global capabilities that should be communicated + * to user level processes. + */ +#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_NO_INTEGRITY) + +#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \ + HFI1_USER_SWMINOR) + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. + */ + +/* + * Receive Header Flags + */ +#define RHF_PKT_LEN_SHIFT 0 +#define RHF_PKT_LEN_MASK 0xfffull +#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT) + +#define RHF_RCV_TYPE_SHIFT 12 +#define RHF_RCV_TYPE_MASK 0x7ull +#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT) + +#define RHF_USE_EGR_BFR_SHIFT 15 +#define RHF_USE_EGR_BFR_MASK 0x1ull +#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT) + +#define RHF_EGR_INDEX_SHIFT 16 +#define RHF_EGR_INDEX_MASK 0x7ffull +#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT) + +#define RHF_DC_INFO_SHIFT 27 +#define RHF_DC_INFO_MASK 0x1ull +#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT) + +#define RHF_RCV_SEQ_SHIFT 28 +#define RHF_RCV_SEQ_MASK 0xfull +#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT) + +#define RHF_EGR_OFFSET_SHIFT 32 +#define RHF_EGR_OFFSET_MASK 0xfffull +#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT) +#define RHF_HDRQ_OFFSET_SHIFT 44 +#define RHF_HDRQ_OFFSET_MASK 0x1ffull +#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT) +#define RHF_K_HDR_LEN_ERR (0x1ull << 53) +#define RHF_DC_UNC_ERR (0x1ull << 54) +#define RHF_DC_ERR (0x1ull << 55) +#define RHF_RCV_TYPE_ERR_SHIFT 56 +#define RHF_RCV_TYPE_ERR_MASK 0x7ul +#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT) +#define RHF_TID_ERR (0x1ull << 59) +#define RHF_LEN_ERR (0x1ull << 60) +#define RHF_ECC_ERR (0x1ull << 61) +#define RHF_RESERVED (0x1ull << 62) +#define RHF_ICRC_ERR (0x1ull << 63) + +#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */ + +/* RHF receive types */ +#define RHF_RCV_TYPE_EXPECTED 0 +#define RHF_RCV_TYPE_EAGER 1 +#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */ +#define RHF_RCV_TYPE_ERROR 3 +#define RHF_RCV_TYPE_BYPASS 4 +#define RHF_RCV_TYPE_INVALID5 5 +#define RHF_RCV_TYPE_INVALID6 6 +#define RHF_RCV_TYPE_INVALID7 7 + +/* RHF receive type error - expected packet errors */ +#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2 +#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4 + +/* RHF receive type error - eager packet errors */ +#define RHF_RTE_EAGER_NO_ERR 0x0 + +/* RHF receive type error - IB packet errors */ +#define RHF_RTE_IB_NO_ERR 0x0 + +/* RHF receive type error - error packet errors */ +#define RHF_RTE_ERROR_NO_ERR 0x0 +#define RHF_RTE_ERROR_OP_CODE_ERR 0x1 +#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2 +#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3 +#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4 +#define RHF_RTE_ERROR_CONTEXT_ERR 0x5 +#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6 + +/* RHF receive type error - bypass packet errors */ +#define RHF_RTE_BYPASS_NO_ERR 0x0 + +/* MAX RcvSEQ */ +#define RHF_MAX_SEQ 13 + +/* IB - LRH header constants */ +#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SC15_PACKET 0xF +#define SIZE_OF_CRC 1 +#define SIZE_OF_LT 1 +#define MAX_16B_PADDING 12 /* CRC = 4, LT = 1, Pad = 0 to 7 bytes */ + +#define LIM_MGMT_P_KEY 0x7FFF +#define FULL_MGMT_P_KEY 0xFFFF + +#define DEFAULT_P_KEY LIM_MGMT_P_KEY + +#define HFI1_PSM_IOC_BASE_SEQ 0x0 + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define HFI1_KDETH_BTH_SEQ_SHIFT 11 +#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1) + +static inline __u64 rhf_to_cpu(const __le32 *rbuf) +{ + return __le64_to_cpu(*((__le64 *)rbuf)); +} + +static inline u64 rhf_err_flags(u64 rhf) +{ + return rhf & RHF_ERROR_SMASK; +} + +static inline u32 rhf_rcv_type(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK; +} + +static inline u32 rhf_rcv_type_err(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK; +} + +/* return size is in bytes, not DWORDs */ +static inline u32 rhf_pkt_len(u64 rhf) +{ + return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2; +} + +static inline u32 rhf_egr_index(u64 rhf) +{ + return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK; +} + +static inline u32 rhf_rcv_seq(u64 rhf) +{ + return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK; +} + +/* returned offset is in DWORDS */ +static inline u32 rhf_hdrq_offset(u64 rhf) +{ + return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK; +} + +static inline u64 rhf_use_egr_bfr(u64 rhf) +{ + return rhf & RHF_USE_EGR_BFR_SMASK; +} + +static inline u64 rhf_dc_info(u64 rhf) +{ + return rhf & RHF_DC_INFO_SMASK; +} + +static inline u32 rhf_egr_buf_offset(u64 rhf) +{ + return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK; +} +#endif /* _COMMON_H */ diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c new file mode 100644 index 0000000000..80ba1e53c0 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -0,0 +1,1363 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015-2018 Intel Corporation. + */ + +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ratelimit.h> +#include <linux/fault-inject.h> + +#include "hfi.h" +#include "trace.h" +#include "debugfs.h" +#include "device.h" +#include "qp.h" +#include "sdma.h" +#include "fault.h" + +static struct dentry *hfi1_dbg_root; + +/* wrappers to enforce srcu in seq file */ +ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct dentry *d = file->f_path.dentry; + ssize_t r; + + r = debugfs_file_get(d); + if (unlikely(r)) + return r; + r = seq_read(file, buf, size, ppos); + debugfs_file_put(d); + return r; +} + +loff_t hfi1_seq_lseek(struct file *file, loff_t offset, int whence) +{ + struct dentry *d = file->f_path.dentry; + loff_t r; + + r = debugfs_file_get(d); + if (unlikely(r)) + return r; + r = seq_lseek(file, offset, whence); + debugfs_file_put(d); + return r; +} + +#define private2dd(file) (file_inode(file)->i_private) +#define private2ppd(file) (file_inode(file)->i_private) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int opcode_stats_show(struct seq_file *s, u8 i, u64 packets, u64 bytes) +{ + if (!packets && !bytes) + return SEQ_SKIP; + seq_printf(s, "%02x %llu/%llu\n", i, + (unsigned long long)packets, + (unsigned long long)bytes); + + return 0; +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); + } + return opcode_stats_show(s, i, n_packets, n_bytes); +} + +DEBUGFS_SEQ_FILE_OPS(opcode_stats); +DEBUGFS_SEQ_FILE_OPEN(opcode_stats) +DEBUGFS_FILE_OPS(opcode_stats); + +static void *_tx_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + return _opcode_stats_seq_start(s, pos); +} + +static void *_tx_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + return _opcode_stats_seq_next(s, v, pos); +} + +static void _tx_opcode_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _tx_opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos; + int j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + for_each_possible_cpu(j) { + struct hfi1_opcode_stats_perctx *s = + per_cpu_ptr(dd->tx_opstats, j); + n_packets += s->stats[i].n_packets; + n_bytes += s->stats[i].n_bytes; + } + return opcode_stats_show(s, i, n_packets, n_bytes); +} + +DEBUGFS_SEQ_FILE_OPS(tx_opcode_stats); +DEBUGFS_SEQ_FILE_OPEN(tx_opcode_stats) +DEBUGFS_FILE_OPS(tx_opcode_stats); + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_dyn_alloc_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_dyn_alloc_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + rcd = hfi1_rcd_get_by_index_safe(dd, i); + if (!rcd) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(rcd->opstats->stats); j++) + n_packets += rcd->opstats->stats[j].n_packets; + + hfi1_rcd_put(rcd); + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(ctx_stats); +DEBUGFS_SEQ_FILE_OPEN(ctx_stats) +DEBUGFS_FILE_OPS(ctx_stats); + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) + __acquires(RCU) +{ + struct rvt_qp_iter *iter; + loff_t n = *pos; + + iter = rvt_qp_iter_init(s->private, 0, NULL); + + /* stop calls rcu_read_unlock */ + rcu_read_lock(); + + if (!iter) + return NULL; + + do { + if (rvt_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } while (n--); + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) + __must_hold(RCU) +{ + struct rvt_qp_iter *iter = iter_ptr; + + (*pos)++; + + if (rvt_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct rvt_qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(qp_stats); +DEBUGFS_SEQ_FILE_OPEN(qp_stats) +DEBUGFS_FILE_OPS(qp_stats); + +static void *_sdes_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void _sdes_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _sdes_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + loff_t *spos = v; + loff_t i = *spos; + + sdma_seqfile_dump_sde(s, &dd->per_sdma[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(sdes); +DEBUGFS_SEQ_FILE_OPEN(sdes) +DEBUGFS_FILE_OPS(sdes); + +static void *_rcds_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->rcd || *pos >= dd->n_krcv_queues) + return NULL; + return pos; +} + +static void *_rcds_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->rcd || *pos >= dd->num_rcv_contexts) + return NULL; + return pos; +} + +static void _rcds_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _rcds_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + loff_t *spos = v; + loff_t i = *spos; + + rcd = hfi1_rcd_get_by_index_safe(dd, i); + if (rcd) + seqfile_dump_rcd(s, rcd); + hfi1_rcd_put(rcd); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(rcds); +DEBUGFS_SEQ_FILE_OPEN(rcds) +DEBUGFS_FILE_OPS(rcds); + +static void *_pios_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->send_contexts || *pos >= dd->num_send_contexts) + return NULL; + return pos; +} + +static void *_pios_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->send_contexts || *pos >= dd->num_send_contexts) + return NULL; + return pos; +} + +static void _pios_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _pios_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct send_context_info *sci; + loff_t *spos = v; + loff_t i = *spos; + unsigned long flags; + + spin_lock_irqsave(&dd->sc_lock, flags); + sci = &dd->send_contexts[i]; + if (sci && sci->type != SC_USER && sci->allocated && sci->sc) + seqfile_dump_sci(s, i, sci); + spin_unlock_irqrestore(&dd->sc_lock, flags); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(pios); +DEBUGFS_SEQ_FILE_OPEN(pios) +DEBUGFS_FILE_OPS(pios); + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + return rval; +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + return rval; +} + +struct counter_info { + char *name; + const struct file_operations ops; +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... + */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + dd = private2dd(file); + avail = hfi1_read_portcntrs(dd->pport, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + return rval; +} + +/* read the per-port counters */ +static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_pportdata *ppd; + ssize_t rval; + + ppd = private2ppd(file); + avail = hfi1_read_portcntrs(ppd, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + return rval; +} + +static void check_dyn_flag(u64 scratch0, char *p, int size, int *used, + int this_hfi, int hfi, u32 flag, const char *what) +{ + u32 mask; + + mask = flag << (hfi ? CR_DYN_SHIFT : 0); + if (scratch0 & mask) { + *used += scnprintf(p + *used, size - *used, + " 0x%08x - HFI%d %s in use, %s device\n", + mask, hfi, what, + this_hfi == hfi ? "this" : "other"); + } +} + +static ssize_t asic_flags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u64 scratch0; + char *tmp; + int ret = 0; + int size; + int used; + int i; + + ppd = private2ppd(file); + dd = ppd->dd; + size = PAGE_SIZE; + used = 0; + tmp = kmalloc(size, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + used += scnprintf(tmp + used, size - used, + "Resource flags: 0x%016llx\n", scratch0); + + /* check permanent flag */ + if (scratch0 & CR_THERM_INIT) { + used += scnprintf(tmp + used, size - used, + " 0x%08x - thermal monitoring initialized\n", + (u32)CR_THERM_INIT); + } + + /* check each dynamic flag on each HFI */ + for (i = 0; i < 2; i++) { + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_SBUS, "SBus"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_EPROM, "EPROM"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C1, "i2c chain 1"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C2, "i2c chain 2"); + } + used += scnprintf(tmp + used, size - used, "Write bits to clear\n"); + + ret = simple_read_from_buffer(buf, count, ppos, tmp, used); + kfree(tmp); + return ret; +} + +static ssize_t asic_flags_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + char *buff; + int ret; + unsigned long long value; + u64 scratch0; + u64 clear; + + ppd = private2ppd(file); + dd = ppd->dd; + + /* zero terminate and read the expected integer */ + buff = memdup_user_nul(buf, count); + if (IS_ERR(buff)) + return PTR_ERR(buff); + + ret = kstrtoull(buff, 0, &value); + if (ret) + goto do_free; + clear = value; + + /* obtain exclusive access */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + acquire_hw_mutex(dd); + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~clear; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + mutex_unlock(&dd->asic_data->asic_resource_mutex); + + /* return the number of bytes written */ + ret = count; + + do_free: + kfree(buff); + return ret; +} + +/* read the dc8051 memory */ +static ssize_t dc8051_memory_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + ssize_t rval; + void *tmp; + loff_t start, end; + + /* the checks below expect the position to be positive */ + if (*ppos < 0) + return -EINVAL; + + tmp = kzalloc(DC8051_DATA_MEM_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + /* + * Fill in the requested portion of the temporary buffer from the + * 8051 memory. The 8051 memory read is done in terms of 8 bytes. + * Adjust start and end to fit. Skip reading anything if out of + * range. + */ + start = *ppos & ~0x7; /* round down */ + if (start < DC8051_DATA_MEM_SIZE) { + end = (*ppos + count + 7) & ~0x7; /* round up */ + if (end > DC8051_DATA_MEM_SIZE) + end = DC8051_DATA_MEM_SIZE; + rval = read_8051_data(ppd->dd, start, end - start, + (u64 *)(tmp + start)); + if (rval) + goto done; + } + + rval = simple_read_from_buffer(buf, count, ppos, tmp, + DC8051_DATA_MEM_SIZE); +done: + kfree(tmp); + return rval; +} + +static ssize_t debugfs_lcb_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + struct hfi1_devdata *dd = ppd->dd; + unsigned long total, csr_off; + u64 data; + + if (*ppos < 0) + return -EINVAL; + /* only read 8 byte quantities */ + if ((count % 8) != 0) + return -EINVAL; + /* offset must be 8-byte aligned */ + if ((*ppos % 8) != 0) + return -EINVAL; + /* do nothing if out of range or zero count */ + if (*ppos >= (LCB_END - LCB_START) || !count) + return 0; + /* reduce count if needed */ + if (*ppos + count > LCB_END - LCB_START) + count = (LCB_END - LCB_START) - *ppos; + + csr_off = LCB_START + *ppos; + for (total = 0; total < count; total += 8, csr_off += 8) { + if (read_lcb_csr(dd, csr_off, (u64 *)&data)) + break; /* failed */ + if (put_user(data, (unsigned long __user *)(buf + total))) + break; + } + *ppos += total; + return total; +} + +static ssize_t debugfs_lcb_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + struct hfi1_devdata *dd = ppd->dd; + unsigned long total, csr_off, data; + + if (*ppos < 0) + return -EINVAL; + /* only write 8 byte quantities */ + if ((count % 8) != 0) + return -EINVAL; + /* offset must be 8-byte aligned */ + if ((*ppos % 8) != 0) + return -EINVAL; + /* do nothing if out of range or zero count */ + if (*ppos >= (LCB_END - LCB_START) || !count) + return 0; + /* reduce count if needed */ + if (*ppos + count > LCB_END - LCB_START) + count = (LCB_END - LCB_START) - *ppos; + + csr_off = LCB_START + *ppos; + for (total = 0; total < count; total += 8, csr_off += 8) { + if (get_user(data, (unsigned long __user *)(buf + total))) + break; + if (write_lcb_csr(dd, csr_off, data)) + break; /* failed */ + } + *ppos += total; + return total; +} + +/* + * read the per-port QSFP data for ppd + */ +static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + char *tmp; + int ret; + + ppd = private2ppd(file); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qsfp_dump(ppd, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +/* Do an i2c write operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_written; + + ppd = private2ppd(file); + + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) + return -EINVAL; + + buff = memdup_user(buf, count); + if (IS_ERR(buff)) + return PTR_ERR(buff); + + total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + return ret; +} + +/* Do an i2c write operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do an i2c write operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do an i2c read operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_read; + + ppd = private2ppd(file); + + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) + return -EINVAL; + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) + return -ENOMEM; + + total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + return ret; +} + +/* Do an i2c read operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do an i2c read operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 1); +} + +/* Do a QSFP write operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_written; + + if (*ppos + count > QSFP_PAGESIZE * 4) /* base page + page00-page03 */ + return -EINVAL; + + ppd = private2ppd(file); + + buff = memdup_user(buf, count); + if (IS_ERR(buff)) + return PTR_ERR(buff); + + total_written = qsfp_write(ppd, target, *ppos, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + return ret; +} + +/* Do a QSFP write operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do a QSFP write operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do a QSFP read operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_read; + + if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */ + ret = -EINVAL; + goto _return; + } + + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + total_read = qsfp_read(ppd, target, *ppos, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + _return: + return ret; +} + +/* Do a QSFP read operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do a QSFP read operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 1); +} + +static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + return acquire_chip_resource(ppd->dd, i2c_target(target), 0); +} + +static int i2c1_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 0); +} + +static int i2c2_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 1); +} + +static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + + return 0; +} + +static int i2c1_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 0); +} + +static int i2c2_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 1); +} + +static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + return acquire_chip_resource(ppd->dd, i2c_target(target), 0); +} + +static int qsfp1_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 0); +} + +static int qsfp2_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 1); +} + +static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + + return 0; +} + +static int qsfp1_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 0); +} + +static int qsfp2_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 1); +} + +#define EXPROM_WRITE_ENABLE BIT_ULL(14) + +static bool exprom_wp_disabled; + +static int exprom_wp_set(struct hfi1_devdata *dd, bool disable) +{ + u64 gpio_val = 0; + + if (disable) { + gpio_val = EXPROM_WRITE_ENABLE; + exprom_wp_disabled = true; + dd_dev_info(dd, "Disable Expansion ROM Write Protection\n"); + } else { + exprom_wp_disabled = false; + dd_dev_info(dd, "Enable Expansion ROM Write Protection\n"); + } + + write_csr(dd, ASIC_GPIO_OUT, gpio_val); + write_csr(dd, ASIC_GPIO_OE, gpio_val); + + return 0; +} + +static ssize_t exprom_wp_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t exprom_wp_debugfs_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + char cdata; + + if (count != 1) + return -EINVAL; + if (get_user(cdata, buf)) + return -EFAULT; + if (cdata == '0') + exprom_wp_set(ppd->dd, false); + else if (cdata == '1') + exprom_wp_set(ppd->dd, true); + else + return -EINVAL; + + return 1; +} + +static unsigned long exprom_in_use; + +static int exprom_wp_debugfs_open(struct inode *in, struct file *fp) +{ + if (test_and_set_bit(0, &exprom_in_use)) + return -EBUSY; + + return 0; +} + +static int exprom_wp_debugfs_release(struct inode *in, struct file *fp) +{ + struct hfi1_pportdata *ppd = private2ppd(fp); + + if (exprom_wp_disabled) + exprom_wp_set(ppd->dd, false); + clear_bit(0, &exprom_in_use); + + return 0; +} + +#define DEBUGFS_OPS(nm, readroutine, writeroutine) \ +{ \ + .name = nm, \ + .ops = { \ + .owner = THIS_MODULE, \ + .read = readroutine, \ + .write = writeroutine, \ + .llseek = generic_file_llseek, \ + }, \ +} + +#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \ +{ \ + .name = nm, \ + .ops = { \ + .owner = THIS_MODULE, \ + .read = readf, \ + .write = writef, \ + .llseek = generic_file_llseek, \ + .open = openf, \ + .release = releasef \ + }, \ +} + +static const struct counter_info cntr_ops[] = { + DEBUGFS_OPS("counter_names", dev_names_read, NULL), + DEBUGFS_OPS("counters", dev_counters_read, NULL), + DEBUGFS_OPS("portcounter_names", portnames_read, NULL), +}; + +static const struct counter_info port_cntr_ops[] = { + DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL), + DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write, + i2c1_debugfs_open, i2c1_debugfs_release), + DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write, + i2c2_debugfs_open, i2c2_debugfs_release), + DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL), + DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write, + qsfp1_debugfs_open, qsfp1_debugfs_release), + DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write, + qsfp2_debugfs_open, qsfp2_debugfs_release), + DEBUGFS_XOPS("exprom_wp", exprom_wp_debugfs_read, + exprom_wp_debugfs_write, exprom_wp_debugfs_open, + exprom_wp_debugfs_release), + DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write), + DEBUGFS_OPS("dc8051_memory", dc8051_memory_read, NULL), + DEBUGFS_OPS("lcb", debugfs_lcb_read, debugfs_lcb_write), +}; + +static void *_sdma_cpu_list_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos >= num_online_cpus()) + return NULL; + + return pos; +} + +static void *_sdma_cpu_list_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= num_online_cpus()) + return NULL; + + return pos; +} + +static void _sdma_cpu_list_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _sdma_cpu_list_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + loff_t *spos = v; + loff_t i = *spos; + + sdma_seqfile_dump_cpu_list(s, dd, (unsigned long)i); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(sdma_cpu_list); +DEBUGFS_SEQ_FILE_OPEN(sdma_cpu_list) +DEBUGFS_FILE_OPS(sdma_cpu_list); + +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ + char name[sizeof("port0counters") + 1]; + char link[10]; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_pportdata *ppd; + struct dentry *root; + int unit = dd->unit; + int i, j; + + if (!hfi1_dbg_root) + return; + snprintf(name, sizeof(name), "%s_%d", class_name(), unit); + snprintf(link, sizeof(link), "%d", unit); + root = debugfs_create_dir(name, hfi1_dbg_root); + ibd->hfi1_ibdev_dbg = root; + + ibd->hfi1_ibdev_link = + debugfs_create_symlink(link, hfi1_dbg_root, name); + + debugfs_create_file("opcode_stats", 0444, root, ibd, + &_opcode_stats_file_ops); + debugfs_create_file("tx_opcode_stats", 0444, root, ibd, + &_tx_opcode_stats_file_ops); + debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops); + debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops); + debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops); + debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops); + debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops); + debugfs_create_file("sdma_cpu_list", 0444, root, ibd, + &_sdma_cpu_list_file_ops); + + /* dev counter files */ + for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) + debugfs_create_file(cntr_ops[i].name, 0444, root, dd, + &cntr_ops[i].ops); + + /* per port files */ + for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++) + for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) { + snprintf(name, + sizeof(name), + port_cntr_ops[i].name, + j + 1); + debugfs_create_file(name, + !port_cntr_ops[i].ops.write ? + S_IRUGO : + S_IRUGO | S_IWUSR, + root, ppd, &port_cntr_ops[i].ops); + } + + hfi1_fault_init_debugfs(ibd); +} + +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ + if (!hfi1_dbg_root) + goto out; + hfi1_fault_exit_debugfs(ibd); + debugfs_remove(ibd->hfi1_ibdev_link); + debugfs_remove_recursive(ibd->hfi1_ibdev_dbg); +out: + ibd->hfi1_ibdev_dbg = NULL; +} + +/* + * driver stats field names, one line per stat, single string. Used by + * programs like hfistats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if hfi1_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by hfistats utility. + */ +static const char * const hfi1_statnames[] = { + /* must be element 0*/ + "KernIntr", + "ErrorIntr", + "Tx_Errs", + "Rcv_Errs", + "H/W_Errs", + "NoPIOBufs", + "CtxtsOpen", + "RcvLen_Errs", + "EgrBufFull", + "EgrHdrFull" +}; + +static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_names_seq_next( + struct seq_file *s, + void *v, + loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_names_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _driver_stats_names_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + + seq_printf(s, "%s\n", hfi1_statnames[*spos]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats_names); +DEBUGFS_SEQ_FILE_OPEN(driver_stats_names) +DEBUGFS_FILE_OPS(driver_stats_names); + +static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static void hfi1_sps_show_ints(struct seq_file *s) +{ + unsigned long index, flags; + struct hfi1_devdata *dd; + u64 sps_ints = 0; + + xa_lock_irqsave(&hfi1_dev_table, flags); + xa_for_each(&hfi1_dev_table, index, dd) { + sps_ints += get_all_cpu_total(dd->int_counter); + } + xa_unlock_irqrestore(&hfi1_dev_table, flags); + seq_write(s, &sps_ints, sizeof(u64)); +} + +static int _driver_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + u64 *stats = (u64 *)&hfi1_stats; + + /* special case for interrupts */ + if (*spos == 0) + hfi1_sps_show_ints(s); + else + seq_write(s, stats + *spos, sizeof(u64)); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats); +DEBUGFS_SEQ_FILE_OPEN(driver_stats) +DEBUGFS_FILE_OPS(driver_stats); + +void hfi1_dbg_init(void) +{ + hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); + debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL, + &_driver_stats_names_file_ops); + debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL, + &_driver_stats_file_ops); +} + +void hfi1_dbg_exit(void) +{ + debugfs_remove_recursive(hfi1_dbg_root); + hfi1_dbg_root = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h new file mode 100644 index 0000000000..29a5a8de2c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016, 2018 Intel Corporation. + */ + +#ifndef _HFI1_DEBUGFS_H +#define _HFI1_DEBUGFS_H + +struct hfi1_ibdev; + +#define DEBUGFS_SEQ_FILE_OPS(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +} + +#define DEBUGFS_SEQ_FILE_OPEN(name) \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} + +#define DEBUGFS_FILE_OPS(name) \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = hfi1_seq_read, \ + .llseek = hfi1_seq_lseek, \ + .release = seq_release \ +} + + +ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos); +loff_t hfi1_seq_lseek(struct file *file, loff_t offset, int whence); + +#ifdef CONFIG_DEBUG_FS +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); +void hfi1_dbg_init(void); +void hfi1_dbg_exit(void); + +#else +static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ +} + +static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ +} + +static inline void hfi1_dbg_init(void) +{ +} + +static inline void hfi1_dbg_exit(void) +{ +} +#endif + +#endif /* _HFI1_DEBUGFS_H */ diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c new file mode 100644 index 0000000000..b0a00b7aae --- /dev/null +++ b/drivers/infiniband/hw/hfi1/device.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/fs.h> + +#include "hfi.h" +#include "device.h" + +static char *hfi1_devnode(const struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +static const struct class class = { + .name = "hfi1", + .devnode = hfi1_devnode, +}; + +static char *hfi1_user_devnode(const struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +static const struct class user_class = { + .name = "hfi1_user", + .devnode = hfi1_user_devnode, +}; +static dev_t hfi1_dev; + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible, + struct kobject *parent) +{ + const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor); + struct device *device = NULL; + int ret; + + cdev_init(cdev, fops); + cdev->owner = THIS_MODULE; + cdev_set_parent(cdev, parent); + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto done; + } + + if (user_accessible) + device = device_create(&user_class, NULL, dev, NULL, "%s", name); + else + device = device_create(&class, NULL, dev, NULL, "%s", name); + + if (IS_ERR(device)) { + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); + cdev_del(cdev); + } +done: + *devp = device; + return ret; +} + +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + + cdev_del(cdev); + } +} + +static const char *hfi1_class_name = "hfi1"; + +const char *class_name(void) +{ + return hfi1_class_name; +} + +int __init dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + ret = class_register(&class); + if (ret) { + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + + ret = class_register(&user_class); + if (ret) { + pr_err("Could not create device class for user accessible files (err %d)\n", + -ret); + class_unregister(&class); + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + +done: + return ret; +} + +void dev_cleanup(void) +{ + class_unregister(&class); + class_unregister(&user_class); + + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); +} diff --git a/drivers/infiniband/hw/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h new file mode 100644 index 0000000000..c371b5612b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/device.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#ifndef _HFI1_DEVICE_H +#define _HFI1_DEVICE_H + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible, + struct kobject *parent); +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp); +const char *class_name(void); +int __init dev_init(void); +void dev_cleanup(void); + +#endif /* _HFI1_DEVICE_H */ diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c new file mode 100644 index 0000000000..f4492fa407 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -0,0 +1,1905 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015-2020 Intel Corporation. + * Copyright(c) 2021 Cornelis Networks. + */ + +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/io.h> +#include <linux/delay.h> +#include <linux/netdevice.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/prefetch.h> +#include <rdma/ib_verbs.h> +#include <linux/etherdevice.h> + +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "sdma.h" +#include "debugfs.h" +#include "vnic.h" +#include "fault.h" + +#include "ipoib.h" +#include "netdev.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +DEFINE_MUTEX(hfi1_mutex); /* general driver use */ + +unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; +module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO); +MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify( + HFI1_DEFAULT_MAX_MTU)); + +unsigned int hfi1_cu = 1; +module_param_named(cu, hfi1_cu, uint, S_IRUGO); +MODULE_PARM_DESC(cu, "Credit return units"); + +unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT; +static int hfi1_caps_set(const char *val, const struct kernel_param *kp); +static int hfi1_caps_get(char *buffer, const struct kernel_param *kp); +static const struct kernel_param_ops cap_ops = { + .set = hfi1_caps_set, + .get = hfi1_caps_get +}; +module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Cornelis Omni-Path Express driver"); + +/* + * MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define MAX_PKT_RECV 64 +/* + * MAX_PKT_THREAD_RCV is the max # of packets processed before + * the qp_wait_list queue is flushed. + */ +#define MAX_PKT_RECV_THREAD (MAX_PKT_RECV * 4) +#define EGR_HEAD_UPDATE_THRESHOLD 16 + +struct hfi1_ib_stats hfi1_stats; + +static int hfi1_caps_set(const char *val, const struct kernel_param *kp) +{ + int ret = 0; + unsigned long *cap_mask_ptr = (unsigned long *)kp->arg, + cap_mask = *cap_mask_ptr, value, diff, + write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_WRITABLE_MASK); + + ret = kstrtoul(val, 0, &value); + if (ret) { + pr_warn("Invalid module parameter value for 'cap_mask'\n"); + goto done; + } + /* Get the changed bits (except the locked bit) */ + diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK); + + /* Remove any bits that are not allowed to change after driver load */ + if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) { + pr_warn("Ignoring non-writable capability bits %#lx\n", + diff & ~write_mask); + diff &= write_mask; + } + + /* Mask off any reserved bits */ + diff &= ~HFI1_CAP_RESERVED_MASK; + /* Clear any previously set and changing bits */ + cap_mask &= ~diff; + /* Update the bits with the new capability */ + cap_mask |= (value & diff); + /* Check for any kernel/user restrictions */ + diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^ + ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT); + cap_mask &= ~diff; + /* Set the bitmask to the final set */ + *cap_mask_ptr = cap_mask; +done: + return ret; +} + +static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) +{ + unsigned long cap_mask = *(unsigned long *)kp->arg; + + cap_mask &= ~HFI1_CAP_LOCKED_SMASK; + cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT); + + return sysfs_emit(buffer, "0x%lx\n", cap_mask); +} + +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); + struct hfi1_devdata *dd = container_of(ibdev, + struct hfi1_devdata, verbs_dev); + return dd->pcidev; +} + +/* + * Return count of units with at least one port ACTIVE. + */ +int hfi1_count_active_units(void) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + unsigned long index, flags; + int pidx, nunits_active = 0; + + xa_lock_irqsave(&hfi1_dev_table, flags); + xa_for_each(&hfi1_dev_table, index, dd) { + if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && ppd->linkup) { + nunits_active++; + break; + } + } + } + xa_unlock_irqrestore(&hfi1_dev_table, flags); + return nunits_active; +} + +/* + * Get address of eager buffer from it's index (allocated in chunks, not + * contiguous). + */ +static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf, + u8 *update) +{ + u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf); + + *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset; + return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) + + (offset * RCV_BUF_BLOCK_SIZE)); +} + +static inline void *hfi1_get_header(struct hfi1_ctxtdata *rcd, + __le32 *rhf_addr) +{ + u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); + + return (void *)(rhf_addr - rcd->rhf_offset + offset); +} + +static inline struct ib_header *hfi1_get_msgheader(struct hfi1_ctxtdata *rcd, + __le32 *rhf_addr) +{ + return (struct ib_header *)hfi1_get_header(rcd, rhf_addr); +} + +static inline struct hfi1_16b_header + *hfi1_get_16B_header(struct hfi1_ctxtdata *rcd, + __le32 *rhf_addr) +{ + return (struct hfi1_16b_header *)hfi1_get_header(rcd, rhf_addr); +} + +/* + * Validate and encode the a given RcvArray Buffer size. + * The function will check whether the given size falls within + * allowed size ranges for the respective type and, optionally, + * return the proper encoding. + */ +int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded) +{ + if (unlikely(!PAGE_ALIGNED(size))) + return 0; + if (unlikely(size < MIN_EAGER_BUFFER)) + return 0; + if (size > + (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER)) + return 0; + if (encoded) + *encoded = ilog2(size / PAGE_SIZE) + 1; + return 1; +} + +static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, + struct hfi1_packet *packet) +{ + struct ib_header *rhdr = packet->hdr; + u32 rte = rhf_rcv_type_err(packet->rhf); + u32 mlid_base; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ibdev *verbs_dev = &dd->verbs_dev; + struct rvt_dev_info *rdi = &verbs_dev->rdi; + + if ((packet->rhf & RHF_DC_ERR) && + hfi1_dbg_fault_suppress_err(verbs_dev)) + return; + + if (packet->rhf & RHF_ICRC_ERR) + return; + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + goto drop; + } else { + u8 lnh = ib_get_lnh(rhdr); + + mlid_base = be16_to_cpu(IB_MULTICAST_LID_BASE); + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &rhdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + packet->ohdr = &rhdr->u.l.oth; + packet->grh = &rhdr->u.l.grh; + } else { + goto drop; + } + } + + if (packet->rhf & RHF_TID_ERR) { + /* For TIDERR and RC QPs preemptively schedule a NAK */ + u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + u32 dlid = ib_get_dlid(rhdr); + u32 qp_num; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + /* Check for GRH */ + if (packet->grh) { + u32 vtf; + struct ib_grh *grh = packet->grh; + + if (grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } + + /* Get the destination QP number. */ + qp_num = ib_bth_get_qpn(packet->ohdr); + if (dlid < mlid_base) { + struct rvt_qp *qp; + unsigned long flags; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock_irqsave(&qp->r_lock, flags); + + /* Check for valid receive state. */ + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + hfi1_rc_hdrerr(rcd, packet, qp); + break; + default: + /* For now don't handle any other QP types */ + break; + } + + spin_unlock_irqrestore(&qp->r_lock, flags); + rcu_read_unlock(); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + + /* handle "RcvTypeErr" flags */ + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + { + void *ebuf = NULL; + u8 opcode; + + if (rhf_use_egr_bfr(packet->rhf)) + ebuf = packet->ebuf; + + if (!ebuf) + goto drop; /* this should never happen */ + + opcode = ib_bth_get_opcode(packet->ohdr); + if (opcode == IB_OPCODE_CNP) { + /* + * Only in pre-B0 h/w is the CNP_OPCODE handled + * via this code path. + */ + struct rvt_qp *qp = NULL; + u32 lqpn, rqpn; + u16 rlid; + u8 svc_type, sl, sc5; + + sc5 = hfi1_9B_get_sc5(rhdr, packet->rhf); + sl = ibp->sc_to_sl[sc5]; + + lqpn = ib_bth_get_qpn(packet->ohdr); + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + rlid = 0; + rqpn = 0; + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = ib_get_slid(rhdr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + default: + rcu_read_unlock(); + goto drop; + } + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + rcu_read_unlock(); + } + + packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK; + break; + } + default: + break; + } + +drop: + return; +} + +static inline void init_packet(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet) +{ + packet->rsize = get_hdrqentsize(rcd); /* words */ + packet->maxcnt = get_hdrq_cnt(rcd) * packet->rsize; /* words */ + packet->rcd = rcd; + packet->updegr = 0; + packet->etail = -1; + packet->rhf_addr = get_rhf_addr(rcd); + packet->rhf = rhf_to_cpu(packet->rhf_addr); + packet->rhqoff = hfi1_rcd_head(rcd); + packet->numpkt = 0; +} + +/* We support only two types - 9B and 16B for now */ +static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &return_cnp, + [HFI1_PKT_TYPE_16B] = &return_cnp_16B +}; + +/** + * hfi1_process_ecn_slowpath - Process FECN or BECN bits + * @qp: The packet's destination QP + * @pkt: The packet itself. + * @prescan: Is the caller the RXQ prescan + * + * Process the packet's FECN or BECN bits. By now, the packet + * has already been evaluated whether processing of those bit should + * be done. + * The significance of the @prescan argument is that if the caller + * is the RXQ prescan, a CNP will be send out instead of waiting for the + * normal packet processing to send an ACK with BECN set (or a CNP). + */ +bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool prescan) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_other_headers *ohdr = pkt->ohdr; + struct ib_grh *grh = pkt->grh; + u32 rqpn = 0; + u16 pkey; + u32 rlid, slid, dlid = 0; + u8 hdr_type, sc, svc_type, opcode; + bool is_mcast = false, ignore_fecn = false, do_cnp = false, + fecn, becn; + + /* can be called from prescan */ + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + pkey = hfi1_16B_get_pkey(pkt->hdr); + sc = hfi1_16B_get_sc(pkt->hdr); + dlid = hfi1_16B_get_dlid(pkt->hdr); + slid = hfi1_16B_get_slid(pkt->hdr); + is_mcast = hfi1_is_16B_mcast(dlid); + opcode = ib_bth_get_opcode(ohdr); + hdr_type = HFI1_PKT_TYPE_16B; + fecn = hfi1_16B_get_fecn(pkt->hdr); + becn = hfi1_16B_get_becn(pkt->hdr); + } else { + pkey = ib_bth_get_pkey(ohdr); + sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf); + dlid = qp->ibqp.qp_type != IB_QPT_UD ? ib_get_dlid(pkt->hdr) : + ppd->lid; + slid = ib_get_slid(pkt->hdr); + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); + opcode = ib_bth_get_opcode(ohdr); + hdr_type = HFI1_PKT_TYPE_9B; + fecn = ib_bth_get_fecn(ohdr); + becn = ib_bth_get_becn(ohdr); + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + rlid = slid; + rqpn = ib_get_sqpn(pkt->ohdr); + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + rlid = slid; + rqpn = ib_get_sqpn(pkt->ohdr); + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + case IB_QPT_RC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_RC; + break; + default: + return false; + } + + ignore_fecn = is_mcast || (opcode == IB_OPCODE_CNP) || + (opcode == IB_OPCODE_RC_ACKNOWLEDGE); + /* + * ACKNOWLEDGE packets do not get a CNP but this will be + * guarded by ignore_fecn above. + */ + do_cnp = prescan || + (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST && + opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) || + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(ACK); + + /* Call appropriate CNP handler */ + if (!ignore_fecn && do_cnp && fecn) + hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey, + dlid, rlid, sc, grh); + + if (becn) { + u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + u8 sl = ibp->sc_to_sl[sc]; + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + } + return !ignore_fecn && fecn; +} + +struct ps_mdata { + struct hfi1_ctxtdata *rcd; + u32 rsize; + u32 maxcnt; + u32 ps_head; + u32 ps_tail; + u32 ps_seq; +}; + +static inline void init_ps_mdata(struct ps_mdata *mdata, + struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + mdata->rcd = rcd; + mdata->rsize = packet->rsize; + mdata->maxcnt = packet->maxcnt; + mdata->ps_head = packet->rhqoff; + + if (get_dma_rtail_setting(rcd)) { + mdata->ps_tail = get_rcvhdrtail(rcd); + if (rcd->ctxt == HFI1_CTRL_CTXT) + mdata->ps_seq = hfi1_seq_cnt(rcd); + else + mdata->ps_seq = 0; /* not used with DMA_RTAIL */ + } else { + mdata->ps_tail = 0; /* used only with DMA_RTAIL*/ + mdata->ps_seq = hfi1_seq_cnt(rcd); + } +} + +static inline int ps_done(struct ps_mdata *mdata, u64 rhf, + struct hfi1_ctxtdata *rcd) +{ + if (get_dma_rtail_setting(rcd)) + return mdata->ps_head == mdata->ps_tail; + return mdata->ps_seq != rhf_rcv_seq(rhf); +} + +static inline int ps_skip(struct ps_mdata *mdata, u64 rhf, + struct hfi1_ctxtdata *rcd) +{ + /* + * Control context can potentially receive an invalid rhf. + * Drop such packets. + */ + if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail)) + return mdata->ps_seq != rhf_rcv_seq(rhf); + + return 0; +} + +static inline void update_ps_mdata(struct ps_mdata *mdata, + struct hfi1_ctxtdata *rcd) +{ + mdata->ps_head += mdata->rsize; + if (mdata->ps_head >= mdata->maxcnt) + mdata->ps_head = 0; + + /* Control context must do seq counting */ + if (!get_dma_rtail_setting(rcd) || + rcd->ctxt == HFI1_CTRL_CTXT) + mdata->ps_seq = hfi1_seq_incr_wrap(mdata->ps_seq); +} + +/* + * prescan_rxq - search through the receive queue looking for packets + * containing Excplicit Congestion Notifications (FECNs, or BECNs). + * When an ECN is found, process the Congestion Notification, and toggle + * it off. + * This is declared as a macro to allow quick checking of the port to avoid + * the overhead of a function call if not enabled. + */ +#define prescan_rxq(rcd, packet) \ + do { \ + if (rcd->ppd->cc_prescan) \ + __prescan_rxq(packet); \ + } while (0) +static void __prescan_rxq(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ps_mdata mdata; + + init_ps_mdata(&mdata, packet); + + while (1) { + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + + packet->rcd->rhf_offset; + struct rvt_qp *qp; + struct ib_header *hdr; + struct rvt_dev_info *rdi = &rcd->dd->verbs_dev.rdi; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn, bth1; + u8 lnh; + + if (ps_done(&mdata, rhf, rcd)) + break; + + if (ps_skip(&mdata, rhf, rcd)) + goto next; + + if (etype != RHF_RCV_TYPE_IB) + goto next; + + packet->hdr = hfi1_get_msgheader(packet->rcd, rhf_addr); + hdr = packet->hdr; + lnh = ib_get_lnh(hdr); + + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + packet->grh = NULL; + } else if (lnh == HFI1_LRH_GRH) { + packet->ohdr = &hdr->u.l.oth; + packet->grh = &hdr->u.l.grh; + } else { + goto next; /* just in case */ + } + + if (!hfi1_may_ecn(packet)) + goto next; + + bth1 = be32_to_cpu(packet->ohdr->bth[1]); + qpn = bth1 & RVT_QPN_MASK; + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn); + + if (!qp) { + rcu_read_unlock(); + goto next; + } + + hfi1_process_ecn_slowpath(qp, packet, true); + rcu_read_unlock(); + + /* turn off BECN, FECN */ + bth1 &= ~(IB_FECN_SMASK | IB_BECN_SMASK); + packet->ohdr->bth[1] = cpu_to_be32(bth1); +next: + update_ps_mdata(&mdata, rcd); + } +} + +static void process_rcv_qp_work(struct hfi1_packet *packet) +{ + struct rvt_qp *qp, *nqp; + struct hfi1_ctxtdata *rcd = packet->rcd; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. + */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & RVT_R_RSP_NAK) { + qp->r_flags &= ~RVT_R_RSP_NAK; + packet->qp = qp; + hfi1_send_rc_ack(packet, 0); + } + if (qp->r_flags & RVT_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~RVT_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & + RVT_PROCESS_OR_FLUSH_SEND) + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + rvt_put_qp(qp); + } +} + +static noinline int max_packet_exceeded(struct hfi1_packet *packet, int thread) +{ + if (thread) { + if ((packet->numpkt & (MAX_PKT_RECV_THREAD - 1)) == 0) + /* allow defered processing */ + process_rcv_qp_work(packet); + cond_resched(); + return RCV_PKT_OK; + } else { + this_cpu_inc(*packet->rcd->dd->rcv_limit); + return RCV_PKT_LIMIT; + } +} + +static inline int check_max_packet(struct hfi1_packet *packet, int thread) +{ + int ret = RCV_PKT_OK; + + if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) + ret = max_packet_exceeded(packet, thread); + return ret; +} + +static noinline int skip_rcv_packet(struct hfi1_packet *packet, int thread) +{ + int ret; + + packet->rcd->dd->ctx0_seq_drop++; + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + packet->numpkt++; + ret = check_max_packet(packet, thread); + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static void process_rcv_packet_napi(struct hfi1_packet *packet) +{ + packet->etype = rhf_rcv_type(packet->rhf); + + /* total length */ + packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + /* retrieve eager buffer details */ + packet->etail = rhf_egr_index(packet->rhf); + packet->ebuf = get_egrbuf(packet->rcd, packet->rhf, + &packet->updegr); + /* + * Prefetch the contents of the eager buffer. It is + * OK to send a negative length to prefetch_range(). + * The +2 is the size of the RHF. + */ + prefetch_range(packet->ebuf, + packet->tlen - ((packet->rcd->rcvhdrqentsize - + (rhf_hdrq_offset(packet->rhf) + + 2)) * 4)); + + packet->rcd->rhf_rcv_function_map[packet->etype](packet); + packet->numpkt++; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); +} + +static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) +{ + int ret; + + packet->etype = rhf_rcv_type(packet->rhf); + + /* total length */ + packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + /* retrieve eager buffer details */ + packet->ebuf = NULL; + if (rhf_use_egr_bfr(packet->rhf)) { + packet->etail = rhf_egr_index(packet->rhf); + packet->ebuf = get_egrbuf(packet->rcd, packet->rhf, + &packet->updegr); + /* + * Prefetch the contents of the eager buffer. It is + * OK to send a negative length to prefetch_range(). + * The +2 is the size of the RHF. + */ + prefetch_range(packet->ebuf, + packet->tlen - ((get_hdrqentsize(packet->rcd) - + (rhf_hdrq_offset(packet->rhf) + + 2)) * 4)); + } + + /* + * Call a type specific handler for the packet. We + * should be able to trust that etype won't be beyond + * the range of valid indexes. If so something is really + * wrong and we can probably just let things come + * crashing down. There is no need to eat another + * comparison in this performance critical code. + */ + packet->rcd->rhf_rcv_function_map[packet->etype](packet); + packet->numpkt++; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + ret = check_max_packet(packet, thread); + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static inline void process_rcv_update(int last, struct hfi1_packet *packet) +{ + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. + */ + if (!last && !(packet->numpkt & 0xf)) { + update_usrhead(packet->rcd, packet->rhqoff, packet->updegr, + packet->etail, 0, 0); + packet->updegr = 0; + } + packet->grh = NULL; +} + +static inline void finish_packet(struct hfi1_packet *packet) +{ + /* + * Nothing we need to free for the packet. + * + * The only thing we need to do is a final update and call for an + * interrupt + */ + update_usrhead(packet->rcd, hfi1_rcd_head(packet->rcd), packet->updegr, + packet->etail, rcv_intr_dynamic, packet->numpkt); +} + +/* + * handle_receive_interrupt_napi_fp - receive a packet + * @rcd: the context + * @budget: polling budget + * + * Called from interrupt handler for receive interrupt. + * This is the fast path interrupt handler + * when executing napi soft irq environment. + */ +int handle_receive_interrupt_napi_fp(struct hfi1_ctxtdata *rcd, int budget) +{ + struct hfi1_packet packet; + + init_packet(rcd, &packet); + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) + goto bail; + + while (packet.numpkt < budget) { + process_rcv_packet_napi(&packet); + if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf))) + break; + + process_rcv_update(0, &packet); + } + hfi1_set_rcd_head(rcd, packet.rhqoff); +bail: + finish_packet(&packet); + return packet.numpkt; +} + +/* + * Handle receive interrupts when using the no dma rtail option. + */ +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread) +{ + int last = RCV_PKT_OK; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) { + last = RCV_PKT_DONE; + goto bail; + } + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + last = process_rcv_packet(&packet, thread); + if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf))) + last = RCV_PKT_DONE; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); + hfi1_set_rcd_head(rcd, packet.rhqoff); +bail: + finish_packet(&packet); + return last; +} + +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread) +{ + u32 hdrqtail; + int last = RCV_PKT_OK; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) { + last = RCV_PKT_DONE; + goto bail; + } + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + last = process_rcv_packet(&packet, thread); + if (packet.rhqoff == hdrqtail) + last = RCV_PKT_DONE; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); + hfi1_set_rcd_head(rcd, packet.rhqoff); +bail: + finish_packet(&packet); + return last; +} + +static void set_all_fastpath(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + u16 i; + + /* + * For dynamically allocated kernel contexts (like vnic) switch + * interrupt handler only for that context. Otherwise, switch + * interrupt handler for all statically allocated kernel contexts. + */ + if (rcd->ctxt >= dd->first_dyn_alloc_ctxt && !rcd->is_vnic) { + hfi1_rcd_get(rcd); + hfi1_set_fast(rcd); + hfi1_rcd_put(rcd); + return; + } + + for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd && (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic)) + hfi1_set_fast(rcd); + hfi1_rcd_put(rcd); + } +} + +void set_all_slowpath(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */ + for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; + if (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic) + rcd->do_interrupt = rcd->slow_handler; + + hfi1_rcd_put(rcd); + } +} + +static bool __set_armed_to_active(struct hfi1_packet *packet) +{ + u8 etype = rhf_rcv_type(packet->rhf); + u8 sc = SC15_PACKET; + + if (etype == RHF_RCV_TYPE_IB) { + struct ib_header *hdr = hfi1_get_msgheader(packet->rcd, + packet->rhf_addr); + sc = hfi1_9B_get_sc5(hdr, packet->rhf); + } else if (etype == RHF_RCV_TYPE_BYPASS) { + struct hfi1_16b_header *hdr = hfi1_get_16B_header( + packet->rcd, + packet->rhf_addr); + sc = hfi1_16B_get_sc(hdr); + } + if (sc != SC15_PACKET) { + int hwstate = driver_lstate(packet->rcd->ppd); + struct work_struct *lsaw = + &packet->rcd->ppd->linkstate_active_work; + + if (hwstate != IB_PORT_ACTIVE) { + dd_dev_info(packet->rcd->dd, + "Unexpected link state %s\n", + opa_lstate_name(hwstate)); + return false; + } + + queue_work(packet->rcd->ppd->link_wq, lsaw); + return true; + } + return false; +} + +/** + * set_armed_to_active - the fast path for armed to active + * @packet: the packet structure + * + * Return true if packet processing needs to bail. + */ +static bool set_armed_to_active(struct hfi1_packet *packet) +{ + if (likely(packet->rcd->ppd->host_link_state != HLS_UP_ARMED)) + return false; + return __set_armed_to_active(packet); +} + +/* + * handle_receive_interrupt - receive a packet + * @rcd: the context + * + * Called from interrupt handler for errors or receive interrupt. + * This is the slow path interrupt handler. + */ +int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 hdrqtail; + int needset, last = RCV_PKT_OK; + struct hfi1_packet packet; + int skip_pkt = 0; + + if (!rcd->rcvhdrq) + return RCV_PKT_OK; + /* Control context will always use the slow path interrupt handler */ + needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1; + + init_packet(rcd, &packet); + + if (!get_dma_rtail_setting(rcd)) { + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) { + last = RCV_PKT_DONE; + goto bail; + } + hdrqtail = 0; + } else { + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) { + last = RCV_PKT_DONE; + goto bail; + } + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + /* + * Control context can potentially receive an invalid + * rhf. Drop such packets. + */ + if (rcd->ctxt == HFI1_CTRL_CTXT) + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) + skip_pkt = 1; + } + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + if (hfi1_need_drop(dd)) { + /* On to the next packet */ + packet.rhqoff += packet.rsize; + packet.rhf_addr = (__le32 *)rcd->rcvhdrq + + packet.rhqoff + + rcd->rhf_offset; + packet.rhf = rhf_to_cpu(packet.rhf_addr); + + } else if (skip_pkt) { + last = skip_rcv_packet(&packet, thread); + skip_pkt = 0; + } else { + if (set_armed_to_active(&packet)) + goto bail; + last = process_rcv_packet(&packet, thread); + } + + if (!get_dma_rtail_setting(rcd)) { + if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf))) + last = RCV_PKT_DONE; + } else { + if (packet.rhqoff == hdrqtail) + last = RCV_PKT_DONE; + /* + * Control context can potentially receive an invalid + * rhf. Drop such packets. + */ + if (rcd->ctxt == HFI1_CTRL_CTXT) { + bool lseq; + + lseq = hfi1_seq_incr(rcd, + rhf_rcv_seq(packet.rhf)); + if (!last && lseq) + skip_pkt = 1; + } + } + + if (needset) { + needset = false; + set_all_fastpath(dd, rcd); + } + process_rcv_update(last, &packet); + } + + process_rcv_qp_work(&packet); + hfi1_set_rcd_head(rcd, packet.rhqoff); + +bail: + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + finish_packet(&packet); + return last; +} + +/* + * handle_receive_interrupt_napi_sp - receive a packet + * @rcd: the context + * @budget: polling budget + * + * Called from interrupt handler for errors or receive interrupt. + * This is the slow path interrupt handler + * when executing napi soft irq environment. + */ +int handle_receive_interrupt_napi_sp(struct hfi1_ctxtdata *rcd, int budget) +{ + struct hfi1_devdata *dd = rcd->dd; + int last = RCV_PKT_OK; + bool needset = true; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) + goto bail; + + while (last != RCV_PKT_DONE && packet.numpkt < budget) { + if (hfi1_need_drop(dd)) { + /* On to the next packet */ + packet.rhqoff += packet.rsize; + packet.rhf_addr = (__le32 *)rcd->rcvhdrq + + packet.rhqoff + + rcd->rhf_offset; + packet.rhf = rhf_to_cpu(packet.rhf_addr); + + } else { + if (set_armed_to_active(&packet)) + goto bail; + process_rcv_packet_napi(&packet); + } + + if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf))) + last = RCV_PKT_DONE; + + if (needset) { + needset = false; + set_all_fastpath(dd, rcd); + } + + process_rcv_update(last, &packet); + } + + hfi1_set_rcd_head(rcd, packet.rhqoff); + +bail: + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + finish_packet(&packet); + return packet.numpkt; +} + +/* + * We may discover in the interrupt that the hardware link state has + * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet), + * and we need to update the driver's notion of the link state. We cannot + * run set_link_state from interrupt context, so we queue this function on + * a workqueue. + * + * We delay the regular interrupt processing until after the state changes + * so that the link will be in the correct state by the time any application + * we wake up attempts to send a reply to any message it received. + * (Subsequent receive interrupts may possibly force the wakeup before we + * update the link state.) + * + * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes + * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues, + * so we're safe from use-after-free of the rcd. + */ +void receive_interrupt_work(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + linkstate_active_work); + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ctxtdata *rcd; + u16 i; + + /* Received non-SC15 packet implies neighbor_normal */ + ppd->neighbor_normal = 1; + set_link_state(ppd, HLS_UP_ACTIVE); + + /* + * Interrupt all statically allocated kernel contexts that could + * have had an interrupt during auto activation. + */ + for (i = HFI1_CTRL_CTXT; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + force_recv_intr(rcd); + hfi1_rcd_put(rcd); + } +} + +/* + * Convert a given MTU size to the on-wire MAD packet enumeration. + * Return -1 if the size is invalid. + */ +int mtu_to_enum(u32 mtu, int default_if_bad) +{ + switch (mtu) { + case 0: return OPA_MTU_0; + case 256: return OPA_MTU_256; + case 512: return OPA_MTU_512; + case 1024: return OPA_MTU_1024; + case 2048: return OPA_MTU_2048; + case 4096: return OPA_MTU_4096; + case 8192: return OPA_MTU_8192; + case 10240: return OPA_MTU_10240; + } + return default_if_bad; +} + +u16 enum_to_mtu(int mtu) +{ + switch (mtu) { + case OPA_MTU_0: return 0; + case OPA_MTU_256: return 256; + case OPA_MTU_512: return 512; + case OPA_MTU_1024: return 1024; + case OPA_MTU_2048: return 2048; + case OPA_MTU_4096: return 4096; + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return 0xffff; + } +} + +/* + * set_mtu - set the MTU + * @ppd: the per port data + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. We do not deal with what happens + * to programs that are already running when the size changes. + */ +int set_mtu(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + int i, drain, ret = 0, is_up = 0; + + ppd->ibmtu = 0; + for (i = 0; i < ppd->vls_supported; i++) + if (ppd->ibmtu < dd->vld[i].mtu) + ppd->ibmtu = dd->vld[i].mtu; + ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd); + + mutex_lock(&ppd->hls_lock); + if (ppd->host_link_state == HLS_UP_INIT || + ppd->host_link_state == HLS_UP_ARMED || + ppd->host_link_state == HLS_UP_ACTIVE) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * MTU is specified per-VL. To ensure that no packet gets + * stuck (due, e.g., to the MTU for the packet's VL being + * reduced), empty the per-VL FIFOs before adjusting MTU. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n", + __func__); + goto err; + } + + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc) +{ + struct hfi1_devdata *dd = ppd->dd; + + ppd->lid = lid; + ppd->lmc = lmc; + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0); + + dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid); + + return 0; +} + +void shutdown_led_override(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + if (atomic_read(&ppd->led_override_timer_active)) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } + + /* Hand control of the LED to the DC for normal operation */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); +} + +static void run_led_override(struct timer_list *t) +{ + struct hfi1_pportdata *ppd = from_timer(ppd, t, led_override_timer); + struct hfi1_devdata *dd = ppd->dd; + unsigned long timeout; + int phase_idx; + + if (!(dd->flags & HFI1_INITTED)) + return; + + phase_idx = ppd->led_override_phase & 1; + + setextled(dd, phase_idx); + + timeout = ppd->led_override_vals[phase_idx]; + + /* Set up for next phase */ + ppd->led_override_phase = !ppd->led_override_phase; + + mod_timer(&ppd->led_override_timer, jiffies + timeout); +} + +/* + * To have the LED blink in a particular pattern, provide timeon and timeoff + * in milliseconds. + * To turn off custom blinking and return to normal operation, use + * shutdown_led_override() + */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff) +{ + if (!(ppd->dd->flags & HFI1_INITTED)) + return; + + /* Convert to jiffies for direct use in timer */ + ppd->led_override_vals[0] = msecs_to_jiffies(timeoff); + ppd->led_override_vals[1] = msecs_to_jiffies(timeon); + + /* Arbitrarily start from LED on phase */ + ppd->led_override_phase = 1; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the handler will be called soon to look at our request. + */ + if (!timer_pending(&ppd->led_override_timer)) { + timer_setup(&ppd->led_override_timer, run_led_override, 0); + ppd->led_override_timer.expires = jiffies + 1; + add_timer(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 1); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } +} + +/** + * hfi1_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user contexts are open that use chip resources + */ +int hfi1_reset_device(int unit) +{ + int ret; + struct hfi1_devdata *dd = hfi1_lookup(unit); + struct hfi1_pportdata *ppd; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + dd_dev_info(dd, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase1 || !(dd->flags & HFI1_PRESENT)) { + dd_dev_info(dd, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + /* If there are any user/vnic contexts, we cannot reset */ + mutex_lock(&hfi1_mutex); + if (dd->rcd) + if (hfi1_stats.sps_ctxts) { + mutex_unlock(&hfi1_mutex); + ret = -EBUSY; + goto bail; + } + mutex_unlock(&hfi1_mutex); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + shutdown_led_override(ppd); + } + if (dd->flags & HFI1_HAS_SEND_DMA) + sdma_exit(dd); + + hfi1_reset_cpu_counters(dd); + + ret = hfi1_init(dd, 1); + + if (ret) + dd_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + dd_dev_info(dd, "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} + +static inline void hfi1_setup_ib_header(struct hfi1_packet *packet) +{ + packet->hdr = (struct hfi1_ib_message_header *) + hfi1_get_msgheader(packet->rcd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; +} + +static int hfi1_bypass_ingress_pkt_check(struct hfi1_packet *packet) +{ + struct hfi1_pportdata *ppd = packet->rcd->ppd; + + /* slid and dlid cannot be 0 */ + if ((!packet->slid) || (!packet->dlid)) + return -EINVAL; + + /* Compare port lid with incoming packet dlid */ + if ((!(hfi1_is_16B_mcast(packet->dlid))) && + (packet->dlid != + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))) { + if ((packet->dlid & ~((1 << ppd->lmc) - 1)) != ppd->lid) + return -EINVAL; + } + + /* No multicast packets with SC15 */ + if ((hfi1_is_16B_mcast(packet->dlid)) && (packet->sc == 0xF)) + return -EINVAL; + + /* Packets with permissive DLID always on SC15 */ + if ((packet->dlid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), + 16B)) && + (packet->sc != 0xF)) + return -EINVAL; + + return 0; +} + +static int hfi1_setup_9B_packet(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct ib_header *hdr; + u8 lnh; + + hfi1_setup_ib_header(packet); + hdr = packet->hdr; + + lnh = ib_get_lnh(hdr); + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + packet->grh = NULL; + } else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + packet->ohdr = &hdr->u.l.oth; + packet->grh = &hdr->u.l.grh; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->payload = packet->ebuf; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->slid = ib_get_slid(hdr); + packet->dlid = ib_get_dlid(hdr); + if (unlikely((packet->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (packet->dlid != be16_to_cpu(IB_LID_PERMISSIVE)))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + be16_to_cpu(IB_MULTICAST_LID_BASE); + packet->sl = ib_get_sl(hdr); + packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf); + packet->pad = ib_bth_get_pad(packet->ohdr); + packet->extra_byte = 0; + packet->pkey = ib_bth_get_pkey(packet->ohdr); + packet->migrated = ib_bth_is_migration(packet->ohdr); + + return 0; +drop: + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + +static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) +{ + /* + * Bypass packets have a different header/payload split + * compared to an IB packet. + * Current split is set such that 16 bytes of the actual + * header is in the header buffer and the remining is in + * the eager buffer. We chose 16 since hfi1 driver only + * supports 16B bypass packets and we will be able to + * receive the entire LRH with such a split. + */ + + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u8 l4; + + packet->hdr = (struct hfi1_16b_header *) + hfi1_get_16B_header(packet->rcd, + packet->rhf_addr); + l4 = hfi1_16B_get_l4(packet->hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) { + packet->ohdr = packet->ebuf; + packet->grh = NULL; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES); + packet->migrated = opa_bth_is_migration(packet->ohdr); + } else if (l4 == OPA_16B_L4_IB_GLOBAL) { + u32 vtf; + u8 grh_len = sizeof(struct ib_grh); + + packet->ohdr = packet->ebuf + grh_len; + packet->grh = packet->ebuf; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES) + grh_len; + packet->migrated = opa_bth_is_migration(packet->ohdr); + + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else if (l4 == OPA_16B_L4_FM) { + packet->mgmt = packet->ebuf; + packet->ohdr = NULL; + packet->grh = NULL; + packet->opcode = IB_OPCODE_UD_SEND_ONLY; + packet->pad = OPA_16B_L4_FM_PAD; + packet->hlen = OPA_16B_L4_FM_HLEN; + packet->migrated = false; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->payload = packet->ebuf + packet->hlen - LRH_16B_BYTES; + packet->slid = hfi1_16B_get_slid(packet->hdr); + packet->dlid = hfi1_16B_get_dlid(packet->hdr); + if (unlikely(hfi1_is_16B_mcast(packet->dlid))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), + 16B); + packet->sc = hfi1_16B_get_sc(packet->hdr); + packet->sl = ibp->sc_to_sl[packet->sc]; + packet->extra_byte = SIZE_OF_LT; + packet->pkey = hfi1_16B_get_pkey(packet->hdr); + + if (hfi1_bypass_ingress_pkt_check(packet)) + goto drop; + + return 0; +drop: + hfi1_cdbg(PKT, "%s: packet dropped", __func__); + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + +static void show_eflags_errs(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + u32 rte = rhf_rcv_type_err(packet->rhf); + + dd_dev_err(rcd->dd, + "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s] rte 0x%x\n", + rcd->ctxt, packet->rhf, + packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", + packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", + packet->rhf & RHF_DC_ERR ? "dc " : "", + packet->rhf & RHF_TID_ERR ? "tid " : "", + packet->rhf & RHF_LEN_ERR ? "len " : "", + packet->rhf & RHF_ECC_ERR ? "ecc " : "", + packet->rhf & RHF_ICRC_ERR ? "icrc " : "", + rte); +} + +void handle_eflags(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + rcv_hdrerr(rcd, rcd->ppd, packet); + if (rhf_err_flags(packet->rhf)) + show_eflags_errs(packet); +} + +static void hfi1_ipoib_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp; + struct net_device *netdev; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct napi_struct *napi = rcd->napi; + struct sk_buff *skb; + struct hfi1_netdev_rxq *rxq = container_of(napi, + struct hfi1_netdev_rxq, napi); + u32 extra_bytes; + u32 tlen, qpnum; + bool do_work, do_cnp; + + trace_hfi1_rcvhdr(packet); + + hfi1_setup_ib_header(packet); + + packet->ohdr = &((struct ib_header *)packet->hdr)->u.oth; + packet->grh = NULL; + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return; + } + + qpnum = ib_bth_get_qpn(packet->ohdr); + netdev = hfi1_netdev_get_data(rcd->dd, qpnum); + if (!netdev) + goto drop_no_nd; + + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + trace_ctxt_rsm_hist(rcd->ctxt); + + /* handle congestion notifications */ + do_work = hfi1_may_ecn(packet); + if (unlikely(do_work)) { + do_cnp = (packet->opcode != IB_OPCODE_CNP); + (void)hfi1_process_ecn_slowpath(hfi1_ipoib_priv(netdev)->qp, + packet, do_cnp); + } + + /* + * We have split point after last byte of DETH + * lets strip padding and CRC and ICRC. + * tlen is whole packet len so we need to + * subtract header size as well. + */ + tlen = packet->tlen; + extra_bytes = ib_bth_get_pad(packet->ohdr) + (SIZE_OF_CRC << 2) + + packet->hlen; + if (unlikely(tlen < extra_bytes)) + goto drop; + + tlen -= extra_bytes; + + skb = hfi1_ipoib_prepare_skb(rxq, tlen, packet->ebuf); + if (unlikely(!skb)) + goto drop; + + dev_sw_netstats_rx_add(netdev, skb->len); + + skb->dev = netdev; + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + + return; + +drop: + ++netdev->stats.rx_dropped; +drop_no_nd: + ibp = rcd_to_iport(packet->rcd); + ++ibp->rvp.n_pkt_drops; +} + +/* + * The following functions are called by the interrupt handler. They are type + * specific handlers for each packet type. + */ +static void process_receive_ib(struct hfi1_packet *packet) +{ + if (hfi1_setup_9B_packet(packet)) + return; + + if (unlikely(hfi1_dbg_should_fault_rx(packet))) + return; + + trace_hfi1_rcvhdr(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return; + } + + hfi1_ib_rcv(packet); +} + +static void process_receive_bypass(struct hfi1_packet *packet) +{ + struct hfi1_devdata *dd = packet->rcd->dd; + + if (hfi1_setup_bypass_packet(packet)) + return; + + trace_hfi1_rcvhdr(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return; + } + + if (hfi1_16B_get_l2(packet->hdr) == 0x2) { + hfi1_16B_rcv(packet); + } else { + dd_dev_err(dd, + "Bypass packets other than 16B are not supported in normal operation. Dropping\n"); + incr_cntr64(&dd->sw_rcv_bypass_packet_errors); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + u64 *flits = packet->ebuf; + + if (flits && !(packet->rhf & RHF_LEN_ERR)) { + dd->err_info_rcvport.packet_flit1 = flits[0]; + dd->err_info_rcvport.packet_flit2 = + packet->tlen > sizeof(flits[0]) ? + flits[1] : 0; + } + dd->err_info_rcvport.status_and_code |= + (OPA_EI_STATUS_SMASK | BAD_L2_ERR); + } + } +} + +static void process_receive_error(struct hfi1_packet *packet) +{ + /* KHdrHCRCErr -- KDETH packet with a bad HCRC */ + if (unlikely( + hfi1_dbg_fault_suppress_err(&packet->rcd->dd->verbs_dev) && + (rhf_rcv_type_err(packet->rhf) == RHF_RCV_TYPE_ERROR || + packet->rhf & RHF_DC_ERR))) + return; + + hfi1_setup_ib_header(packet); + handle_eflags(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) + dd_dev_err(packet->rcd->dd, + "Unhandled error packet received. Dropping.\n"); +} + +static void kdeth_process_expected(struct hfi1_packet *packet) +{ + hfi1_setup_9B_packet(packet); + if (unlikely(hfi1_dbg_should_fault_rx(packet))) + return; + + if (unlikely(rhf_err_flags(packet->rhf))) { + struct hfi1_ctxtdata *rcd = packet->rcd; + + if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet)) + return; + } + + hfi1_kdeth_expected_rcv(packet); +} + +static void kdeth_process_eager(struct hfi1_packet *packet) +{ + hfi1_setup_9B_packet(packet); + if (unlikely(hfi1_dbg_should_fault_rx(packet))) + return; + + trace_hfi1_rcvhdr(packet); + if (unlikely(rhf_err_flags(packet->rhf))) { + struct hfi1_ctxtdata *rcd = packet->rcd; + + show_eflags_errs(packet); + if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet)) + return; + } + + hfi1_kdeth_eager_rcv(packet); +} + +static void process_receive_invalid(struct hfi1_packet *packet) +{ + dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n", + rhf_rcv_type(packet->rhf)); +} + +#define HFI1_RCVHDR_DUMP_MAX 5 + +void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_packet packet; + struct ps_mdata mdata; + int i; + + seq_printf(s, "Rcd %u: RcvHdr cnt %u entsize %u %s ctrl 0x%08llx status 0x%08llx, head %llu tail %llu sw head %u\n", + rcd->ctxt, get_hdrq_cnt(rcd), get_hdrqentsize(rcd), + get_dma_rtail_setting(rcd) ? + "dma_rtail" : "nodma_rtail", + read_kctxt_csr(rcd->dd, rcd->ctxt, RCV_CTXT_CTRL), + read_kctxt_csr(rcd->dd, rcd->ctxt, RCV_CTXT_STATUS), + read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) & + RCV_HDR_HEAD_HEAD_MASK, + read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL), + rcd->head); + + init_packet(rcd, &packet); + init_ps_mdata(&mdata, &packet); + + for (i = 0; i < HFI1_RCVHDR_DUMP_MAX; i++) { + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + + rcd->rhf_offset; + struct ib_header *hdr; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn; + u8 opcode; + u32 psn; + u8 lnh; + + if (ps_done(&mdata, rhf, rcd)) + break; + + if (ps_skip(&mdata, rhf, rcd)) + goto next; + + if (etype > RHF_RCV_TYPE_IB) + goto next; + + packet.hdr = hfi1_get_msgheader(rcd, rhf_addr); + hdr = packet.hdr; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + + if (lnh == HFI1_LRH_BTH) + packet.ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) + packet.ohdr = &hdr->u.l.oth; + else + goto next; /* just in case */ + + opcode = (be32_to_cpu(packet.ohdr->bth[0]) >> 24); + qpn = be32_to_cpu(packet.ohdr->bth[1]) & RVT_QPN_MASK; + psn = mask_psn(be32_to_cpu(packet.ohdr->bth[2])); + + seq_printf(s, "\tEnt %u: opcode 0x%x, qpn 0x%x, psn 0x%x\n", + mdata.ps_head, opcode, qpn, psn); +next: + update_ps_mdata(&mdata, rcd); + } +} + +const rhf_rcv_function_ptr normal_rhf_rcv_functions[] = { + [RHF_RCV_TYPE_EXPECTED] = kdeth_process_expected, + [RHF_RCV_TYPE_EAGER] = kdeth_process_eager, + [RHF_RCV_TYPE_IB] = process_receive_ib, + [RHF_RCV_TYPE_ERROR] = process_receive_error, + [RHF_RCV_TYPE_BYPASS] = process_receive_bypass, + [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, +}; + +const rhf_rcv_function_ptr netdev_rhf_rcv_functions[] = { + [RHF_RCV_TYPE_EXPECTED] = process_receive_invalid, + [RHF_RCV_TYPE_EAGER] = process_receive_invalid, + [RHF_RCV_TYPE_IB] = hfi1_ipoib_ib_rcv, + [RHF_RCV_TYPE_ERROR] = process_receive_error, + [RHF_RCV_TYPE_BYPASS] = hfi1_vnic_bypass_rcv, + [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, +}; diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c new file mode 100644 index 0000000000..2b5d264f41 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/efivar.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include <linux/string.h> +#include <linux/string_helpers.h> + +#include "efivar.h" + +/* GUID for HFI1 variables in EFI */ +#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \ + 0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4) +/* largest EFI data size we expect */ +#define EFI_DATA_SIZE 4096 + +/* + * Read the named EFI variable. Return the size of the actual data in *size + * and a kmalloc'ed buffer in *return_data. The caller must free the + * data. It is guaranteed that *return_data will be NULL and *size = 0 + * if this routine fails. + * + * Return 0 on success, -errno on failure. + */ +static int read_efi_var(const char *name, unsigned long *size, + void **return_data) +{ + efi_status_t status; + efi_char16_t *uni_name; + efi_guid_t guid; + unsigned long temp_size; + void *temp_buffer; + void *data; + int i; + int ret; + + /* set failure return values */ + *size = 0; + *return_data = NULL; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) + return -EOPNOTSUPP; + + uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL); + temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL); + + if (!uni_name || !temp_buffer) { + ret = -ENOMEM; + goto fail; + } + + /* input: the size of the buffer */ + temp_size = EFI_DATA_SIZE; + + /* convert ASCII to unicode - it is a 1:1 mapping */ + for (i = 0; name[i]; i++) + uni_name[i] = name[i]; + + /* need a variable for our GUID */ + guid = HFI1_EFIVAR_GUID; + + /* call into EFI runtime services */ + status = efi.get_variable( + uni_name, + &guid, + NULL, + &temp_size, + temp_buffer); + + /* + * It would be nice to call efi_status_to_err() here, but that + * is in the EFIVAR_FS code and may not be compiled in. + * However, even that is insufficient since it does not cover + * EFI_BUFFER_TOO_SMALL which could be an important return. + * For now, just split out success or not found. + */ + ret = status == EFI_SUCCESS ? 0 : + status == EFI_NOT_FOUND ? -ENOENT : + -EINVAL; + if (ret) + goto fail; + + /* + * We have successfully read the EFI variable into our + * temporary buffer. Now allocate a correctly sized + * buffer. + */ + data = kmemdup(temp_buffer, temp_size, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto fail; + } + + *size = temp_size; + *return_data = data; + +fail: + kfree(uni_name); + kfree(temp_buffer); + + return ret; +} + +/* + * Read an HFI1 EFI variable of the form: + * <PCIe address>-<kind> + * Return an kalloc'ed array and size of the data. + * + * Returns 0 on success, -errno on failure. + */ +int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind, + unsigned long *size, void **return_data) +{ + char prefix_name[64]; + char name[128]; + int result; + + /* create a common prefix */ + snprintf(prefix_name, sizeof(prefix_name), "%04x:%02x:%02x.%x", + pci_domain_nr(dd->pcidev->bus), + dd->pcidev->bus->number, + PCI_SLOT(dd->pcidev->devfn), + PCI_FUNC(dd->pcidev->devfn)); + snprintf(name, sizeof(name), "%s-%s", prefix_name, kind); + result = read_efi_var(name, size, return_data); + + /* + * If reading the lowercase EFI variable fail, read the uppercase + * variable. + */ + if (result) { + string_upper(prefix_name, prefix_name); + snprintf(name, sizeof(name), "%s-%s", prefix_name, kind); + result = read_efi_var(name, size, return_data); + } + + return result; +} diff --git a/drivers/infiniband/hw/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h new file mode 100644 index 0000000000..5ebc2f07bb --- /dev/null +++ b/drivers/infiniband/hw/hfi1/efivar.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#ifndef _HFI1_EFIVAR_H +#define _HFI1_EFIVAR_H + +#include <linux/efi.h> + +#include "hfi.h" + +int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind, + unsigned long *size, void **return_data); + +#endif /* _HFI1_EFIVAR_H */ diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c new file mode 100644 index 0000000000..fbe9581074 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include <linux/delay.h> +#include "hfi.h" +#include "common.h" +#include "eprom.h" + +/* + * The EPROM is logically divided into three partitions: + * partition 0: the first 128K, visible from PCI ROM BAR + * partition 1: 4K config file (sector size) + * partition 2: the rest + */ +#define P0_SIZE (128 * 1024) +#define P1_SIZE (4 * 1024) +#define P1_START P0_SIZE +#define P2_START (P0_SIZE + P1_SIZE) + +/* controller page size, in bytes */ +#define EP_PAGE_SIZE 256 +#define EP_PAGE_MASK (EP_PAGE_SIZE - 1) +#define EP_PAGE_DWORDS (EP_PAGE_SIZE / sizeof(u32)) + +/* controller commands */ +#define CMD_SHIFT 24 +#define CMD_NOP (0) +#define CMD_READ_DATA(addr) ((0x03 << CMD_SHIFT) | addr) +#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT)) + +/* controller interface speeds */ +#define EP_SPEED_FULL 0x2 /* full speed */ + +/* + * How long to wait for the EPROM to become available, in ms. + * The spec 32 Mb EPROM takes around 40s to erase then write. + * Double it for safety. + */ +#define EPROM_TIMEOUT 80000 /* ms */ + +/* + * Read a 256 byte (64 dword) EPROM page. + * All callers have verified the offset is at a page boundary. + */ +static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result) +{ + int i; + + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset)); + for (i = 0; i < EP_PAGE_DWORDS; i++) + result[i] = (u32)read_csr(dd, ASIC_EEP_DATA); + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */ +} + +/* + * Read length bytes starting at offset from the start of the EPROM. + */ +static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, void *dest) +{ + u32 buffer[EP_PAGE_DWORDS]; + u32 end; + u32 start_offset; + u32 read_start; + u32 bytes; + + if (len == 0) + return 0; + + end = start + len; + + /* + * Make sure the read range is not outside of the controller read + * command address range. Note that '>' is correct below - the end + * of the range is OK if it stops at the limit, but no higher. + */ + if (end > (1 << CMD_SHIFT)) + return -EINVAL; + + /* read the first partial page */ + start_offset = start & EP_PAGE_MASK; + if (start_offset) { + /* partial starting page */ + + /* align and read the page that contains the start */ + read_start = start & ~EP_PAGE_MASK; + read_page(dd, read_start, buffer); + + /* the rest of the page is available data */ + bytes = EP_PAGE_SIZE - start_offset; + + if (len <= bytes) { + /* end is within this page */ + memcpy(dest, (u8 *)buffer + start_offset, len); + return 0; + } + + memcpy(dest, (u8 *)buffer + start_offset, bytes); + + start += bytes; + len -= bytes; + dest += bytes; + } + /* start is now page aligned */ + + /* read whole pages */ + while (len >= EP_PAGE_SIZE) { + read_page(dd, start, buffer); + memcpy(dest, buffer, EP_PAGE_SIZE); + + start += EP_PAGE_SIZE; + len -= EP_PAGE_SIZE; + dest += EP_PAGE_SIZE; + } + + /* read the last partial page */ + if (len) { + read_page(dd, start, buffer); + memcpy(dest, buffer, len); + } + + return 0; +} + +/* + * Initialize the EPROM handler. + */ +int eprom_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* only the discrete chip has an EPROM */ + if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) + return 0; + + /* + * It is OK if both HFIs reset the EPROM as long as they don't + * do it at the same time. + */ + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire EPROM resource, no EPROM support\n", + __func__); + goto done_asic; + } + + /* reset EPROM to be sure it is in a good state */ + + /* set reset */ + write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK); + /* clear reset, set speed */ + write_csr(dd, ASIC_EEP_CTL_STAT, + EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); + + /* wake the device with command "release powerdown NoID" */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); + + dd->eprom_available = true; + release_chip_resource(dd, CR_EPROM); +done_asic: + return ret; +} + +/* magic character sequence that begins an image */ +#define IMAGE_START_MAGIC "APO=" + +/* magic character sequence that might trail an image */ +#define IMAGE_TRAIL_MAGIC "egamiAPO" + +/* EPROM file types */ +#define HFI1_EFT_PLATFORM_CONFIG 2 + +/* segment size - 128 KiB */ +#define SEG_SIZE (128 * 1024) + +struct hfi1_eprom_footer { + u32 oprom_size; /* size of the oprom, in bytes */ + u16 num_table_entries; + u16 version; /* version of this footer */ + u32 magic; /* must be last */ +}; + +struct hfi1_eprom_table_entry { + u32 type; /* file type */ + u32 offset; /* file offset from start of EPROM */ + u32 size; /* file size, in bytes */ +}; + +/* + * Calculate the max number of table entries that will fit within a directory + * buffer of size 'dir_size'. + */ +#define MAX_TABLE_ENTRIES(dir_size) \ + (((dir_size) - sizeof(struct hfi1_eprom_footer)) / \ + sizeof(struct hfi1_eprom_table_entry)) + +#define DIRECTORY_SIZE(n) (sizeof(struct hfi1_eprom_footer) + \ + (sizeof(struct hfi1_eprom_table_entry) * (n))) + +#define MAGIC4(a, b, c, d) ((d) << 24 | (c) << 16 | (b) << 8 | (a)) +#define FOOTER_MAGIC MAGIC4('e', 'p', 'r', 'm') +#define FOOTER_VERSION 1 + +/* + * Read all of partition 1. The actual file is at the front. Adjust + * the returned size if a trailing image magic is found. + */ +static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, + u32 *size) +{ + void *buffer; + void *p; + u32 length; + int ret; + + buffer = kmalloc(P1_SIZE, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + ret = read_length(dd, P1_START, P1_SIZE, buffer); + if (ret) { + kfree(buffer); + return ret; + } + + /* config partition is valid only if it starts with IMAGE_START_MAGIC */ + if (memcmp(buffer, IMAGE_START_MAGIC, strlen(IMAGE_START_MAGIC))) { + kfree(buffer); + return -ENOENT; + } + + /* scan for image magic that may trail the actual data */ + p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); + if (p) + length = p - buffer; + else + length = P1_SIZE; + + *data = buffer; + *size = length; + return 0; +} + +/* + * The segment magic has been checked. There is a footer and table of + * contents present. + * + * directory is a u32 aligned buffer of size EP_PAGE_SIZE. + */ +static int read_segment_platform_config(struct hfi1_devdata *dd, + void *directory, void **data, u32 *size) +{ + struct hfi1_eprom_footer *footer; + struct hfi1_eprom_table_entry *table; + struct hfi1_eprom_table_entry *entry; + void *buffer = NULL; + void *table_buffer = NULL; + int ret, i; + u32 directory_size; + u32 seg_base, seg_offset; + u32 bytes_available, ncopied, to_copy; + + /* the footer is at the end of the directory */ + footer = (struct hfi1_eprom_footer *) + (directory + EP_PAGE_SIZE - sizeof(*footer)); + + /* make sure the structure version is supported */ + if (footer->version != FOOTER_VERSION) + return -EINVAL; + + /* oprom size cannot be larger than a segment */ + if (footer->oprom_size >= SEG_SIZE) + return -EINVAL; + + /* the file table must fit in a segment with the oprom */ + if (footer->num_table_entries > + MAX_TABLE_ENTRIES(SEG_SIZE - footer->oprom_size)) + return -EINVAL; + + /* find the file table start, which precedes the footer */ + directory_size = DIRECTORY_SIZE(footer->num_table_entries); + if (directory_size <= EP_PAGE_SIZE) { + /* the file table fits into the directory buffer handed in */ + table = (struct hfi1_eprom_table_entry *) + (directory + EP_PAGE_SIZE - directory_size); + } else { + /* need to allocate and read more */ + table_buffer = kmalloc(directory_size, GFP_KERNEL); + if (!table_buffer) + return -ENOMEM; + ret = read_length(dd, SEG_SIZE - directory_size, + directory_size, table_buffer); + if (ret) + goto done; + table = table_buffer; + } + + /* look for the platform configuration file in the table */ + for (entry = NULL, i = 0; i < footer->num_table_entries; i++) { + if (table[i].type == HFI1_EFT_PLATFORM_CONFIG) { + entry = &table[i]; + break; + } + } + if (!entry) { + ret = -ENOENT; + goto done; + } + + /* + * Sanity check on the configuration file size - it should never + * be larger than 4 KiB. + */ + if (entry->size > (4 * 1024)) { + dd_dev_err(dd, "Bad configuration file size 0x%x\n", + entry->size); + ret = -EINVAL; + goto done; + } + + /* check for bogus offset and size that wrap when added together */ + if (entry->offset + entry->size < entry->offset) { + dd_dev_err(dd, + "Bad configuration file start + size 0x%x+0x%x\n", + entry->offset, entry->size); + ret = -EINVAL; + goto done; + } + + /* allocate the buffer to return */ + buffer = kmalloc(entry->size, GFP_KERNEL); + if (!buffer) { + ret = -ENOMEM; + goto done; + } + + /* + * Extract the file by looping over segments until it is fully read. + */ + seg_offset = entry->offset % SEG_SIZE; + seg_base = entry->offset - seg_offset; + ncopied = 0; + while (ncopied < entry->size) { + /* calculate data bytes available in this segment */ + + /* start with the bytes from the current offset to the end */ + bytes_available = SEG_SIZE - seg_offset; + /* subtract off footer and table from segment 0 */ + if (seg_base == 0) { + /* + * Sanity check: should not have a starting point + * at or within the directory. + */ + if (bytes_available <= directory_size) { + dd_dev_err(dd, + "Bad configuration file - offset 0x%x within footer+table\n", + entry->offset); + ret = -EINVAL; + goto done; + } + bytes_available -= directory_size; + } + + /* calculate bytes wanted */ + to_copy = entry->size - ncopied; + + /* max out at the available bytes in this segment */ + if (to_copy > bytes_available) + to_copy = bytes_available; + + /* + * Read from the EPROM. + * + * The sanity check for entry->offset is done in read_length(). + * The EPROM offset is validated against what the hardware + * addressing supports. In addition, if the offset is larger + * than the actual EPROM, it silently wraps. It will work + * fine, though the reader may not get what they expected + * from the EPROM. + */ + ret = read_length(dd, seg_base + seg_offset, to_copy, + buffer + ncopied); + if (ret) + goto done; + + ncopied += to_copy; + + /* set up for next segment */ + seg_offset = footer->oprom_size; + seg_base += SEG_SIZE; + } + + /* success */ + ret = 0; + *data = buffer; + *size = entry->size; + +done: + kfree(table_buffer); + if (ret) + kfree(buffer); + return ret; +} + +/* + * Read the platform configuration file from the EPROM. + * + * On success, an allocated buffer containing the data and its size are + * returned. It is up to the caller to free this buffer. + * + * Return value: + * 0 - success + * -ENXIO - no EPROM is available + * -EBUSY - not able to acquire access to the EPROM + * -ENOENT - no recognizable file written + * -ENOMEM - buffer could not be allocated + * -EINVAL - invalid EPROM contentents found + */ +int eprom_read_platform_config(struct hfi1_devdata *dd, void **data, u32 *size) +{ + u32 directory[EP_PAGE_DWORDS]; /* aligned buffer */ + int ret; + + if (!dd->eprom_available) + return -ENXIO; + + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); + if (ret) + return -EBUSY; + + /* read the last page of the segment for the EPROM format magic */ + ret = read_length(dd, SEG_SIZE - EP_PAGE_SIZE, EP_PAGE_SIZE, directory); + if (ret) + goto done; + + /* last dword of the segment contains a magic value */ + if (directory[EP_PAGE_DWORDS - 1] == FOOTER_MAGIC) { + /* segment format */ + ret = read_segment_platform_config(dd, directory, data, size); + } else { + /* partition format */ + ret = read_partition_platform_config(dd, data, size); + } + +done: + release_chip_resource(dd, CR_EPROM); + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h new file mode 100644 index 0000000000..772c516366 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +struct hfi1_devdata; + +int eprom_init(struct hfi1_devdata *dd); +int eprom_read_platform_config(struct hfi1_devdata *dd, void **buf_ret, + u32 *size_ret); diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c new file mode 100644 index 0000000000..b86f697c79 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/exp_rcv.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2017 Intel Corporation. + */ + +#include "exp_rcv.h" +#include "trace.h" + +/** + * hfi1_exp_tid_set_init - initialize exp_tid_set + * @set: the set + */ +static void hfi1_exp_tid_set_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +/** + * hfi1_exp_tid_group_init - initialize rcd expected receive + * @rcd: the rcd + */ +void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd) +{ + hfi1_exp_tid_set_init(&rcd->tid_group_list); + hfi1_exp_tid_set_init(&rcd->tid_used_list); + hfi1_exp_tid_set_init(&rcd->tid_full_list); +} + +/** + * hfi1_alloc_ctxt_rcv_groups - initialize expected receive groups + * @rcd: the context to add the groupings to + */ +int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 tidbase; + struct tid_group *grp; + int i; + u32 ngroups; + + ngroups = rcd->expected_count / dd->rcv_entries.group_size; + rcd->groups = + kcalloc_node(ngroups, sizeof(*rcd->groups), + GFP_KERNEL, rcd->numa_id); + if (!rcd->groups) + return -ENOMEM; + tidbase = rcd->expected_base; + for (i = 0; i < ngroups; i++) { + grp = &rcd->groups[i]; + grp->size = dd->rcv_entries.group_size; + grp->base = tidbase; + tid_group_add_tail(grp, &rcd->tid_group_list); + tidbase += dd->rcv_entries.group_size; + } + + return 0; +} + +/** + * hfi1_free_ctxt_rcv_groups - free expected receive groups + * @rcd: the context to free + * + * The routine dismantles the expect receive linked + * list and clears any tids associated with the receive + * context. + * + * This should only be called for kernel contexts and the + * a base user context. + */ +void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) +{ + kfree(rcd->groups); + rcd->groups = NULL; + hfi1_exp_tid_group_init(rcd); + + hfi1_clear_tids(rcd); +} diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h new file mode 100644 index 0000000000..41f7fe5d18 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2017 Intel Corporation. + */ + +#ifndef _HFI1_EXP_RCV_H +#define _HFI1_EXP_RCV_H +#include "hfi.h" + +#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) + +#define EXP_TID_TIDLEN_MASK 0x7FFULL +#define EXP_TID_TIDLEN_SHIFT 0 +#define EXP_TID_TIDCTRL_MASK 0x3ULL +#define EXP_TID_TIDCTRL_SHIFT 20 +#define EXP_TID_TIDIDX_MASK 0x3FFULL +#define EXP_TID_TIDIDX_SHIFT 22 +#define EXP_TID_GET(tid, field) \ + (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) + +#define EXP_TID_SET(field, value) \ + (((value) & EXP_TID_TID##field##_MASK) << \ + EXP_TID_TID##field##_SHIFT) +#define EXP_TID_CLEAR(tid, field) ({ \ + (tid) &= ~(EXP_TID_TID##field##_MASK << \ + EXP_TID_TID##field##_SHIFT); \ + }) +#define EXP_TID_RESET(tid, field, value) do { \ + EXP_TID_CLEAR(tid, field); \ + (tid) |= EXP_TID_SET(field, (value)); \ + } while (0) + +/* + * Define fields in the KDETH header so we can update the header + * template. + */ +#define KDETH_OFFSET_SHIFT 0 +#define KDETH_OFFSET_MASK 0x7fff +#define KDETH_OM_SHIFT 15 +#define KDETH_OM_MASK 0x1 +#define KDETH_TID_SHIFT 16 +#define KDETH_TID_MASK 0x3ff +#define KDETH_TIDCTRL_SHIFT 26 +#define KDETH_TIDCTRL_MASK 0x3 +#define KDETH_INTR_SHIFT 28 +#define KDETH_INTR_MASK 0x1 +#define KDETH_SH_SHIFT 29 +#define KDETH_SH_MASK 0x1 +#define KDETH_KVER_SHIFT 30 +#define KDETH_KVER_MASK 0x3 +#define KDETH_JKEY_SHIFT 0x0 +#define KDETH_JKEY_MASK 0xff +#define KDETH_HCRC_UPPER_SHIFT 16 +#define KDETH_HCRC_UPPER_MASK 0xff +#define KDETH_HCRC_LOWER_SHIFT 24 +#define KDETH_HCRC_LOWER_MASK 0xff + +#define KDETH_GET(val, field) \ + (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) +#define KDETH_SET(dw, field, val) do { \ + u32 dwval = le32_to_cpu(dw); \ + dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ + dwval |= (((val) & KDETH_##field##_MASK) << \ + KDETH_##field##_SHIFT); \ + dw = cpu_to_le32(dwval); \ + } while (0) + +#define KDETH_RESET(dw, field, val) ({ dw = 0; KDETH_SET(dw, field, val); }) + +/* KDETH OM multipliers and switch over point */ +#define KDETH_OM_SMALL 4 +#define KDETH_OM_SMALL_SHIFT 2 +#define KDETH_OM_LARGE 64 +#define KDETH_OM_LARGE_SHIFT 6 +#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) + +struct tid_group { + struct list_head list; + u32 base; + u8 size; + u8 used; + u8 map; +}; + +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. + */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) { + writeq(0, dd->rcvarray_wc + (index * 8)); + if ((index & 3) == 3) + flush_wc(); + } +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline u32 create_tid(u32 rcventry, u32 npages) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)) | + EXP_TID_SET(LEN, npages); +} + +/** + * hfi1_tid_group_to_idx - convert an index to a group + * @rcd - the receive context + * @grp - the group pointer + */ +static inline u16 +hfi1_tid_group_to_idx(struct hfi1_ctxtdata *rcd, struct tid_group *grp) +{ + return grp - &rcd->groups[0]; +} + +/** + * hfi1_idx_to_tid_group - convert a group to an index + * @rcd - the receive context + * @idx - the index + */ +static inline struct tid_group * +hfi1_idx_to_tid_group(struct hfi1_ctxtdata *rcd, u16 idx) +{ + return &rcd->groups[idx]; +} + +int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd); + +#endif /* _HFI1_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c new file mode 100644 index 0000000000..3af77a0840 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2018 Intel Corporation. + */ + +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/bitmap.h> + +#include "debugfs.h" +#include "fault.h" +#include "trace.h" + +#define HFI1_FAULT_DIR_TX BIT(0) +#define HFI1_FAULT_DIR_RX BIT(1) +#define HFI1_FAULT_DIR_TXRX (HFI1_FAULT_DIR_TX | HFI1_FAULT_DIR_RX) + +static void *_fault_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_fault_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _fault_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _fault_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); + } + for_each_possible_cpu(j) { + struct hfi1_opcode_stats_perctx *sp = + per_cpu_ptr(dd->tx_opstats, j); + + n_packets += sp->stats[i].n_packets; + n_bytes += sp->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + if (!ibd->fault->n_rxfaults[i] && !ibd->fault->n_txfaults[i]) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu (faults rx:%llu faults: tx:%llu)\n", i, + (unsigned long long)n_packets, + (unsigned long long)n_bytes, + (unsigned long long)ibd->fault->n_rxfaults[i], + (unsigned long long)ibd->fault->n_txfaults[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(fault_stats); +DEBUGFS_SEQ_FILE_OPEN(fault_stats); +DEBUGFS_FILE_OPS(fault_stats); + +static int fault_opcodes_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t fault_opcodes_write(struct file *file, const char __user *buf, + size_t len, loff_t *pos) +{ + ssize_t ret = 0; + /* 1280 = 256 opcodes * 4 chars/opcode + 255 commas + NULL */ + size_t copy, datalen = 1280; + char *data, *token, *ptr, *end; + struct fault *fault = file->private_data; + + data = kcalloc(datalen, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + copy = min(len, datalen - 1); + if (copy_from_user(data, buf, copy)) { + ret = -EFAULT; + goto free_data; + } + + ret = debugfs_file_get(file->f_path.dentry); + if (unlikely(ret)) + goto free_data; + ptr = data; + token = ptr; + for (ptr = data; *ptr; ptr = end + 1, token = ptr) { + char *dash; + unsigned long range_start, range_end, i; + bool remove = false; + unsigned long bound = 1U << BITS_PER_BYTE; + + end = strchr(ptr, ','); + if (end) + *end = '\0'; + if (token[0] == '-') { + remove = true; + token++; + } + dash = strchr(token, '-'); + if (dash) + *dash = '\0'; + if (kstrtoul(token, 0, &range_start)) + break; + if (dash) { + token = dash + 1; + if (kstrtoul(token, 0, &range_end)) + break; + } else { + range_end = range_start; + } + if (range_start == range_end && range_start == -1UL) { + bitmap_zero(fault->opcodes, sizeof(fault->opcodes) * + BITS_PER_BYTE); + break; + } + /* Check the inputs */ + if (range_start >= bound || range_end >= bound) + break; + + for (i = range_start; i <= range_end; i++) { + if (remove) + clear_bit(i, fault->opcodes); + else + set_bit(i, fault->opcodes); + } + if (!end) + break; + } + ret = len; + + debugfs_file_put(file->f_path.dentry); +free_data: + kfree(data); + return ret; +} + +static ssize_t fault_opcodes_read(struct file *file, char __user *buf, + size_t len, loff_t *pos) +{ + ssize_t ret = 0; + char *data; + size_t datalen = 1280, size = 0; /* see fault_opcodes_write() */ + unsigned long bit = 0, zero = 0; + struct fault *fault = file->private_data; + size_t bitsize = sizeof(fault->opcodes) * BITS_PER_BYTE; + + data = kcalloc(datalen, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + ret = debugfs_file_get(file->f_path.dentry); + if (unlikely(ret)) + goto free_data; + bit = find_first_bit(fault->opcodes, bitsize); + while (bit < bitsize) { + zero = find_next_zero_bit(fault->opcodes, bitsize, bit); + if (zero - 1 != bit) + size += scnprintf(data + size, + datalen - size - 1, + "0x%lx-0x%lx,", bit, zero - 1); + else + size += scnprintf(data + size, + datalen - size - 1, "0x%lx,", + bit); + bit = find_next_bit(fault->opcodes, bitsize, zero); + } + debugfs_file_put(file->f_path.dentry); + data[size - 1] = '\n'; + data[size] = '\0'; + ret = simple_read_from_buffer(buf, len, pos, data, size); +free_data: + kfree(data); + return ret; +} + +static const struct file_operations __fault_opcodes_fops = { + .owner = THIS_MODULE, + .open = fault_opcodes_open, + .read = fault_opcodes_read, + .write = fault_opcodes_write, + .llseek = no_llseek +}; + +void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ + if (ibd->fault) + debugfs_remove_recursive(ibd->fault->dir); + kfree(ibd->fault); + ibd->fault = NULL; +} + +int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + struct dentry *parent = ibd->hfi1_ibdev_dbg; + struct dentry *fault_dir; + + ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL); + if (!ibd->fault) + return -ENOMEM; + + ibd->fault->attr.interval = 1; + ibd->fault->attr.require_end = ULONG_MAX; + ibd->fault->attr.stacktrace_depth = 32; + ibd->fault->attr.dname = NULL; + ibd->fault->attr.verbose = 0; + ibd->fault->enable = false; + ibd->fault->opcode = false; + ibd->fault->fault_skip = 0; + ibd->fault->skip = 0; + ibd->fault->direction = HFI1_FAULT_DIR_TXRX; + ibd->fault->suppress_err = false; + bitmap_zero(ibd->fault->opcodes, + sizeof(ibd->fault->opcodes) * BITS_PER_BYTE); + + fault_dir = + fault_create_debugfs_attr("fault", parent, &ibd->fault->attr); + if (IS_ERR(fault_dir)) { + kfree(ibd->fault); + ibd->fault = NULL; + return -ENOENT; + } + ibd->fault->dir = fault_dir; + + debugfs_create_file("fault_stats", 0444, fault_dir, ibd, + &_fault_stats_file_ops); + debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable); + debugfs_create_bool("suppress_err", 0600, fault_dir, + &ibd->fault->suppress_err); + debugfs_create_bool("opcode_mode", 0600, fault_dir, + &ibd->fault->opcode); + debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault, + &__fault_opcodes_fops); + debugfs_create_u64("skip_pkts", 0600, fault_dir, + &ibd->fault->fault_skip); + debugfs_create_u64("skip_usec", 0600, fault_dir, + &ibd->fault->fault_skip_usec); + debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction); + + return 0; +} + +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + if (ibd->fault) + return ibd->fault->suppress_err; + return false; +} + +static bool __hfi1_should_fault(struct hfi1_ibdev *ibd, u32 opcode, + u8 direction) +{ + bool ret = false; + + if (!ibd->fault || !ibd->fault->enable) + return false; + if (!(ibd->fault->direction & direction)) + return false; + if (ibd->fault->opcode) { + if (bitmap_empty(ibd->fault->opcodes, + (sizeof(ibd->fault->opcodes) * + BITS_PER_BYTE))) + return false; + if (!(test_bit(opcode, ibd->fault->opcodes))) + return false; + } + if (ibd->fault->fault_skip_usec && + time_before(jiffies, ibd->fault->skip_usec)) + return false; + if (ibd->fault->fault_skip && ibd->fault->skip) { + ibd->fault->skip--; + return false; + } + ret = should_fail(&ibd->fault->attr, 1); + if (ret) { + ibd->fault->skip = ibd->fault->fault_skip; + ibd->fault->skip_usec = jiffies + + usecs_to_jiffies(ibd->fault->fault_skip_usec); + } + return ret; +} + +bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) +{ + struct hfi1_ibdev *ibd = to_idev(qp->ibqp.device); + + if (__hfi1_should_fault(ibd, opcode, HFI1_FAULT_DIR_TX)) { + trace_hfi1_fault_opcode(qp, opcode); + ibd->fault->n_txfaults[opcode]++; + return true; + } + return false; +} + +bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) +{ + struct hfi1_ibdev *ibd = &packet->rcd->dd->verbs_dev; + + if (__hfi1_should_fault(ibd, packet->opcode, HFI1_FAULT_DIR_RX)) { + trace_hfi1_fault_packet(packet); + ibd->fault->n_rxfaults[packet->opcode]++; + return true; + } + return false; +} diff --git a/drivers/infiniband/hw/hfi1/fault.h b/drivers/infiniband/hw/hfi1/fault.h new file mode 100644 index 0000000000..7fe7f47219 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/fault.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef _HFI1_FAULT_H +#define _HFI1_FAULT_H + +#include <linux/fault-inject.h> +#include <linux/dcache.h> +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <rdma/rdma_vt.h> + +#include "hfi.h" + +struct hfi1_ibdev; + +#if defined(CONFIG_FAULT_INJECTION) && defined(CONFIG_FAULT_INJECTION_DEBUG_FS) +struct fault { + struct fault_attr attr; + struct dentry *dir; + u64 n_rxfaults[(1U << BITS_PER_BYTE)]; + u64 n_txfaults[(1U << BITS_PER_BYTE)]; + u64 fault_skip; + u64 skip; + u64 fault_skip_usec; + unsigned long skip_usec; + unsigned long opcodes[(1U << BITS_PER_BYTE) / BITS_PER_LONG]; + bool enable; + bool suppress_err; + bool opcode; + u8 direction; +}; + +int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd); +bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode); +bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet); +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd); +void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd); + +#else + +static inline int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + return 0; +} + +static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) +{ + return false; +} + +static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, + u32 opcode) +{ + return false; +} + +static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + return false; +} + +static inline void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ +} +#endif +#endif /* _HFI1_FAULT_H */ diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c new file mode 100644 index 0000000000..a5ab22cedd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -0,0 +1,1714 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2015-2020 Intel Corporation. + */ + +#include <linux/poll.h> +#include <linux/cdev.h> +#include <linux/vmalloc.h> +#include <linux/io.h> +#include <linux/sched/mm.h> +#include <linux/bitmap.h> + +#include <rdma/ib.h> + +#include "hfi.h" +#include "pio.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "mmu_rb.h" +#include "user_sdma.h" +#include "user_exp_rcv.h" +#include "aspm.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */ + +/* + * File operation functions + */ +static int hfi1_file_open(struct inode *inode, struct file *fp); +static int hfi1_file_close(struct inode *inode, struct file *fp); +static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from); +static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt); +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma); + +static u64 kvirt_to_phys(void *addr); +static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len); +static void init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo); +static int init_user_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +static void user_init(struct hfi1_ctxtdata *uctxt); +static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len); +static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len); +static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, + u32 len); +static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg, + u32 len); +static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg, + u32 len); +static int setup_base_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +static int setup_subctxt(struct hfi1_ctxtdata *uctxt); + +static int find_sub_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo); +static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata **cd); +static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt); +static __poll_t poll_urgent(struct file *fp, struct poll_table_struct *pt); +static __poll_t poll_next(struct file *fp, struct poll_table_struct *pt); +static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg); +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg); +static int ctxt_reset(struct hfi1_ctxtdata *uctxt); +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg); +static vm_fault_t vma_fault(struct vm_fault *vmf); +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg); + +static const struct file_operations hfi1_file_ops = { + .owner = THIS_MODULE, + .write_iter = hfi1_write_iter, + .open = hfi1_file_open, + .release = hfi1_file_close, + .unlocked_ioctl = hfi1_file_ioctl, + .poll = hfi1_poll, + .mmap = hfi1_file_mmap, + .llseek = noop_llseek, +}; + +static const struct vm_operations_struct vm_ops = { + .fault = vma_fault, +}; + +/* + * Types of memories mapped into user processes' space + */ +enum mmap_types { + PIO_BUFS = 1, + PIO_BUFS_SOP, + PIO_CRED, + RCV_HDRQ, + RCV_EGRBUF, + UREGS, + EVENTS, + STATUS, + RTAIL, + SUBCTXT_UREGS, + SUBCTXT_RCV_HDRQ, + SUBCTXT_EGRBUF, + SDMA_COMP +}; + +/* + * Masks and offsets defining the mmap tokens + */ +#define HFI1_MMAP_OFFSET_MASK 0xfffULL +#define HFI1_MMAP_OFFSET_SHIFT 0 +#define HFI1_MMAP_SUBCTXT_MASK 0xfULL +#define HFI1_MMAP_SUBCTXT_SHIFT 12 +#define HFI1_MMAP_CTXT_MASK 0xffULL +#define HFI1_MMAP_CTXT_SHIFT 16 +#define HFI1_MMAP_TYPE_MASK 0xfULL +#define HFI1_MMAP_TYPE_SHIFT 24 +#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL +#define HFI1_MMAP_MAGIC_SHIFT 32 + +#define HFI1_MMAP_MAGIC 0xdabbad00 + +#define HFI1_MMAP_TOKEN_SET(field, val) \ + (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT) +#define HFI1_MMAP_TOKEN_GET(field, token) \ + (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK) +#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \ + (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \ + HFI1_MMAP_TOKEN_SET(TYPE, type) | \ + HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \ + HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \ + HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr)))) + +#define dbg(fmt, ...) \ + pr_info(fmt, ##__VA_ARGS__) + +static inline int is_valid_mmap(u64 token) +{ + return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC); +} + +static int hfi1_file_open(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fd; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + + if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1)) + return -EINVAL; + + if (!refcount_inc_not_zero(&dd->user_refcount)) + return -ENXIO; + + /* The real work is performed later in assign_ctxt() */ + + fd = kzalloc(sizeof(*fd), GFP_KERNEL); + + if (!fd || init_srcu_struct(&fd->pq_srcu)) + goto nomem; + spin_lock_init(&fd->pq_rcu_lock); + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + fd->rec_cpu_num = -1; /* no cpu affinity by default */ + fd->dd = dd; + fp->private_data = fd; + return 0; +nomem: + kfree(fd); + fp->private_data = NULL; + if (refcount_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + return -ENOMEM; +} + +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + int ret = 0; + int uval = 0; + + hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd); + if (cmd != HFI1_IOCTL_ASSIGN_CTXT && + cmd != HFI1_IOCTL_GET_VERS && + !uctxt) + return -EINVAL; + + switch (cmd) { + case HFI1_IOCTL_ASSIGN_CTXT: + ret = assign_ctxt(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_CTXT_INFO: + ret = get_ctxt_info(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_USER_INFO: + ret = get_base_info(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_CREDIT_UPD: + if (uctxt) + sc_return_credits(uctxt->sc); + break; + + case HFI1_IOCTL_TID_UPDATE: + ret = user_exp_rcv_setup(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_TID_FREE: + ret = user_exp_rcv_clear(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_TID_INVAL_READ: + ret = user_exp_rcv_invalid(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_RECV_CTRL: + ret = manage_rcvq(uctxt, fd->subctxt, arg); + break; + + case HFI1_IOCTL_POLL_TYPE: + if (get_user(uval, (int __user *)arg)) + return -EFAULT; + uctxt->poll_type = (typeof(uctxt->poll_type))uval; + break; + + case HFI1_IOCTL_ACK_EVENT: + ret = user_event_ack(uctxt, fd->subctxt, arg); + break; + + case HFI1_IOCTL_SET_PKEY: + ret = set_ctxt_pkey(uctxt, arg); + break; + + case HFI1_IOCTL_CTXT_RESET: + ret = ctxt_reset(uctxt); + break; + + case HFI1_IOCTL_GET_VERS: + uval = HFI1_USER_SWVERSION; + if (put_user(uval, (int __user *)arg)) + return -EFAULT; + break; + + default: + return -EINVAL; + } + + return ret; +} + +static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) +{ + struct hfi1_filedata *fd = kiocb->ki_filp->private_data; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq = fd->cq; + int done = 0, reqs = 0; + unsigned long dim = from->nr_segs; + int idx; + + if (!HFI1_CAP_IS_KSET(SDMA)) + return -EINVAL; + if (!from->user_backed) + return -EINVAL; + idx = srcu_read_lock(&fd->pq_srcu); + pq = srcu_dereference(fd->pq, &fd->pq_srcu); + if (!cq || !pq) { + srcu_read_unlock(&fd->pq_srcu, idx); + return -EIO; + } + + trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim); + + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { + srcu_read_unlock(&fd->pq_srcu, idx); + return -ENOSPC; + } + + while (dim) { + const struct iovec *iov = iter_iov(from); + int ret; + unsigned long count = 0; + + ret = hfi1_user_sdma_process_request( + fd, (struct iovec *)(iov + done), + dim, &count); + if (ret) { + reqs = ret; + break; + } + dim -= count; + done += count; + reqs++; + } + + srcu_read_unlock(&fd->pq_srcu, idx); + return reqs; +} + +static inline void mmap_cdbg(u16 ctxt, u8 subctxt, u8 type, u8 mapio, u8 vmf, + u64 memaddr, void *memvirt, dma_addr_t memdma, + ssize_t memlen, struct vm_area_struct *vma) +{ + hfi1_cdbg(PROC, + "%u:%u type:%u io/vf/dma:%d/%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx", + ctxt, subctxt, type, mapio, vmf, !!memdma, + memaddr ?: (u64)memvirt, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); +} + +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd; + unsigned long flags; + u64 token = vma->vm_pgoff << PAGE_SHIFT, + memaddr = 0; + void *memvirt = NULL; + dma_addr_t memdma = 0; + u8 subctxt, mapio = 0, vmf = 0, type; + ssize_t memlen = 0; + int ret = 0; + u16 ctxt; + + if (!is_valid_mmap(token) || !uctxt || + !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto done; + } + dd = uctxt->dd; + ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token); + subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token); + type = HFI1_MMAP_TOKEN_GET(TYPE, token); + if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) { + ret = -EINVAL; + goto done; + } + + /* + * vm_pgoff is used as a buffer selector cookie. Always mmap from + * the beginning. + */ + vma->vm_pgoff = 0; + flags = vma->vm_flags; + + switch (type) { + case PIO_BUFS: + case PIO_BUFS_SOP: + memaddr = ((dd->physaddr + TXE_PIO_SEND) + + /* chip pio base */ + (uctxt->sc->hw_context * BIT(16))) + + /* 64K PIO space / ctxt */ + (type == PIO_BUFS_SOP ? + (TXE_PIO_SIZE / 2) : 0); /* sop? */ + /* + * Map only the amount allocated to the context, not the + * entire available context's PIO space. + */ + memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE); + flags &= ~VM_MAYREAD; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + mapio = 1; + break; + case PIO_CRED: { + u64 cr_page_offset; + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + /* + * The credit return location for this context could be on the + * second or third page allocated for credit returns (if number + * of enabled contexts > 64 and 128 respectively). + */ + cr_page_offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & + PAGE_MASK; + memvirt = dd->cr_base[uctxt->numa_id].va + cr_page_offset; + memdma = dd->cr_base[uctxt->numa_id].dma + cr_page_offset; + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + /* + * The driver has already allocated memory for credit + * returns and programmed it into the chip. Has that + * memory been flagged as non-cached? + */ + /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ + break; + } + case RCV_HDRQ: + memlen = rcvhdrq_size(uctxt); + memvirt = uctxt->rcvhdrq; + memdma = uctxt->rcvhdrq_dma; + break; + case RCV_EGRBUF: { + unsigned long vm_start_save; + unsigned long vm_end_save; + int i; + /* + * The RcvEgr buffer need to be handled differently + * as multiple non-contiguous pages need to be mapped + * into the user process. + */ + memlen = uctxt->egrbufs.size; + if ((vma->vm_end - vma->vm_start) != memlen) { + dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n", + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + if (vma->vm_flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + vm_flags_clear(vma, VM_MAYWRITE); + /* + * Mmap multiple separate allocations into a single vma. From + * here, dma_mmap_coherent() calls dma_direct_mmap(), which + * requires the mmap to exactly fill the vma starting at + * vma_start. Adjust the vma start and end for each eager + * buffer segment mapped. Restore the originals when done. + */ + vm_start_save = vma->vm_start; + vm_end_save = vma->vm_end; + vma->vm_end = vma->vm_start; + for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { + memlen = uctxt->egrbufs.buffers[i].len; + memvirt = uctxt->egrbufs.buffers[i].addr; + memdma = uctxt->egrbufs.buffers[i].dma; + vma->vm_end += memlen; + mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, + memvirt, memdma, memlen, vma); + ret = dma_mmap_coherent(&dd->pcidev->dev, vma, + memvirt, memdma, memlen); + if (ret < 0) { + vma->vm_start = vm_start_save; + vma->vm_end = vm_end_save; + goto done; + } + vma->vm_start += memlen; + } + vma->vm_start = vm_start_save; + vma->vm_end = vm_end_save; + ret = 0; + goto done; + } + case UREGS: + /* + * Map only the page that contains this context's user + * registers. + */ + memaddr = (unsigned long) + (dd->physaddr + RXE_PER_CONTEXT_USER) + + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE); + /* + * TidFlow table is on the same page as the rest of the + * user registers. + */ + memlen = PAGE_SIZE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + mapio = 1; + break; + case EVENTS: + /* + * Use the page where this context's flags are. User level + * knows where it's own bitmap is within the page. + */ + memaddr = (unsigned long) + (dd->events + uctxt_offset(uctxt)) & PAGE_MASK; + memlen = PAGE_SIZE; + /* + * v3.7 removes VM_RESERVED but the effect is kept by + * using VM_IO. + */ + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case STATUS: + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + memaddr = kvirt_to_phys((void *)dd->status); + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + break; + case RTAIL: + if (!HFI1_CAP_IS_USET(DMA_RTAIL)) { + /* + * If the memory allocation failed, the context alloc + * also would have failed, so we would never get here + */ + ret = -EINVAL; + goto done; + } + if ((flags & VM_WRITE) || !hfi1_rcvhdrtail_kvaddr(uctxt)) { + ret = -EPERM; + goto done; + } + memlen = PAGE_SIZE; + memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt); + memdma = uctxt->rcvhdrqtailaddr_dma; + flags &= ~VM_MAYWRITE; + break; + case SUBCTXT_UREGS: + memaddr = (u64)uctxt->subctxt_uregbase; + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_RCV_HDRQ: + memaddr = (u64)uctxt->subctxt_rcvhdr_base; + memlen = rcvhdrq_size(uctxt) * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_EGRBUF: + memaddr = (u64)uctxt->subctxt_rcvegrbuf; + memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + flags &= ~VM_MAYWRITE; + vmf = 1; + break; + case SDMA_COMP: { + struct hfi1_user_sdma_comp_q *cq = fd->cq; + + if (!cq) { + ret = -EFAULT; + goto done; + } + memaddr = (u64)cq->comps; + memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries); + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + } + default: + ret = -EINVAL; + break; + } + + if ((vma->vm_end - vma->vm_start) != memlen) { + hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu", + uctxt->ctxt, fd->subctxt, + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + + vm_flags_reset(vma, flags); + mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, memvirt, memdma, + memlen, vma); + if (vmf) { + vma->vm_pgoff = PFN_DOWN(memaddr); + vma->vm_ops = &vm_ops; + ret = 0; + } else if (memdma) { + ret = dma_mmap_coherent(&dd->pcidev->dev, vma, + memvirt, memdma, memlen); + } else if (mapio) { + ret = io_remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(memaddr), + memlen, + vma->vm_page_prot); + } else if (memvirt) { + ret = remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(memvirt)), + memlen, + vma->vm_page_prot); + } else { + ret = remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(memaddr), + memlen, + vma->vm_page_prot); + } +done: + return ret; +} + +/* + * Local (non-chip) user memory is not mapped right away but as it is + * accessed by the user-level code. + */ +static vm_fault_t vma_fault(struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt; + __poll_t pollflag; + + uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt; + if (!uctxt) + pollflag = EPOLLERR; + else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT) + pollflag = poll_urgent(fp, pt); + else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV) + pollflag = poll_next(fp, pt); + else /* invalid */ + pollflag = EPOLLERR; + + return pollflag; +} + +static int hfi1_file_close(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fdata = fp->private_data; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + unsigned long flags, *ev; + + fp->private_data = NULL; + + if (!uctxt) + goto done; + + hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); + + flush_wc(); + /* drain user sdma queue */ + hfi1_user_sdma_free_queues(fdata, uctxt); + + /* release the cpu */ + hfi1_put_proc_affinity(fdata->rec_cpu_num); + + /* clean up rcv side */ + hfi1_user_exp_rcv_free(fdata); + + /* + * fdata->uctxt is used in the above cleanup. It is not ready to be + * removed until here. + */ + fdata->uctxt = NULL; + hfi1_rcd_put(uctxt); + + /* + * Clear any left over, unhandled events so the next process that + * gets this context doesn't get confused. + */ + ev = dd->events + uctxt_offset(uctxt) + fdata->subctxt; + *ev = 0; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + __clear_bit(fdata->subctxt, uctxt->in_use_ctxts); + if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) { + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + goto done; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS | + HFI1_RCVCTRL_URGENT_DIS, uctxt); + /* Clear the context's J_KEY */ + hfi1_clear_ctxt_jkey(dd, uctxt); + /* + * If a send context is allocated, reset context integrity + * checks to default and disable the send context. + */ + if (uctxt->sc) { + sc_disable(uctxt->sc); + set_pio_integrity(uctxt->sc); + } + + hfi1_free_ctxt_rcv_groups(uctxt); + hfi1_clear_ctxt_pkey(dd, uctxt); + + uctxt->event_flags = 0; + + deallocate_ctxt(uctxt); +done: + + if (refcount_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + cleanup_srcu_struct(&fdata->pq_srcu); + kfree(fdata); + return 0; +} + +/* + * Convert kernel *virtual* addresses to physical addresses. + * This is used to vmalloc'ed addresses. + */ +static u64 kvirt_to_phys(void *addr) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(addr); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +/** + * complete_subctxt - complete sub-context info + * @fd: valid filedata pointer + * + * Sub-context info can only be set up after the base context + * has been completed. This is indicated by the clearing of the + * HFI1_CTXT_BASE_UINIT bit. + * + * Wait for the bit to be cleared, and then complete the subcontext + * initialization. + * + */ +static int complete_subctxt(struct hfi1_filedata *fd) +{ + int ret; + unsigned long flags; + + /* + * sub-context info can only be set up after the base context + * has been completed. + */ + ret = wait_event_interruptible( + fd->uctxt->wait, + !test_bit(HFI1_CTXT_BASE_UNINIT, &fd->uctxt->event_flags)); + + if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags)) + ret = -ENOMEM; + + /* Finish the sub-context init */ + if (!ret) { + fd->rec_cpu_num = hfi1_get_proc_affinity(fd->uctxt->numa_id); + ret = init_user_ctxt(fd, fd->uctxt); + } + + if (ret) { + spin_lock_irqsave(&fd->dd->uctxt_lock, flags); + __clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts); + spin_unlock_irqrestore(&fd->dd->uctxt_lock, flags); + hfi1_rcd_put(fd->uctxt); + fd->uctxt = NULL; + } + + return ret; +} + +static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len) +{ + int ret; + unsigned int swmajor; + struct hfi1_ctxtdata *uctxt = NULL; + struct hfi1_user_info uinfo; + + if (fd->uctxt) + return -EINVAL; + + if (sizeof(uinfo) != len) + return -EINVAL; + + if (copy_from_user(&uinfo, (void __user *)arg, sizeof(uinfo))) + return -EFAULT; + + swmajor = uinfo.userversion >> 16; + if (swmajor != HFI1_USER_SWMAJOR) + return -ENODEV; + + if (uinfo.subctxt_cnt > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; + + /* + * Acquire the mutex to protect against multiple creations of what + * could be a shared base context. + */ + mutex_lock(&hfi1_mutex); + /* + * Get a sub context if available (fd->uctxt will be set). + * ret < 0 error, 0 no context, 1 sub-context found + */ + ret = find_sub_ctxt(fd, &uinfo); + + /* + * Allocate a base context if context sharing is not required or a + * sub context wasn't found. + */ + if (!ret) + ret = allocate_ctxt(fd, fd->dd, &uinfo, &uctxt); + + mutex_unlock(&hfi1_mutex); + + /* Depending on the context type, finish the appropriate init */ + switch (ret) { + case 0: + ret = setup_base_ctxt(fd, uctxt); + if (ret) + deallocate_ctxt(uctxt); + break; + case 1: + ret = complete_subctxt(fd); + break; + default: + break; + } + + return ret; +} + +/** + * match_ctxt - match context + * @fd: valid filedata pointer + * @uinfo: user info to compare base context with + * @uctxt: context to compare uinfo to. + * + * Compare the given context with the given information to see if it + * can be used for a sub context. + */ +static int match_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_devdata *dd = fd->dd; + unsigned long flags; + u16 subctxt; + + /* Skip dynamically allocated kernel contexts */ + if (uctxt->sc && (uctxt->sc->type == SC_KERNEL)) + return 0; + + /* Skip ctxt if it doesn't match the requested one */ + if (memcmp(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)) || + uctxt->jkey != generate_jkey(current_uid()) || + uctxt->subctxt_id != uinfo->subctxt_id || + uctxt->subctxt_cnt != uinfo->subctxt_cnt) + return 0; + + /* Verify the sharing process matches the base */ + if (uctxt->userversion != uinfo->userversion) + return -EINVAL; + + /* Find an unused sub context */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) { + /* context is being closed, do not use */ + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + return 0; + } + + subctxt = find_first_zero_bit(uctxt->in_use_ctxts, + HFI1_MAX_SHARED_CTXTS); + if (subctxt >= uctxt->subctxt_cnt) { + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + return -EBUSY; + } + + fd->subctxt = subctxt; + __set_bit(fd->subctxt, uctxt->in_use_ctxts); + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + fd->uctxt = uctxt; + hfi1_rcd_get(uctxt); + + return 1; +} + +/** + * find_sub_ctxt - fund sub-context + * @fd: valid filedata pointer + * @uinfo: matching info to use to find a possible context to share. + * + * The hfi1_mutex must be held when this function is called. It is + * necessary to ensure serialized creation of shared contexts. + * + * Return: + * 0 No sub-context found + * 1 Subcontext found and allocated + * errno EINVAL (incorrect parameters) + * EBUSY (all sub contexts in use) + */ +static int find_sub_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = fd->dd; + u16 i; + int ret; + + if (!uinfo->subctxt_cnt) + return 0; + + for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) { + uctxt = hfi1_rcd_get_by_index(dd, i); + if (uctxt) { + ret = match_ctxt(fd, uinfo, uctxt); + hfi1_rcd_put(uctxt); + /* value of != 0 will return */ + if (ret) + return ret; + } + } + + return 0; +} + +static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata **rcd) +{ + struct hfi1_ctxtdata *uctxt; + int ret, numa; + + if (dd->flags & HFI1_FROZEN) { + /* + * Pick an error that is unique from all other errors + * that are returned so the user process knows that + * it tried to allocate while the SPC was frozen. It + * it should be able to retry with success in a short + * while. + */ + return -EIO; + } + + if (!dd->freectxts) + return -EBUSY; + + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node. + */ + fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node); + if (fd->rec_cpu_num != -1) + numa = cpu_to_node(fd->rec_cpu_num); + else + numa = numa_node_id(); + ret = hfi1_create_ctxtdata(dd->pport, numa, &uctxt); + if (ret < 0) { + dd_dev_err(dd, "user ctxtdata allocation failed\n"); + return ret; + } + hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)", + uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num, + uctxt->numa_id); + + /* + * Allocate and enable a PIO send context. + */ + uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, dd->node); + if (!uctxt->sc) { + ret = -ENOMEM; + goto ctxdata_free; + } + hfi1_cdbg(PROC, "allocated send context %u(%u)", uctxt->sc->sw_index, + uctxt->sc->hw_context); + ret = sc_enable(uctxt->sc); + if (ret) + goto ctxdata_free; + + /* + * Setup sub context information if the user-level has requested + * sub contexts. + * This has to be done here so the rest of the sub-contexts find the + * proper base context. + * NOTE: _set_bit() can be used here because the context creation is + * protected by the mutex (rather than the spin_lock), and will be the + * very first instance of this context. + */ + __set_bit(0, uctxt->in_use_ctxts); + if (uinfo->subctxt_cnt) + init_subctxts(uctxt, uinfo); + uctxt->userversion = uinfo->userversion; + uctxt->flags = hfi1_cap_mask; /* save current flag state */ + init_waitqueue_head(&uctxt->wait); + strscpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); + memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); + uctxt->jkey = generate_jkey(current_uid()); + hfi1_stats.sps_ctxts++; + /* + * Disable ASPM when there are open user/PSM contexts to avoid + * issues with ASPM L1 exit latency + */ + if (dd->freectxts-- == dd->num_user_contexts) + aspm_disable_all(dd); + + *rcd = uctxt; + + return 0; + +ctxdata_free: + hfi1_free_ctxt(uctxt); + return ret; +} + +static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt) +{ + mutex_lock(&hfi1_mutex); + hfi1_stats.sps_ctxts--; + if (++uctxt->dd->freectxts == uctxt->dd->num_user_contexts) + aspm_enable_all(uctxt->dd); + mutex_unlock(&hfi1_mutex); + + hfi1_free_ctxt(uctxt); +} + +static void init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo) +{ + uctxt->subctxt_cnt = uinfo->subctxt_cnt; + uctxt->subctxt_id = uinfo->subctxt_id; + set_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); +} + +static int setup_subctxt(struct hfi1_ctxtdata *uctxt) +{ + int ret = 0; + u16 num_subctxts = uctxt->subctxt_cnt; + + uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE); + if (!uctxt->subctxt_uregbase) + return -ENOMEM; + + /* We can take the size of the RcvHdr Queue from the master */ + uctxt->subctxt_rcvhdr_base = vmalloc_user(rcvhdrq_size(uctxt) * + num_subctxts); + if (!uctxt->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size * + num_subctxts); + if (!uctxt->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + return 0; + +bail_rhdr: + vfree(uctxt->subctxt_rcvhdr_base); + uctxt->subctxt_rcvhdr_base = NULL; +bail_ureg: + vfree(uctxt->subctxt_uregbase); + uctxt->subctxt_uregbase = NULL; + + return ret; +} + +static void user_init(struct hfi1_ctxtdata *uctxt) +{ + unsigned int rcvctrl_ops = 0; + + /* initialize poll variables... */ + uctxt->urgent = 0; + uctxt->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (hfi1_rcvhdrtail_kvaddr(uctxt)) + clear_rcvhdrtail(uctxt); + + /* Setup J_KEY before enabling the context */ + hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) + rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; + /* + * Ignore the bit in the flags for now until proper + * support for multiple packet per rcv array entry is + * added. + */ + if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + /* + * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. + * We can't rely on the correct value to be set from prior + * uses of the chip or ctxt. Therefore, add the rcvctrl op + * for both cases. + */ + if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + else + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); +} + +static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) +{ + struct hfi1_ctxt_info cinfo; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + if (sizeof(cinfo) != len) + return -EINVAL; + + memset(&cinfo, 0, sizeof(cinfo)); + cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) & + HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET_MASK(uctxt->flags, MASK) | + HFI1_CAP_KGET_MASK(uctxt->flags, K2U); + /* adjust flag if this fd is not able to cache */ + if (!fd->use_mn) + cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */ + + cinfo.num_active = hfi1_count_active_units(); + cinfo.unit = uctxt->dd->unit; + cinfo.ctxt = uctxt->ctxt; + cinfo.subctxt = fd->subctxt; + cinfo.rcvtids = roundup(uctxt->egrbufs.alloced, + uctxt->dd->rcv_entries.group_size) + + uctxt->expected_count; + cinfo.credits = uctxt->sc->credits; + cinfo.numa_node = uctxt->numa_id; + cinfo.rec_cpu = fd->rec_cpu_num; + cinfo.send_ctxt = uctxt->sc->hw_context; + + cinfo.egrtids = uctxt->egrbufs.alloced; + cinfo.rcvhdrq_cnt = get_hdrq_cnt(uctxt); + cinfo.rcvhdrq_entsize = get_hdrqentsize(uctxt) << 2; + cinfo.sdma_ring_size = fd->cq->nentries; + cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size; + + trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo); + if (copy_to_user((void __user *)arg, &cinfo, len)) + return -EFAULT; + + return 0; +} + +static int init_user_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + int ret; + + ret = hfi1_user_sdma_alloc_queues(uctxt, fd); + if (ret) + return ret; + + ret = hfi1_user_exp_rcv_init(fd, uctxt); + if (ret) + hfi1_user_sdma_free_queues(fd, uctxt); + + return ret; +} + +static int setup_base_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_devdata *dd = uctxt->dd; + int ret = 0; + + hfi1_init_ctxt(uctxt->sc); + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + + /* If sub-contexts are enabled, do the appropriate setup */ + if (uctxt->subctxt_cnt) + ret = setup_subctxt(uctxt); + if (ret) + goto done; + + ret = hfi1_alloc_ctxt_rcv_groups(uctxt); + if (ret) + goto done; + + ret = init_user_ctxt(fd, uctxt); + if (ret) { + hfi1_free_ctxt_rcv_groups(uctxt); + goto done; + } + + user_init(uctxt); + + /* Now that the context is set up, the fd can get a reference. */ + fd->uctxt = uctxt; + hfi1_rcd_get(uctxt); + +done: + if (uctxt->subctxt_cnt) { + /* + * On error, set the failed bit so sub-contexts will clean up + * correctly. + */ + if (ret) + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); + + /* + * Base context is done (successfully or not), notify anybody + * using a sub-context that is waiting for this completion. + */ + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } + + return ret; +} + +static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) +{ + struct hfi1_base_info binfo; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned offset; + + trace_hfi1_uctxtdata(uctxt->dd, uctxt, fd->subctxt); + + if (sizeof(binfo) != len) + return -EINVAL; + + memset(&binfo, 0, sizeof(binfo)); + binfo.hw_version = dd->revision; + binfo.sw_version = HFI1_USER_SWVERSION; + binfo.bthqp = RVT_KDETH_QP_PREFIX; + binfo.jkey = uctxt->jkey; + /* + * If more than 64 contexts are enabled the allocated credit + * return will span two or three contiguous pages. Since we only + * map the page containing the context's credit return address, + * we need to calculate the offset in the proper page. + */ + offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE; + binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt, + fd->subctxt, offset); + binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP, + uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt, + fd->subctxt, + uctxt->rcvhdrq); + binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt, + fd->subctxt, + uctxt->egrbufs.rcvtids[0].dma); + binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt, + fd->subctxt, 0); + /* + * user regs are at + * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE)) + */ + binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt, + fd->subctxt, 0); + offset = offset_in_page((uctxt_offset(uctxt) + fd->subctxt) * + sizeof(*dd->events)); + binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt, + fd->subctxt, + offset); + binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt, + fd->subctxt, + dd->status); + if (HFI1_CAP_IS_USET(DMA_RTAIL)) + binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt, + fd->subctxt, 0); + if (uctxt->subctxt_cnt) { + binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF, + uctxt->ctxt, + fd->subctxt, 0); + } + + if (copy_to_user((void __user *)arg, &binfo, len)) + return -EFAULT; + + return 0; +} + +/** + * user_exp_rcv_setup - Set up the given tid rcv list + * @fd: file data of the current driver instance + * @arg: ioctl argumnent for user space information + * @len: length of data structure associated with ioctl command + * + * Wrapper to validate ioctl information before doing _rcv_setup. + * + */ +static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, + u32 len) +{ + int ret; + unsigned long addr; + struct hfi1_tid_info tinfo; + + if (sizeof(tinfo) != len) + return -EINVAL; + + if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo)))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_setup(fd, &tinfo); + if (!ret) { + /* + * Copy the number of tidlist entries we used + * and the length of the buffer we registered. + */ + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + + addr = arg + offsetof(struct hfi1_tid_info, length); + if (!ret && copy_to_user((void __user *)addr, &tinfo.length, + sizeof(tinfo.length))) + ret = -EFAULT; + + if (ret) + hfi1_user_exp_rcv_invalid(fd, &tinfo); + } + + return ret; +} + +/** + * user_exp_rcv_clear - Clear the given tid rcv list + * @fd: file data of the current driver instance + * @arg: ioctl argumnent for user space information + * @len: length of data structure associated with ioctl command + * + * The hfi1_user_exp_rcv_clear() can be called from the error path. Because + * of this, we need to use this wrapper to copy the user space information + * before doing the clear. + */ +static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg, + u32 len) +{ + int ret; + unsigned long addr; + struct hfi1_tid_info tinfo; + + if (sizeof(tinfo) != len) + return -EINVAL; + + if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo)))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_clear(fd, &tinfo); + if (!ret) { + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + return -EFAULT; + } + + return ret; +} + +/** + * user_exp_rcv_invalid - Invalidate the given tid rcv list + * @fd: file data of the current driver instance + * @arg: ioctl argumnent for user space information + * @len: length of data structure associated with ioctl command + * + * Wrapper to validate ioctl information before doing _rcv_invalid. + * + */ +static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg, + u32 len) +{ + int ret; + unsigned long addr; + struct hfi1_tid_info tinfo; + + if (sizeof(tinfo) != len) + return -EINVAL; + + if (!fd->invalid_tids) + return -EINVAL; + + if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo)))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_invalid(fd, &tinfo); + if (ret) + return ret; + + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + + return ret; +} + +static __poll_t poll_urgent(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + __poll_t pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (uctxt->urgent != uctxt->urgent_poll) { + pollflag = EPOLLIN | EPOLLRDNORM; + uctxt->urgent_poll = uctxt->urgent; + } else { + pollflag = 0; + set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static __poll_t poll_next(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + __poll_t pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (hdrqempty(uctxt)) { + set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt); + pollflag = 0; + } else { + pollflag = EPOLLIN | EPOLLRDNORM; + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = ppd->dd; + u16 ctxt; + + if (!dd->events) + return -EINVAL; + + for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts; + ctxt++) { + uctxt = hfi1_rcd_get_by_index(dd, ctxt); + if (uctxt) { + unsigned long *evs; + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + evs = dd->events + uctxt_offset(uctxt); + set_bit(evtbit, evs); + for (i = 1; i < uctxt->subctxt_cnt; i++) + set_bit(evtbit, evs + i); + hfi1_rcd_put(uctxt); + } + } + + return 0; +} + +/** + * manage_rcvq - manage a context's receive queue + * @uctxt: the context + * @subctxt: the sub-context + * @arg: start/stop action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg) +{ + struct hfi1_devdata *dd = uctxt->dd; + unsigned int rcvctrl_op; + int start_stop; + + if (subctxt) + return 0; + + if (get_user(start_stop, (int __user *)arg)) + return -EFAULT; + + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (hfi1_rcvhdrtail_kvaddr(uctxt)) + clear_rcvhdrtail(uctxt); + rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB; + } else { + rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; + } + hfi1_rcvctrl(dd, rcvctrl_op, uctxt); + /* always; new head should be equal to new tail; see above */ + + return 0; +} + +/* + * clear the event notifier events for this context. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. + */ +static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg) +{ + int i; + struct hfi1_devdata *dd = uctxt->dd; + unsigned long *evs; + unsigned long events; + + if (!dd->events) + return 0; + + if (get_user(events, (unsigned long __user *)arg)) + return -EFAULT; + + evs = dd->events + uctxt_offset(uctxt) + subctxt; + + for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + clear_bit(i, evs); + } + return 0; +} + +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg) +{ + int i; + struct hfi1_pportdata *ppd = uctxt->ppd; + struct hfi1_devdata *dd = uctxt->dd; + u16 pkey; + + if (!HFI1_CAP_IS_USET(PKEY_CHECK)) + return -EPERM; + + if (get_user(pkey, (u16 __user *)arg)) + return -EFAULT; + + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) + if (pkey == ppd->pkeys[i]) + return hfi1_set_ctxt_pkey(dd, uctxt, pkey); + + return -ENOENT; +} + +/** + * ctxt_reset - Reset the user context + * @uctxt: valid user context + */ +static int ctxt_reset(struct hfi1_ctxtdata *uctxt) +{ + struct send_context *sc; + struct hfi1_devdata *dd; + int ret = 0; + + if (!uctxt || !uctxt->dd || !uctxt->sc) + return -EINVAL; + + /* + * There is no protection here. User level has to guarantee that + * no one will be writing to the send context while it is being + * re-initialized. If user level breaks that guarantee, it will + * break it's own context and no one else's. + */ + dd = uctxt->dd; + sc = uctxt->sc; + + /* + * Wait until the interrupt handler has marked the context as + * halted or frozen. Report error if we time out. + */ + wait_event_interruptible_timeout( + sc->halt_wait, (sc->flags & SCF_HALTED), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (!(sc->flags & SCF_HALTED)) + return -ENOLCK; + + /* + * If the send context was halted due to a Freeze, wait until the + * device has been "unfrozen" before resetting the context. + */ + if (sc->flags & SCF_FROZEN) { + wait_event_interruptible_timeout( + dd->event_queue, + !(READ_ONCE(dd->flags) & HFI1_FROZEN), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (dd->flags & HFI1_FROZEN) + return -ENOLCK; + + if (dd->flags & HFI1_FORCED_FREEZE) + /* + * Don't allow context reset if we are into + * forced freeze + */ + return -ENODEV; + + sc_disable(sc); + ret = sc_enable(sc); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt); + } else { + ret = sc_restart(sc); + } + if (!ret) + sc_return_credits(sc); + + return ret; +} + +static void user_remove(struct hfi1_devdata *dd) +{ + + hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int user_add(struct hfi1_devdata *dd) +{ + char name[10]; + int ret; + + snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit); + ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops, + &dd->user_cdev, &dd->user_device, + true, &dd->verbs_dev.rdi.ibdev.dev.kobj); + if (ret) + user_remove(dd); + + return ret; +} + +/* + * Create per-unit files in /dev + */ +int hfi1_device_create(struct hfi1_devdata *dd) +{ + return user_add(dd); +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void hfi1_device_remove(struct hfi1_devdata *dd) +{ + user_remove(dd); +} diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c new file mode 100644 index 0000000000..0c0cef5b1e --- /dev/null +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -0,0 +1,2253 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + */ + +#include <linux/firmware.h> +#include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/crc32.h> + +#include "hfi.h" +#include "trace.h" + +/* + * Make it easy to toggle firmware file name and if it gets loaded by + * editing the following. This may be something we do while in development + * but not necessarily something a user would ever need to use. + */ +#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin" +#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw" +#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw" +#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw" +#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw" +#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw" +#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw" +#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw" +#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw" + +MODULE_FIRMWARE(DEFAULT_FW_8051_NAME_ASIC); +MODULE_FIRMWARE(DEFAULT_FW_FABRIC_NAME); +MODULE_FIRMWARE(DEFAULT_FW_SBUS_NAME); +MODULE_FIRMWARE(DEFAULT_FW_PCIE_NAME); + +static uint fw_8051_load = 1; +static uint fw_fabric_serdes_load = 1; +static uint fw_pcie_serdes_load = 1; +static uint fw_sbus_load = 1; + +/* Firmware file names get set in hfi1_firmware_init() based on the above */ +static char *fw_8051_name; +static char *fw_fabric_serdes_name; +static char *fw_sbus_name; +static char *fw_pcie_serdes_name; + +#define SBUS_MAX_POLL_COUNT 100 +#define SBUS_COUNTER(reg, name) \ + (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \ + ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK) + +/* + * Firmware security header. + */ +struct css_header { + u32 module_type; + u32 header_len; + u32 header_version; + u32 module_id; + u32 module_vendor; + u32 date; /* BCD yyyymmdd */ + u32 size; /* in DWORDs */ + u32 key_size; /* in DWORDs */ + u32 modulus_size; /* in DWORDs */ + u32 exponent_size; /* in DWORDs */ + u32 reserved[22]; +}; + +/* expected field values */ +#define CSS_MODULE_TYPE 0x00000006 +#define CSS_HEADER_LEN 0x000000a1 +#define CSS_HEADER_VERSION 0x00010000 +#define CSS_MODULE_VENDOR 0x00008086 + +#define KEY_SIZE 256 +#define MU_SIZE 8 +#define EXPONENT_SIZE 4 + +/* size of platform configuration partition */ +#define MAX_PLATFORM_CONFIG_FILE_SIZE 4096 + +/* size of file of plaform configuration encoded in format version 4 */ +#define PLATFORM_CONFIG_FORMAT_4_FILE_SIZE 528 + +/* the file itself */ +struct firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 firmware[]; +}; + +struct augmented_firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 r2[KEY_SIZE]; + u8 mu[MU_SIZE]; + u8 firmware[]; +}; + +/* augmented file size difference */ +#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \ + sizeof(struct firmware_file)) + +struct firmware_details { + /* Linux core piece */ + const struct firmware *fw; + + struct css_header *css_header; + u8 *firmware_ptr; /* pointer to binary data */ + u32 firmware_len; /* length in bytes */ + u8 *modulus; /* pointer to the modulus */ + u8 *exponent; /* pointer to the exponent */ + u8 *signature; /* pointer to the signature */ + u8 *r2; /* pointer to r2 */ + u8 *mu; /* pointer to mu */ + struct augmented_firmware_file dummy_header; +}; + +/* + * The mutex protects fw_state, fw_err, and all of the firmware_details + * variables. + */ +static DEFINE_MUTEX(fw_mutex); +enum fw_state { + FW_EMPTY, + FW_TRY, + FW_FINAL, + FW_ERR +}; + +static enum fw_state fw_state = FW_EMPTY; +static int fw_err; +static struct firmware_details fw_8051; +static struct firmware_details fw_fabric; +static struct firmware_details fw_pcie; +static struct firmware_details fw_sbus; + +/* flags for turn_off_spicos() */ +#define SPICO_SBUS 0x1 +#define SPICO_FABRIC 0x2 +#define ENABLE_SPICO_SMASK 0x1 + +/* security block commands */ +#define RSA_CMD_INIT 0x1 +#define RSA_CMD_START 0x2 + +/* security block status */ +#define RSA_STATUS_IDLE 0x0 +#define RSA_STATUS_ACTIVE 0x1 +#define RSA_STATUS_DONE 0x2 +#define RSA_STATUS_FAILED 0x3 + +/* RSA engine timeout, in ms */ +#define RSA_ENGINE_TIMEOUT 100 /* ms */ + +/* hardware mutex timeout, in ms */ +#define HM_TIMEOUT 10 /* ms */ + +/* 8051 memory access timeout, in us */ +#define DC8051_ACCESS_TIMEOUT 100 /* us */ + +/* the number of fabric SerDes on the SBus */ +#define NUM_FABRIC_SERDES 4 + +/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */ +#define SBUS_READ_COMPLETE 0x4 + +/* SBus fabric SerDes addresses, one set per HFI */ +static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = { + { 0x01, 0x02, 0x03, 0x04 }, + { 0x28, 0x29, 0x2a, 0x2b } +}; + +/* SBus PCIe SerDes addresses, one set per HFI */ +static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = { + { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, + 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 }, + { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d, + 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d } +}; + +/* SBus PCIe PCS addresses, one set per HFI */ +const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = { + { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17, + 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 }, + { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e } +}; + +/* SBus fabric SerDes broadcast addresses, one per HFI */ +static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 }; +static const u8 all_fabric_serdes_broadcast = 0xe1; + +/* SBus PCIe SerDes broadcast addresses, one per HFI */ +const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 }; +static const u8 all_pcie_serdes_broadcast = 0xe0; + +static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = { + 0, + SYSTEM_TABLE_MAX, + PORT_TABLE_MAX, + RX_PRESET_TABLE_MAX, + TX_PRESET_TABLE_MAX, + QSFP_ATTEN_TABLE_MAX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +/* forwards */ +static void dispose_one_firmware(struct firmware_details *fdet); +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet); +static void dump_fw_version(struct hfi1_devdata *dd); + +/* + * Read a single 64-bit value from 8051 data memory. + * + * Expects: + * o caller to have already set up data read, no auto increment + * o caller to turn off read enable when finished + * + * The address argument is a byte offset. Bits 0:2 in the address are + * ignored - i.e. the hardware will always do aligned 8-byte reads as if + * the lower bits are zero. + * + * Return 0 on success, -ENXIO on a read error (timeout). + */ +static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result) +{ + u64 reg; + int count; + + /* step 1: set the address, clear enable */ + reg = (addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + /* step 2: enable */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, + reg | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout reading 8051 data\n"); + return -ENXIO; + } + ndelay(10); + } + + /* gather the data */ + *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA); + + return 0; +} + +/* + * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks. + * Return 0 on success, -errno on error. + */ +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result) +{ + unsigned long flags; + u32 done; + int ret = 0; + + spin_lock_irqsave(&dd->dc8051_memlock, flags); + + /* data read set-up, no auto-increment */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + for (done = 0; done < len; addr += 8, done += 8, result++) { + ret = __read_8051_data(dd, addr, result); + if (ret) + break; + } + + /* turn off read enable */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + + spin_unlock_irqrestore(&dd->dc8051_memlock, flags); + + return ret; +} + +/* + * Write data or code to the 8051 code or data RAM. + */ +static int write_8051(struct hfi1_devdata *dd, int code, u32 start, + const u8 *data, u32 len) +{ + u64 reg; + u32 offset; + int aligned, count; + + /* check alignment */ + aligned = ((unsigned long)data & 0x7) == 0; + + /* write set-up */ + reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull) + | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg); + + reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT) + | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + + /* write */ + for (offset = 0; offset < len; offset += 8) { + int bytes = len - offset; + + if (bytes < 8) { + reg = 0; + memcpy(®, &data[offset], bytes); + } else if (aligned) { + reg = *(u64 *)&data[offset]; + } else { + memcpy(®, &data[offset], 8); + } + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout writing 8051 data\n"); + return -ENXIO; + } + udelay(1); + } + } + + /* turn off write access, auto increment (also sets to data access) */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + return 0; +} + +/* return 0 if values match, non-zero and complain otherwise */ +static int invalid_header(struct hfi1_devdata *dd, const char *what, + u32 actual, u32 expected) +{ + if (actual == expected) + return 0; + + dd_dev_err(dd, + "invalid firmware header field %s: expected 0x%x, actual 0x%x\n", + what, expected, actual); + return 1; +} + +/* + * Verify that the static fields in the CSS header match. + */ +static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css) +{ + /* verify CSS header fields (most sizes are in DW, so add /4) */ + if (invalid_header(dd, "module_type", css->module_type, + CSS_MODULE_TYPE) || + invalid_header(dd, "header_len", css->header_len, + (sizeof(struct firmware_file) / 4)) || + invalid_header(dd, "header_version", css->header_version, + CSS_HEADER_VERSION) || + invalid_header(dd, "module_vendor", css->module_vendor, + CSS_MODULE_VENDOR) || + invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) || + invalid_header(dd, "modulus_size", css->modulus_size, + KEY_SIZE / 4) || + invalid_header(dd, "exponent_size", css->exponent_size, + EXPONENT_SIZE / 4)) { + return -EINVAL; + } + return 0; +} + +/* + * Make sure there are at least some bytes after the prefix. + */ +static int payload_check(struct hfi1_devdata *dd, const char *name, + long file_size, long prefix_size) +{ + /* make sure we have some payload */ + if (prefix_size >= file_size) { + dd_dev_err(dd, + "firmware \"%s\", size %ld, must be larger than %ld bytes\n", + name, file_size, prefix_size); + return -EINVAL; + } + + return 0; +} + +/* + * Request the firmware from the system. Extract the pieces and fill in + * fdet. If successful, the caller will need to call dispose_one_firmware(). + * Returns 0 on success, -ERRNO on error. + */ +static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, + struct firmware_details *fdet) +{ + struct css_header *css; + int ret; + + memset(fdet, 0, sizeof(*fdet)); + + ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev); + if (ret) { + dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n", + name, ret); + return ret; + } + + /* verify the firmware */ + if (fdet->fw->size < sizeof(struct css_header)) { + dd_dev_err(dd, "firmware \"%s\" is too small\n", name); + ret = -EINVAL; + goto done; + } + css = (struct css_header *)fdet->fw->data; + + hfi1_cdbg(FIRMWARE, "Firmware %s details:", name); + hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size); + hfi1_cdbg(FIRMWARE, "CSS structure:"); + hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type); + hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)", + css->header_len, 4 * css->header_len); + hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version); + hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id); + hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor); + hfi1_cdbg(FIRMWARE, " date 0x%x", css->date); + hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)", + css->size, 4 * css->size); + hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)", + css->key_size, 4 * css->key_size); + hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)", + css->modulus_size, 4 * css->modulus_size); + hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)", + css->exponent_size, 4 * css->exponent_size); + hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes", + fdet->fw->size - sizeof(struct firmware_file)); + + /* + * If the file does not have a valid CSS header, fail. + * Otherwise, check the CSS size field for an expected size. + * The augmented file has r2 and mu inserted after the header + * was generated, so there will be a known difference between + * the CSS header size and the actual file size. Use this + * difference to identify an augmented file. + * + * Note: css->size is in DWORDs, multiply by 4 to get bytes. + */ + ret = verify_css_header(dd, css); + if (ret) { + dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name); + } else if ((css->size * 4) == fdet->fw->size) { + /* non-augmented firmware file */ + struct firmware_file *ff = (struct firmware_file *) + fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = ff->modulus; + fdet->exponent = ff->exponent; + fdet->signature = ff->signature; + fdet->r2 = fdet->dummy_header.r2; /* use dummy space */ + fdet->mu = fdet->dummy_header.mu; /* use dummy space */ + fdet->firmware_ptr = ff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct firmware_file); + /* + * Header does not include r2 and mu - generate here. + * For now, fail. + */ + dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n"); + ret = -EINVAL; + } + } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) { + /* augmented firmware file */ + struct augmented_firmware_file *aff = + (struct augmented_firmware_file *)fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct augmented_firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = aff->modulus; + fdet->exponent = aff->exponent; + fdet->signature = aff->signature; + fdet->r2 = aff->r2; + fdet->mu = aff->mu; + fdet->firmware_ptr = aff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct augmented_firmware_file); + } + } else { + /* css->size check failed */ + dd_dev_err(dd, + "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n", + fdet->fw->size / 4, + (fdet->fw->size - AUGMENT_SIZE) / 4, + css->size); + + ret = -EINVAL; + } + +done: + /* if returning an error, clean up after ourselves */ + if (ret) + dispose_one_firmware(fdet); + return ret; +} + +static void dispose_one_firmware(struct firmware_details *fdet) +{ + release_firmware(fdet->fw); + /* erase all previous information */ + memset(fdet, 0, sizeof(*fdet)); +} + +/* + * Obtain the 4 firmwares from the OS. All must be obtained at once or not + * at all. If called with the firmware state in FW_TRY, use alternate names. + * On exit, this routine will have set the firmware state to one of FW_TRY, + * FW_FINAL, or FW_ERR. + * + * Must be holding fw_mutex. + */ +static void __obtain_firmware(struct hfi1_devdata *dd) +{ + int err = 0; + + if (fw_state == FW_FINAL) /* nothing more to obtain */ + return; + if (fw_state == FW_ERR) /* already in error */ + return; + + /* fw_state is FW_EMPTY or FW_TRY */ +retry: + if (fw_state == FW_TRY) { + /* + * We tried the original and it failed. Move to the + * alternate. + */ + dd_dev_warn(dd, "using alternate firmware names\n"); + /* + * Let others run. Some systems, when missing firmware, does + * something that holds for 30 seconds. If we do that twice + * in a row it triggers task blocked warning. + */ + cond_resched(); + if (fw_8051_load) + dispose_one_firmware(&fw_8051); + if (fw_fabric_serdes_load) + dispose_one_firmware(&fw_fabric); + if (fw_sbus_load) + dispose_one_firmware(&fw_sbus); + if (fw_pcie_serdes_load) + dispose_one_firmware(&fw_pcie); + fw_8051_name = ALT_FW_8051_NAME_ASIC; + fw_fabric_serdes_name = ALT_FW_FABRIC_NAME; + fw_sbus_name = ALT_FW_SBUS_NAME; + fw_pcie_serdes_name = ALT_FW_PCIE_NAME; + + /* + * Add a delay before obtaining and loading debug firmware. + * Authorization will fail if the delay between firmware + * authorization events is shorter than 50us. Add 100us to + * make a delay time safe. + */ + usleep_range(100, 120); + } + + if (fw_sbus_load) { + err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus); + if (err) + goto done; + } + + if (fw_pcie_serdes_load) { + err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie); + if (err) + goto done; + } + + if (fw_fabric_serdes_load) { + err = obtain_one_firmware(dd, fw_fabric_serdes_name, + &fw_fabric); + if (err) + goto done; + } + + if (fw_8051_load) { + err = obtain_one_firmware(dd, fw_8051_name, &fw_8051); + if (err) + goto done; + } + +done: + if (err) { + /* oops, had problems obtaining a firmware */ + if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) { + /* retry with alternate (RTL only) */ + fw_state = FW_TRY; + goto retry; + } + dd_dev_err(dd, "unable to obtain working firmware\n"); + fw_state = FW_ERR; + fw_err = -ENOENT; + } else { + /* success */ + if (fw_state == FW_EMPTY && + dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + fw_state = FW_TRY; /* may retry later */ + else + fw_state = FW_FINAL; /* cannot try again */ + } +} + +/* + * Called by all HFIs when loading their firmware - i.e. device probe time. + * The first one will do the actual firmware load. Use a mutex to resolve + * any possible race condition. + * + * The call to this routine cannot be moved to driver load because the kernel + * call request_firmware() requires a device which is only available after + * the first device probe. + */ +static int obtain_firmware(struct hfi1_devdata *dd) +{ + unsigned long timeout; + + mutex_lock(&fw_mutex); + + /* 40s delay due to long delay on missing firmware on some systems */ + timeout = jiffies + msecs_to_jiffies(40000); + while (fw_state == FW_TRY) { + /* + * Another device is trying the firmware. Wait until it + * decides what works (or not). + */ + if (time_after(jiffies, timeout)) { + /* waited too long */ + dd_dev_err(dd, "Timeout waiting for firmware try"); + fw_state = FW_ERR; + fw_err = -ETIMEDOUT; + break; + } + mutex_unlock(&fw_mutex); + msleep(20); /* arbitrary delay */ + mutex_lock(&fw_mutex); + } + /* not in FW_TRY state */ + + /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */ + if (fw_state == FW_EMPTY) + __obtain_firmware(dd); + + mutex_unlock(&fw_mutex); + return fw_err; +} + +/* + * Called when the driver unloads. The timing is asymmetric with its + * counterpart, obtain_firmware(). If called at device remove time, + * then it is conceivable that another device could probe while the + * firmware is being disposed. The mutexes can be moved to do that + * safely, but then the firmware would be requested from the OS multiple + * times. + * + * No mutex is needed as the driver is unloading and there cannot be any + * other callers. + */ +void dispose_firmware(void) +{ + dispose_one_firmware(&fw_8051); + dispose_one_firmware(&fw_fabric); + dispose_one_firmware(&fw_pcie); + dispose_one_firmware(&fw_sbus); + + /* retain the error state, otherwise revert to empty */ + if (fw_state != FW_ERR) + fw_state = FW_EMPTY; +} + +/* + * Called with the result of a firmware download. + * + * Return 1 to retry loading the firmware, 0 to stop. + */ +static int retry_firmware(struct hfi1_devdata *dd, int load_result) +{ + int retry; + + mutex_lock(&fw_mutex); + + if (load_result == 0) { + /* + * The load succeeded, so expect all others to do the same. + * Do not retry again. + */ + if (fw_state == FW_TRY) + fw_state = FW_FINAL; + retry = 0; /* do NOT retry */ + } else if (fw_state == FW_TRY) { + /* load failed, obtain alternate firmware */ + __obtain_firmware(dd); + retry = (fw_state == FW_FINAL); + } else { + /* else in FW_FINAL or FW_ERR, no retry in either case */ + retry = 0; + } + + mutex_unlock(&fw_mutex); + return retry; +} + +/* + * Write a block of data to a given array CSR. All calls will be in + * multiples of 8 bytes. + */ +static void write_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + int qw_size = nbytes / 8; + int i; + + if (((unsigned long)data & 0x7) == 0) { + /* aligned */ + u64 *ptr = (u64 *)data; + + for (i = 0; i < qw_size; i++, ptr++) + write_csr(dd, what + (8 * i), *ptr); + } else { + /* not aligned */ + for (i = 0; i < qw_size; i++, data += 8) { + u64 value; + + memcpy(&value, data, 8); + write_csr(dd, what + (8 * i), value); + } + } +} + +/* + * Write a block of data to a given CSR as a stream of writes. All calls will + * be in multiples of 8 bytes. + */ +static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + u64 *ptr = (u64 *)data; + int qw_size = nbytes / 8; + + for (; qw_size > 0; qw_size--, ptr++) + write_csr(dd, what, *ptr); +} + +/* + * Download the signature and start the RSA mechanism. Wait for + * RSA_ENGINE_TIMEOUT before giving up. + */ +static int run_rsa(struct hfi1_devdata *dd, const char *who, + const u8 *signature) +{ + unsigned long timeout; + u64 reg; + u32 status; + int ret = 0; + + /* write the signature */ + write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE); + + /* initialize RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT); + + /* + * Make sure the engine is idle and insert a delay between the two + * writes to MISC_CFG_RSA_CMD. + */ + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + if (status != RSA_STATUS_IDLE) { + dd_dev_err(dd, "%s security engine not idle - giving up\n", + who); + return -EBUSY; + } + + /* start RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START); + + /* + * Look for the result. + * + * The RSA engine is hooked up to two MISC errors. The driver + * masks these errors as they do not respond to the standard + * error "clear down" mechanism. Look for these errors here and + * clear them when possible. This routine will exit with the + * errors of the current run still set. + * + * MISC_FW_AUTH_FAILED_ERR + * Firmware authorization failed. This can be cleared by + * re-initializing the RSA engine, then clearing the status bit. + * Do not re-init the RSA angine immediately after a successful + * run - this will reset the current authorization. + * + * MISC_KEY_MISMATCH_ERR + * Key does not match. The only way to clear this is to load + * a matching key then clear the status bit. If this error + * is raised, it will persist outside of this routine until a + * matching key is loaded. + */ + timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies; + while (1) { + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + + if (status == RSA_STATUS_IDLE) { + /* should not happen */ + dd_dev_err(dd, "%s firmware security bad idle state\n", + who); + ret = -EINVAL; + break; + } else if (status == RSA_STATUS_DONE) { + /* finished successfully */ + break; + } else if (status == RSA_STATUS_FAILED) { + /* finished unsuccessfully */ + ret = -EINVAL; + break; + } + /* else still active */ + + if (time_after(jiffies, timeout)) { + /* + * Timed out while active. We can't reset the engine + * if it is stuck active, but run through the + * error code to see what error bits are set. + */ + dd_dev_err(dd, "%s firmware security time out\n", who); + ret = -ETIMEDOUT; + break; + } + + msleep(20); + } + + /* + * Arrive here on success or failure. Clear all RSA engine + * errors. All current errors will stick - the RSA logic is keeping + * error high. All previous errors will clear - the RSA logic + * is not keeping the error high. + */ + write_csr(dd, MISC_ERR_CLEAR, + MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK | + MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK); + /* + * All that is left are the current errors. Print warnings on + * authorization failure details, if any. Firmware authorization + * can be retried, so these are only warnings. + */ + reg = read_csr(dd, MISC_ERR_STATUS); + if (ret) { + if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK) + dd_dev_warn(dd, "%s firmware authorization failed\n", + who); + if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK) + dd_dev_warn(dd, "%s firmware key mismatch\n", who); + } + + return ret; +} + +static void load_security_variables(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + /* Security variables a. Write the modulus */ + write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE); + /* Security variables b. Write the r2 */ + write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE); + /* Security variables c. Write the mu */ + write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE); + /* Security variables d. Write the header */ + write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, + (u8 *)fdet->css_header, + sizeof(struct css_header)); +} + +/* return the 8051 firmware state */ +static inline u32 get_firmware_state(struct hfi1_devdata *dd) +{ + u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + + return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT) + & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK; +} + +/* + * Wait until the firmware is up and ready to take host requests. + * Return 0 on success, -ETIMEDOUT on timeout. + */ +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout) +{ + unsigned long timeout; + + /* in the simulator, the fake 8051 is always ready */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return 0; + + timeout = msecs_to_jiffies(mstimeout) + jiffies; + while (1) { + if (get_firmware_state(dd) == 0xa0) /* ready */ + return 0; + if (time_after(jiffies, timeout)) /* timed out */ + return -ETIMEDOUT; + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } +} + +/* + * Load the 8051 firmware. + */ +static int load_8051_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + u64 reg; + int ret; + u8 ver_major; + u8 ver_minor; + u8 ver_patch; + + /* + * DC Reset sequence + * Load DC 8051 firmware + */ + /* + * DC reset step 1: Reset DC8051 + */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK + | DC_DC8051_CFG_RST_CRAM_SMASK + | DC_DC8051_CFG_RST_DRAM_SMASK + | DC_DC8051_CFG_RST_IRAM_SMASK + | DC_DC8051_CFG_RST_SFR_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* + * DC reset step 2 (optional): Load 8051 data memory with link + * configuration + */ + + /* + * DC reset step 3: Load DC8051 firmware + */ + /* release all but the core reset */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* Firmware load step 1 */ + load_security_variables(dd, fdet); + + /* + * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, 0); + + /* Firmware load steps 3-5 */ + ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr, + fdet->firmware_len); + if (ret) + return ret; + + /* + * DC reset step 4. Host starts the DC8051 firmware + */ + /* + * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK); + + /* Firmware load steps 7-10 */ + ret = run_rsa(dd, "8051", fdet->signature); + if (ret) + return ret; + + /* clear all reset bits, releasing the 8051 */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + + /* + * DC reset step 5. Wait for firmware to be ready to accept host + * requests. + */ + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { /* timed out */ + dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", + get_firmware_state(dd)); + return -ETIMEDOUT; + } + + read_misc_status(dd, &ver_major, &ver_minor, &ver_patch); + dd_dev_info(dd, "8051 firmware version %d.%d.%d\n", + (int)ver_major, (int)ver_minor, (int)ver_patch); + dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch); + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + return -EIO; + } + + return 0; +} + +/* + * Write the SBus request register + * + * No need for masking - the arguments are sized exactly. + */ +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + write_csr(dd, ASIC_CFG_SBUS_REQUEST, + ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) | + ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) | + ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) | + ((u64)receiver_addr << + ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); +} + +/* + * Read a value from the SBus. + * + * Requires the caller to be in fast mode + */ +static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, + u32 data_in) +{ + u64 reg; + int retries; + int success = 0; + u32 result = 0; + u32 result_code = 0; + + sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in); + + for (retries = 0; retries < 100; retries++) { + usleep_range(1000, 1200); /* arbitrary */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT) + & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK; + if (result_code != SBUS_READ_COMPLETE) + continue; + + success = 1; + result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT) + & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK; + break; + } + + if (!success) { + dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__, + result_code); + } + + return result; +} + +/* + * Turn off the SBus and fabric serdes spicos. + * + * + Must be called with Sbus fast mode turned on. + * + Must be called after fabric serdes broadcast is set up. + * + Must be called before the 8051 is loaded - assumes 8051 is not loaded + * when using MISC_CFG_FW_CTRL. + */ +static void turn_off_spicos(struct hfi1_devdata *dd, int flags) +{ + /* only needed on A0 */ + if (!is_ax(dd)) + return; + + dd_dev_info(dd, "Turning off spicos:%s%s\n", + flags & SPICO_SBUS ? " SBus" : "", + flags & SPICO_FABRIC ? " fabric" : ""); + + write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK); + /* disable SBus spico */ + if (flags & SPICO_SBUS) + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01, + WRITE_SBUS_RECEIVER, 0x00000040); + + /* disable the fabric serdes spicos */ + if (flags & SPICO_FABRIC) + sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id], + 0x07, WRITE_SBUS_RECEIVER, 0x00000000); + write_csr(dd, MISC_CFG_FW_CTRL, 0); +} + +/* + * Reset all of the fabric serdes for this HFI in preparation to take the + * link to Polling. + * + * To do a reset, we need to write to the serdes registers. Unfortunately, + * the fabric serdes download to the other HFI on the ASIC will have turned + * off the firmware validation on this HFI. This means we can't write to the + * registers to reset the serdes. Work around this by performing a complete + * re-download and validation of the fabric serdes firmware. This, as a + * by-product, will reset the serdes. NOTE: the re-download requires that + * the 8051 be in the Offline state. I.e. not actively trying to use the + * serdes. This routine is called at the point where the link is Offline and + * is getting ready to go to Polling. + */ +void fabric_serdes_reset(struct hfi1_devdata *dd) +{ + int ret; + + if (!fw_fabric_serdes_load) + return; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n"); + return; + } + set_sbus_fast_mode(dd); + + if (is_ax(dd)) { + /* A0 serdes do not work with a re-download */ + u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; + + /* place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + } else { + turn_off_spicos(dd, SPICO_FABRIC); + /* + * No need for firmware retry - what to download has already + * been decided. + * No need to pay attention to the load return - the only + * failure is a validation failure, which has already been + * checked by the initial download. + */ + (void)load_fabric_serdes_firmware(dd, &fw_fabric); + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} + +/* Access to the SBus in this routine should probably be serialized */ +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + u64 reg, count = 0; + + /* make sure fast mode is clear */ + clear_sbus_fast_mode(dd); + + sbus_request(dd, receiver_addr, data_addr, command, data_in); + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK); + /* Wait for both DONE and RCV_DATA_VALID to go high */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) { + if (count++ >= SBUS_MAX_POLL_COUNT) { + u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + /* + * If the loop has timed out, we are OK if DONE bit + * is set and RCV_DATA_VALID and EXECUTE counters + * are the same. If not, we cannot proceed. + */ + if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (SBUS_COUNTER(counts, RCV_DATA_VALID) == + SBUS_COUNTER(counts, EXECUTE))) + break; + return -ETIMEDOUT; + } + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + count = 0; + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); + /* Wait for DONE to clear after EXECUTE is cleared */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) { + if (count++ >= SBUS_MAX_POLL_COUNT) + return -ETIME; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + return 0; +} + +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */ + + dd_dev_info(dd, "Downloading fabric firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* step 3: remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* step 4: assert IMEM override */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000); + /* step 5: download SerDes machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: IMEM override off */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "fabric serdes", fdet->signature); + if (err) + return err; + + /* step 12: turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + /* step 13: enable core hardware interrupts */ + sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000); + + return 0; +} + +static int load_sbus_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading SBus firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SPICO into reset and enable off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0); + /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240); + /* step 4: set starting IMEM address for burst download */ + sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000); + /* step 5: download the SBus Master machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: set IMEM_CNTL_EN off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "SBus", fdet->signature); + if (err) + return err; + + /* step 12: set SPICO_ENABLE on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + + return 0; +} + +static int load_pcie_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading PCIe firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: assert single step (halts the SBus Master spico) */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001); + /* step 3: enable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40); + /* step 4: load firmware into SBus Master XDMEM */ + /* + * NOTE: the dmem address, write_en, and wdata are all pre-packed, + * we only need to pick up the bytes and write them + */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 5: disable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + /* step 6: allow SBus Spico to run */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000); + + /* + * steps 7-11: run RSA, if it succeeds, firmware is available to + * be swapped + */ + return run_rsa(dd, "PCIe serdes", fdet->signature); +} + +/* + * Set the given broadcast values on the given list of devices. + */ +static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2, + const u8 *addrs, int count) +{ + while (--count >= 0) { + /* + * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave + * defaults for everything else. Do not read-modify-write, + * per instruction from the manufacturer. + * + * Register 0xfd: + * bits what + * ----- --------------------------------- + * 0 IGNORE_BROADCAST (default 0) + * 11:4 BROADCAST_GROUP_1 (default 0xff) + * 23:16 BROADCAST_GROUP_2 (default 0xff) + */ + sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER, + (u32)bg1 << 4 | (u32)bg2 << 16); + } +} + +int acquire_hw_mutex(struct hfi1_devdata *dd) +{ + unsigned long timeout; + int try = 0; + u8 mask = 1 << dd->hfi1_id; + u8 user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + + if (user == mask) { + dd_dev_info(dd, + "Hardware mutex already acquired, mutex mask %u\n", + (u32)mask); + return 0; + } + +retry: + timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies; + while (1) { + write_csr(dd, ASIC_CFG_MUTEX, mask); + user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + if (user == mask) + return 0; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + msleep(20); + } + + /* timed out */ + dd_dev_err(dd, + "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n", + (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up"); + + if (try == 0) { + /* break mutex and retry */ + write_csr(dd, ASIC_CFG_MUTEX, 0); + try++; + goto retry; + } + + return -EBUSY; +} + +void release_hw_mutex(struct hfi1_devdata *dd) +{ + u8 mask = 1 << dd->hfi1_id; + u8 user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + + if (user != mask) + dd_dev_warn(dd, + "Unable to release hardware mutex, mutex mask %u, my mask %u\n", + (u32)user, (u32)mask); + else + write_csr(dd, ASIC_CFG_MUTEX, 0); +} + +/* return the given resource bit(s) as a mask for the given HFI */ +static inline u64 resource_mask(u32 hfi1_id, u32 resource) +{ + return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0); +} + +static void fail_mutex_acquire_message(struct hfi1_devdata *dd, + const char *func) +{ + dd_dev_err(dd, + "%s: hardware mutex stuck - suggest rebooting the machine\n", + func); +} + +/* + * Acquire access to a chip resource. + * + * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed. + */ +static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, all_bits, my_bit; + int ret; + + if (resource & CR_DYN_MASK) { + /* a dynamic resource is in use if either HFI has set the bit */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 && + (resource & (CR_I2C1 | CR_I2C2))) { + /* discrete devices must serialize across both chains */ + all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) | + resource_mask(1, CR_I2C1 | CR_I2C2); + } else { + all_bits = resource_mask(0, resource) | + resource_mask(1, resource); + } + my_bit = resource_mask(dd->hfi1_id, resource); + } else { + /* non-dynamic resources are not split between HFIs */ + all_bits = resource; + my_bit = resource; + } + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + ret = acquire_hw_mutex(dd); + if (ret) { + fail_mutex_acquire_message(dd, __func__); + ret = -EIO; + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if (scratch0 & all_bits) { + ret = -EBUSY; + } else { + write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); + return ret; +} + +/* + * Acquire access to a chip resource, wait up to mswait milliseconds for + * the resource to become available. + * + * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex + * acquire failed. + */ +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait) +{ + unsigned long timeout; + int ret; + + timeout = jiffies + msecs_to_jiffies(mswait); + while (1) { + ret = __acquire_chip_resource(dd, resource); + if (ret != -EBUSY) + return ret; + /* resource is busy, check our timeout */ + if (time_after_eq(jiffies, timeout)) + return -EBUSY; + usleep_range(80, 120); /* arbitrary delay */ + } +} + +/* + * Release access to a chip resource + */ +void release_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, bit; + + /* only dynamic resources should ever be cleared */ + if (!(resource & CR_DYN_MASK)) { + dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__, + resource); + return; + } + bit = resource_mask(dd->hfi1_id, resource); + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, __func__); + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) != 0) { + scratch0 &= ~bit; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } else { + dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n", + __func__, dd->hfi1_id, resource); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +/* + * Return true if resource is set, false otherwise. Print a warning + * if not set and a function is supplied. + */ +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func) +{ + u64 scratch0, bit; + + if (resource & CR_DYN_MASK) + bit = resource_mask(dd->hfi1_id, resource); + else + bit = resource; + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) == 0) { + if (func) + dd_dev_warn(dd, + "%s: id %d, resource 0x%x, not acquired!\n", + func, dd->hfi1_id, resource); + return false; + } + return true; +} + +static void clear_chip_resources(struct hfi1_devdata *dd, const char *func) +{ + u64 scratch0; + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, func); + goto done; + } + + /* clear all dynamic access bits for this HFI */ + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK); + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +void init_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void finish_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void set_sbus_fast_mode(struct hfi1_devdata *dd) +{ + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK); +} + +void clear_sbus_fast_mode(struct hfi1_devdata *dd) +{ + u64 reg, count = 0; + + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + while (SBUS_COUNTER(reg, EXECUTE) != + SBUS_COUNTER(reg, RCV_DATA_VALID)) { + if (count++ >= SBUS_MAX_POLL_COUNT) + break; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + } + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); +} + +int load_firmware(struct hfi1_devdata *dd) +{ + int ret; + + if (fw_fabric_serdes_load) { + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) + return ret; + + set_sbus_fast_mode(dd); + + set_serdes_broadcast(dd, all_fabric_serdes_broadcast, + fabric_serdes_broadcast[dd->hfi1_id], + fabric_serdes_addrs[dd->hfi1_id], + NUM_FABRIC_SERDES); + turn_off_spicos(dd, SPICO_FABRIC); + do { + ret = load_fabric_serdes_firmware(dd, &fw_fabric); + } while (retry_firmware(dd, ret)); + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); + if (ret) + return ret; + } + + if (fw_8051_load) { + do { + ret = load_8051_firmware(dd, &fw_8051); + } while (retry_firmware(dd, ret)); + if (ret) + return ret; + } + + dump_fw_version(dd); + return 0; +} + +int hfi1_firmware_init(struct hfi1_devdata *dd) +{ + /* only RTL can use these */ + if (dd->icode != ICODE_RTL_SILICON) { + fw_fabric_serdes_load = 0; + fw_pcie_serdes_load = 0; + fw_sbus_load = 0; + } + + /* no 8051 or QSFP on simulator */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + fw_8051_load = 0; + + if (!fw_8051_name) { + if (dd->icode == ICODE_RTL_SILICON) + fw_8051_name = DEFAULT_FW_8051_NAME_ASIC; + else + fw_8051_name = DEFAULT_FW_8051_NAME_FPGA; + } + if (!fw_fabric_serdes_name) + fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME; + if (!fw_sbus_name) + fw_sbus_name = DEFAULT_FW_SBUS_NAME; + if (!fw_pcie_serdes_name) + fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME; + + return obtain_firmware(dd); +} + +/* + * This function is a helper function for parse_platform_config(...) and + * does not check for validity of the platform configuration cache + * (because we know it is invalid as we are building up the cache). + * As such, this should not be called from anywhere other than + * parse_platform_config + */ +static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table) +{ + u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + + if (!system_table) + return -EINVAL; + + meta_ver_meta = + *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata + + SYSTEM_TABLE_META_VERSION); + + mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + ver_start = meta_ver_meta & mask; + + meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT; + + mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + ver_len = meta_ver_meta & mask; + + ver_start /= 8; + meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1); + + if (meta_ver < 4) { + dd_dev_info( + dd, "%s:Please update platform config\n", __func__); + return -EINVAL; + } + return 0; +} + +int parse_platform_config(struct hfi1_devdata *dd) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + struct hfi1_pportdata *ppd = dd->pport; + u32 *ptr = NULL; + u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0; + u32 record_idx = 0, table_type = 0, table_length_dwords = 0; + int ret = -EINVAL; /* assume failure */ + + /* + * For integrated devices that did not fall back to the default file, + * the SI tuning information for active channels is acquired from the + * scratch register bitmap, thus there is no platform config to parse. + * Skip parsing in these situations. + */ + if (ppd->config_from_scratch) + return 0; + + if (!dd->platform_config.data) { + dd_dev_err(dd, "%s: Missing config file\n", __func__); + ret = -EINVAL; + goto bail; + } + ptr = (u32 *)dd->platform_config.data; + + magic_num = *ptr; + ptr++; + if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) { + dd_dev_err(dd, "%s: Bad config file\n", __func__); + ret = -EINVAL; + goto bail; + } + + /* Field is file size in DWORDs */ + file_length = (*ptr) * 4; + + /* + * Length can't be larger than partition size. Assume platform + * config format version 4 is being used. Interpret the file size + * field as header instead by not moving the pointer. + */ + if (file_length > MAX_PLATFORM_CONFIG_FILE_SIZE) { + dd_dev_info(dd, + "%s:File length out of bounds, using alternative format\n", + __func__); + file_length = PLATFORM_CONFIG_FORMAT_4_FILE_SIZE; + } else { + ptr++; + } + + if (file_length > dd->platform_config.size) { + dd_dev_info(dd, "%s:File claims to be larger than read size\n", + __func__); + ret = -EINVAL; + goto bail; + } else if (file_length < dd->platform_config.size) { + dd_dev_info(dd, + "%s:File claims to be smaller than read size, continuing\n", + __func__); + } + /* exactly equal, perfection */ + + /* + * In both cases where we proceed, using the self-reported file length + * is the safer option. In case of old format a predefined value is + * being used. + */ + while (ptr < (u32 *)(dd->platform_config.data + file_length)) { + header1 = *ptr; + header2 = *(ptr + 1); + if (header1 != ~header2) { + dd_dev_err(dd, "%s: Failed validation at offset %ld\n", + __func__, (ptr - (u32 *) + dd->platform_config.data)); + ret = -EINVAL; + goto bail; + } + + record_idx = *ptr & + ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1); + + table_length_dwords = (*ptr >> + PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1); + + table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1); + + /* Done with this set of headers */ + ptr += 2; + + if (record_idx) { + /* data table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + pcfgcache->config_tables[table_type].num_table = + 1; + ret = check_meta_version(dd, ptr); + if (ret) + goto bail; + break; + case PLATFORM_CONFIG_PORT_TABLE: + pcfgcache->config_tables[table_type].num_table = + 2; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + case PLATFORM_CONFIG_TX_PRESET_TABLE: + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + pcfgcache->config_tables[table_type].num_table = + table_length_dwords; + break; + default: + dd_dev_err(dd, + "%s: Unknown data table %d, offset %ld\n", + __func__, table_type, + (ptr - (u32 *) + dd->platform_config.data)); + ret = -EINVAL; + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table = ptr; + } else { + /* metadata table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + case PLATFORM_CONFIG_PORT_TABLE: + case PLATFORM_CONFIG_RX_PRESET_TABLE: + case PLATFORM_CONFIG_TX_PRESET_TABLE: + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + break; + default: + dd_dev_err(dd, + "%s: Unknown meta table %d, offset %ld\n", + __func__, table_type, + (ptr - + (u32 *)dd->platform_config.data)); + ret = -EINVAL; + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table_metadata = + ptr; + } + + /* Calculate and check table crc */ + crc = crc32_le(~(u32)0, (unsigned char const *)ptr, + (table_length_dwords * 4)); + crc ^= ~(u32)0; + + /* Jump the table */ + ptr += table_length_dwords; + if (crc != *ptr) { + dd_dev_err(dd, "%s: Failed CRC check at offset %ld\n", + __func__, (ptr - + (u32 *)dd->platform_config.data)); + ret = -EINVAL; + goto bail; + } + /* Jump the CRC DWORD */ + ptr++; + } + + pcfgcache->cache_valid = 1; + return 0; +bail: + memset(pcfgcache, 0, sizeof(struct platform_config_cache)); + return ret; +} + +static void get_integrated_platform_config_field( + struct hfi1_devdata *dd, + enum platform_config_table_type_encoding table_type, + int field_index, u32 *data) +{ + struct hfi1_pportdata *ppd = dd->pport; + u8 *cache = ppd->qsfp_info.cache; + u32 tx_preset = 0; + + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + if (field_index == SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) + *data = ppd->max_power_class; + else if (field_index == SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G) + *data = ppd->default_atten; + break; + case PLATFORM_CONFIG_PORT_TABLE: + if (field_index == PORT_TABLE_PORT_TYPE) + *data = ppd->port_type; + else if (field_index == PORT_TABLE_LOCAL_ATTEN_25G) + *data = ppd->local_atten; + else if (field_index == PORT_TABLE_REMOTE_ATTEN_25G) + *data = ppd->remote_atten; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + if (field_index == RX_PRESET_TABLE_QSFP_RX_CDR_APPLY) + *data = (ppd->rx_preset & QSFP_RX_CDR_APPLY_SMASK) >> + QSFP_RX_CDR_APPLY_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_EMP_APPLY) + *data = (ppd->rx_preset & QSFP_RX_EMP_APPLY_SMASK) >> + QSFP_RX_EMP_APPLY_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_AMP_APPLY) + *data = (ppd->rx_preset & QSFP_RX_AMP_APPLY_SMASK) >> + QSFP_RX_AMP_APPLY_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_CDR) + *data = (ppd->rx_preset & QSFP_RX_CDR_SMASK) >> + QSFP_RX_CDR_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_EMP) + *data = (ppd->rx_preset & QSFP_RX_EMP_SMASK) >> + QSFP_RX_EMP_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_AMP) + *data = (ppd->rx_preset & QSFP_RX_AMP_SMASK) >> + QSFP_RX_AMP_SHIFT; + break; + case PLATFORM_CONFIG_TX_PRESET_TABLE: + if (cache[QSFP_EQ_INFO_OFFS] & 0x4) + tx_preset = ppd->tx_preset_eq; + else + tx_preset = ppd->tx_preset_noeq; + if (field_index == TX_PRESET_TABLE_PRECUR) + *data = (tx_preset & TX_PRECUR_SMASK) >> + TX_PRECUR_SHIFT; + else if (field_index == TX_PRESET_TABLE_ATTN) + *data = (tx_preset & TX_ATTN_SMASK) >> + TX_ATTN_SHIFT; + else if (field_index == TX_PRESET_TABLE_POSTCUR) + *data = (tx_preset & TX_POSTCUR_SMASK) >> + TX_POSTCUR_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_CDR_APPLY) + *data = (tx_preset & QSFP_TX_CDR_APPLY_SMASK) >> + QSFP_TX_CDR_APPLY_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_EQ_APPLY) + *data = (tx_preset & QSFP_TX_EQ_APPLY_SMASK) >> + QSFP_TX_EQ_APPLY_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_CDR) + *data = (tx_preset & QSFP_TX_CDR_SMASK) >> + QSFP_TX_CDR_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_EQ) + *data = (tx_preset & QSFP_TX_EQ_SMASK) >> + QSFP_TX_EQ_SHIFT; + break; + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + default: + break; + } +} + +static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table, + int field, u32 *field_len_bits, + u32 *field_start_bits) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + u32 *src_ptr = NULL; + + if (!pcfgcache->cache_valid) + return -EINVAL; + + switch (table) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + case PLATFORM_CONFIG_PORT_TABLE: + case PLATFORM_CONFIG_RX_PRESET_TABLE: + case PLATFORM_CONFIG_TX_PRESET_TABLE: + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + if (field && field < platform_config_table_limits[table]) + src_ptr = + pcfgcache->config_tables[table].table_metadata + field; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr) + return -EINVAL; + + if (field_start_bits) + *field_start_bits = *src_ptr & + ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + + if (field_len_bits) + *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT) + & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + + return 0; +} + +/* This is the central interface to getting data out of the platform config + * file. It depends on parse_platform_config() having populated the + * platform_config_cache in hfi1_devdata, and checks the cache_valid member to + * validate the sanity of the cache. + * + * The non-obvious parameters: + * @table_index: Acts as a look up key into which instance of the tables the + * relevant field is fetched from. + * + * This applies to the data tables that have multiple instances. The port table + * is an exception to this rule as each HFI only has one port and thus the + * relevant table can be distinguished by hfi_id. + * + * @data: pointer to memory that will be populated with the field requested. + * @len: length of memory pointed by @data in bytes. + */ +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len) +{ + int ret = 0, wlen = 0, seek = 0; + u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + struct hfi1_pportdata *ppd = dd->pport; + + if (data) + memset(data, 0, len); + else + return -EINVAL; + + if (ppd->config_from_scratch) { + /* + * Use saved configuration from ppd for integrated platforms + */ + get_integrated_platform_config_field(dd, table_type, + field_index, data); + return 0; + } + + ret = get_platform_fw_field_metadata(dd, table_type, field_index, + &field_len_bits, + &field_start_bits); + if (ret) + return -EINVAL; + + /* Convert length to bits */ + len *= 8; + + /* Our metadata function checked cache_valid and field_index for us */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) { + if (len < field_len_bits) + return -EINVAL; + + seek = field_start_bits / 8; + wlen = field_len_bits / 8; + + src_ptr = (u32 *)((u8 *)src_ptr + seek); + + /* + * We expect the field to be byte aligned and whole byte + * lengths if we are here + */ + memcpy(data, src_ptr, wlen); + return 0; + } + break; + case PLATFORM_CONFIG_PORT_TABLE: + /* Port table is 4 DWORDS */ + src_ptr = dd->hfi1_id ? + pcfgcache->config_tables[table_type].table + 4 : + pcfgcache->config_tables[table_type].table; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + case PLATFORM_CONFIG_TX_PRESET_TABLE: + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (table_index < + pcfgcache->config_tables[table_type].num_table) + src_ptr += table_index; + else + src_ptr = NULL; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr || len < field_len_bits) + return -EINVAL; + + src_ptr += (field_start_bits / 32); + *data = (*src_ptr >> (field_start_bits % 32)) & + ((1 << field_len_bits) - 1); + + return 0; +} + +/* + * Download the firmware needed for the Gen3 PCIe SerDes. An update + * to the SBus firmware is needed before updating the PCIe firmware. + * + * Note: caller must be holding the SBus resource. + */ +int load_pcie_firmware(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* both firmware loads below use the SBus */ + set_sbus_fast_mode(dd); + + if (fw_sbus_load) { + turn_off_spicos(dd, SPICO_SBUS); + do { + ret = load_sbus_firmware(dd, &fw_sbus); + } while (retry_firmware(dd, ret)); + if (ret) + goto done; + } + + if (fw_pcie_serdes_load) { + dd_dev_info(dd, "Setting PCIe SerDes broadcast\n"); + set_serdes_broadcast(dd, all_pcie_serdes_broadcast, + pcie_serdes_broadcast[dd->hfi1_id], + pcie_serdes_addrs[dd->hfi1_id], + NUM_PCIE_SERDES); + do { + ret = load_pcie_serdes_firmware(dd, &fw_pcie); + } while (retry_firmware(dd, ret)); + if (ret) + goto done; + } + +done: + clear_sbus_fast_mode(dd); + + return ret; +} + +/* + * Read the GUID from the hardware, store it in dd. + */ +void read_guid(struct hfi1_devdata *dd) +{ + /* Take the DC out of reset to get a valid GUID value */ + write_csr(dd, CCE_DC_CTRL, 0); + (void)read_csr(dd, CCE_DC_CTRL); + + dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID); + dd_dev_info(dd, "GUID %llx", + (unsigned long long)dd->base_guid); +} + +/* read and display firmware version info */ +static void dump_fw_version(struct hfi1_devdata *dd) +{ + u32 pcie_vers[NUM_PCIE_SERDES]; + u32 fabric_vers[NUM_FABRIC_SERDES]; + u32 sbus_vers; + int i; + int all_same; + int ret; + u8 rcv_addr; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n"); + return; + } + + /* set fast mode */ + set_sbus_fast_mode(dd); + + /* read version for SBus Master */ + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0); + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1); + dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers); + + /* read version for PCIe SerDes */ + all_same = 1; + pcie_vers[0] = 0; + for (i = 0; i < NUM_PCIE_SERDES; i++) { + rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i]; + sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && pcie_vers[0] != pcie_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n", + pcie_vers[0]); + } else { + dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_PCIE_SERDES; i++) { + dd_dev_info(dd, + "PCIe SerDes lane %d firmware version 0x%x\n", + i, pcie_vers[i]); + } + } + + /* read version for fabric SerDes */ + all_same = 1; + fabric_vers[0] = 0; + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i]; + sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && fabric_vers[0] != fabric_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n", + fabric_vers[0]); + } else { + dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + dd_dev_info(dd, + "Fabric SerDes lane %d firmware version 0x%x\n", + i, fabric_vers[i]); + } + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h new file mode 100644 index 0000000000..38772e52d7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -0,0 +1,2646 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2020-2023 Cornelis Networks, Inc. + * Copyright(c) 2015-2020 Intel Corporation. + */ + +#ifndef _HFI1_KERNEL_H +#define _HFI1_KERNEL_H + +#include <linux/refcount.h> +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/scatterlist.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/kref.h> +#include <linux/sched.h> +#include <linux/cdev.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/i2c.h> +#include <linux/i2c-algo-bit.h> +#include <linux/xarray.h> +#include <rdma/ib_hdrs.h> +#include <rdma/opa_addr.h> +#include <linux/rhashtable.h> +#include <rdma/rdma_vt.h> + +#include "chip_registers.h" +#include "common.h" +#include "opfn.h" +#include "verbs.h" +#include "pio.h" +#include "chip.h" +#include "mad.h" +#include "qsfp.h" +#include "platform.h" +#include "affinity.h" +#include "msix.h" + +/* bumped 1 from s/w major version of TrueScale */ +#define HFI1_CHIP_VERS_MAJ 3U + +/* don't care about this except printing */ +#define HFI1_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define HFI1_OUI 0x001175 +#define HFI1_OUI_LSB 40 + +#define DROP_PACKET_OFF 0 +#define DROP_PACKET_ON 1 + +#define NEIGHBOR_TYPE_HFI 0 +#define NEIGHBOR_TYPE_SWITCH 1 + +#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 + +extern unsigned long hfi1_cap_mask; +#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) +#define HFI1_CAP_UGET_MASK(mask, cap) \ + (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap) +#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap)) +#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap)) +#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \ + HFI1_CAP_MISC_MASK) +/* Offline Disabled Reason is 4-bits */ +#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON) + +/* + * Control context is always 0 and handles the error packets. + * It also handles the VL15 and multicast packets. + */ +#define HFI1_CTRL_CTXT 0 + +/* + * Driver context will store software counters for each of the events + * associated with these status registers + */ +#define NUM_CCE_ERR_STATUS_COUNTERS 41 +#define NUM_RCV_ERR_STATUS_COUNTERS 64 +#define NUM_MISC_ERR_STATUS_COUNTERS 13 +#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36 +#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4 +#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64 +#define NUM_SEND_ERR_STATUS_COUNTERS 3 +#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5 +#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted hfi1_statnames[] in debugfs.c must + * change to match. + */ +struct hfi1_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct hfi1_ib_stats hfi1_stats; +extern const struct pci_error_handlers hfi1_pci_err_handler; + +extern int num_driver_cntrs; + +/* + * First-cut criterion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Below contains all data related to a single context (formerly called port). + */ + +struct hfi1_opcode_stats_perctx; + +struct ctxt_eager_bufs { + struct eager_buffer { + void *addr; + dma_addr_t dma; + ssize_t len; + } *buffers; + struct { + void *addr; + dma_addr_t dma; + } *rcvtids; + u32 size; /* total size of eager buffers */ + u32 rcvtid_size; /* size of each eager rcv tid */ + u16 count; /* size of buffers array */ + u16 numbufs; /* number of buffers allocated */ + u16 alloced; /* number of rcvarray entries used */ + u16 threshold; /* head update threshold */ +}; + +struct exp_tid_set { + struct list_head list; + u32 count; +}; + +struct hfi1_ctxtdata; +typedef int (*intr_handler)(struct hfi1_ctxtdata *rcd, int data); +typedef void (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +struct tid_queue { + struct list_head queue_head; + /* queue head for QP TID resource waiters */ + u32 enqueue; /* count of tid enqueues */ + u32 dequeue; /* count of tid dequeues */ +}; + +struct hfi1_ctxtdata { + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + volatile __le64 *rcvhdrtail_kvaddr; + /* so functions that need physical port can get it easily */ + struct hfi1_pportdata *ppd; + /* so file ops can get at unit */ + struct hfi1_devdata *dd; + /* this receive context's assigned PIO ACK send context */ + struct send_context *sc; + /* per context recv functions */ + const rhf_rcv_function_ptr *rhf_rcv_function_map; + /* + * The interrupt handler for a particular receive context can vary + * throughout it's lifetime. This is not a lock protected data member so + * it must be updated atomically and the prev and new value must always + * be valid. Worst case is we process an extra interrupt and up to 64 + * packets with the wrong interrupt handler. + */ + intr_handler do_interrupt; + /** fast handler after autoactive */ + intr_handler fast_handler; + /** slow handler */ + intr_handler slow_handler; + /* napi pointer assiociated with netdev */ + struct napi_struct *napi; + /* verbs rx_stats per rcd */ + struct hfi1_opcode_stats_perctx *opstats; + /* clear interrupt mask */ + u64 imask; + /* ctxt rcvhdrq head offset */ + u32 head; + /* number of rcvhdrq entries */ + u16 rcvhdrq_cnt; + u8 ireg; /* clear interrupt register */ + /* receive packet sequence counter */ + u8 seq_cnt; + /* size of each of the rcvhdrq entries */ + u8 rcvhdrqentsize; + /* offset of RHF within receive header entry */ + u8 rhf_offset; + /* dynamic receive available interrupt timeout */ + u8 rcvavail_timeout; + /* Indicates that this is vnic context */ + bool is_vnic; + /* vnic queue index this context is mapped to */ + u8 vnic_q_idx; + /* Is ASPM interrupt supported for this context */ + bool aspm_intr_supported; + /* ASPM state (enabled/disabled) for this context */ + bool aspm_enabled; + /* Is ASPM processing enabled for this context (in intr context) */ + bool aspm_intr_enable; + struct ctxt_eager_bufs egrbufs; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; + /* tid allocation lists */ + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + + /* Timer for re-enabling ASPM if interrupt activity quiets down */ + struct timer_list aspm_timer; + /* per-context configuration flags */ + unsigned long flags; + /* array of tid_groups */ + struct tid_group *groups; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_dma; + dma_addr_t rcvhdrqtailaddr_dma; + /* Last interrupt timestamp */ + ktime_t aspm_ts_last_intr; + /* Last timestamp at which we scheduled a timer for this context */ + ktime_t aspm_ts_timer_sched; + /* Lock to serialize between intr, timer intr and user threads */ + spinlock_t aspm_lock; + /* Reference count the base context usage */ + struct kref kref; + /* numa node of this context */ + int numa_id; + /* associated msix interrupt. */ + s16 msix_intr; + /* job key */ + u16 jkey; + /* number of RcvArray groups for this context. */ + u16 rcv_array_groups; + /* index of first eager TID entry. */ + u16 eager_base; + /* number of expected TID entries */ + u16 expected_count; + /* index of first expected TID entry. */ + u16 expected_base; + /* Device context index */ + u8 ctxt; + + /* PSM Specific fields */ + /* lock protecting all Expected TID data */ + struct mutex exp_mutex; + /* lock protecting all Expected TID data of kernel contexts */ + spinlock_t exp_lock; + /* Queue for QP's waiting for HW TID flows */ + struct tid_queue flow_queue; + /* Queue for QP's waiting for HW receive array entries */ + struct tid_queue rarr_queue; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* uuid from PSM */ + u8 uuid[16]; + /* same size as task_struct .comm[], command that opened context */ + char comm[TASK_COMM_LEN]; + /* Bitmask of in use context(s) */ + DECLARE_BITMAP(in_use_ctxts, HFI1_MAX_SHARED_CTXTS); + /* per-context event flags for fileops/intr communication */ + unsigned long event_flags; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* + * non-zero if ctxt can be shared, and defines the maximum number of + * sub-contexts for this device context. + */ + u8 subctxt_cnt; + + /* Bit mask to track free TID RDMA HW flows */ + unsigned long flow_mask; + struct tid_flow_state flows[RXE_NUM_TID_FLOWS]; +}; + +/** + * rcvhdrq_size - return total size in bytes for header queue + * @rcd: the receive context + * + * rcvhdrqentsize is in DWs, so we have to convert to bytes + * + */ +static inline u32 rcvhdrq_size(struct hfi1_ctxtdata *rcd) +{ + return PAGE_ALIGN(rcd->rcvhdrq_cnt * + rcd->rcvhdrqentsize * sizeof(u32)); +} + +/* + * Represents a single packet at a high level. Put commonly computed things in + * here so we do not have to keep doing them over and over. The rule of thumb is + * if something is used one time to derive some value, store that something in + * here. If it is used multiple times, then store the result of that derivation + * in here. + */ +struct hfi1_packet { + void *ebuf; + void *hdr; + void *payload; + struct hfi1_ctxtdata *rcd; + __le32 *rhf_addr; + struct rvt_qp *qp; + struct ib_other_headers *ohdr; + struct ib_grh *grh; + struct opa_16b_mgmt *mgmt; + u64 rhf; + u32 maxcnt; + u32 rhqoff; + u32 dlid; + u32 slid; + int numpkt; + u16 tlen; + s16 etail; + u16 pkey; + u8 hlen; + u8 rsize; + u8 updegr; + u8 etype; + u8 extra_byte; + u8 pad; + u8 sc; + u8 sl; + u8 opcode; + bool migrated; +}; + +/* Packet types */ +#define HFI1_PKT_TYPE_9B 0 +#define HFI1_PKT_TYPE_16B 1 + +/* + * OPA 16B Header + */ +#define OPA_16B_L4_MASK 0xFFull +#define OPA_16B_SC_MASK 0x1F00000ull +#define OPA_16B_SC_SHIFT 20 +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_SHIFT 20 +#define OPA_16B_DLID_HIGH_SHIFT 12 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_SLID_SHIFT 20 +#define OPA_16B_SLID_HIGH_SHIFT 8 +#define OPA_16B_BECN_MASK 0x80000000ull +#define OPA_16B_BECN_SHIFT 31 +#define OPA_16B_FECN_MASK 0x10000000ull +#define OPA_16B_FECN_SHIFT 28 +#define OPA_16B_L2_MASK 0x60000000ull +#define OPA_16B_L2_SHIFT 29 +#define OPA_16B_PKEY_MASK 0xFFFF0000ull +#define OPA_16B_PKEY_SHIFT 16 +#define OPA_16B_LEN_MASK 0x7FF00000ull +#define OPA_16B_LEN_SHIFT 20 +#define OPA_16B_RC_MASK 0xE000000ull +#define OPA_16B_RC_SHIFT 25 +#define OPA_16B_AGE_MASK 0xFF0000ull +#define OPA_16B_AGE_SHIFT 16 +#define OPA_16B_ENTROPY_MASK 0xFFFFull + +/* + * OPA 16B L2/L4 Encodings + */ +#define OPA_16B_L4_9B 0x00 +#define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_FM 0x08 +#define OPA_16B_L4_IB_LOCAL 0x09 +#define OPA_16B_L4_IB_GLOBAL 0x0A +#define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR + +/* + * OPA 16B Management + */ +#define OPA_16B_L4_FM_PAD 3 /* fixed 3B pad */ +#define OPA_16B_L4_FM_HLEN 24 /* 16B(16) + L4_FM(8) */ + +static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) +{ + return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); +} + +static inline u8 hfi1_16B_get_sc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_SC_MASK) >> OPA_16B_SC_SHIFT); +} + +static inline u32 hfi1_16B_get_dlid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[1] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_DLID_MASK) >> + OPA_16B_DLID_HIGH_SHIFT) << OPA_16B_DLID_SHIFT)); +} + +static inline u32 hfi1_16B_get_slid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[0] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_SLID_MASK) >> + OPA_16B_SLID_HIGH_SHIFT) << OPA_16B_SLID_SHIFT)); +} + +static inline u8 hfi1_16B_get_becn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[0] & OPA_16B_BECN_MASK) >> OPA_16B_BECN_SHIFT); +} + +static inline u8 hfi1_16B_get_fecn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_FECN_MASK) >> OPA_16B_FECN_SHIFT); +} + +static inline u8 hfi1_16B_get_l2(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_L2_MASK) >> OPA_16B_L2_SHIFT); +} + +static inline u16 hfi1_16B_get_pkey(struct hfi1_16b_header *hdr) +{ + return (u16)((hdr->lrh[2] & OPA_16B_PKEY_MASK) >> OPA_16B_PKEY_SHIFT); +} + +static inline u8 hfi1_16B_get_rc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_RC_MASK) >> OPA_16B_RC_SHIFT); +} + +static inline u8 hfi1_16B_get_age(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[3] & OPA_16B_AGE_MASK) >> OPA_16B_AGE_SHIFT); +} + +static inline u16 hfi1_16B_get_len(struct hfi1_16b_header *hdr) +{ + return (u16)((hdr->lrh[0] & OPA_16B_LEN_MASK) >> OPA_16B_LEN_SHIFT); +} + +static inline u16 hfi1_16B_get_entropy(struct hfi1_16b_header *hdr) +{ + return (u16)(hdr->lrh[3] & OPA_16B_ENTROPY_MASK); +} + +#define OPA_16B_MAKE_QW(low_dw, high_dw) (((u64)(high_dw) << 32) | (low_dw)) + +/* + * BTH + */ +#define OPA_16B_BTH_PAD_MASK 7 +static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + OPA_16B_BTH_PAD_MASK); +} + +/* + * 16B Management + */ +#define OPA_16B_MGMT_QPN_MASK 0xFFFFFF +static inline u32 hfi1_16B_get_dest_qpn(struct opa_16b_mgmt *mgmt) +{ + return be32_to_cpu(mgmt->dest_qpn) & OPA_16B_MGMT_QPN_MASK; +} + +static inline u32 hfi1_16B_get_src_qpn(struct opa_16b_mgmt *mgmt) +{ + return be32_to_cpu(mgmt->src_qpn) & OPA_16B_MGMT_QPN_MASK; +} + +static inline void hfi1_16B_set_qpn(struct opa_16b_mgmt *mgmt, + u32 dest_qp, u32 src_qp) +{ + mgmt->dest_qpn = cpu_to_be32(dest_qp & OPA_16B_MGMT_QPN_MASK); + mgmt->src_qpn = cpu_to_be32(src_qp & OPA_16B_MGMT_QPN_MASK); +} + +/** + * hfi1_get_rc_ohdr - get extended header + * @opah - the opaheader + */ +static inline struct ib_other_headers * +hfi1_get_rc_ohdr(struct hfi1_opa_header *opah) +{ + struct ib_other_headers *ohdr; + struct ib_header *hdr = NULL; + struct hfi1_16b_header *hdr_16b = NULL; + + /* Find out where the BTH is */ + if (opah->hdr_type == HFI1_PKT_TYPE_9B) { + hdr = &opah->ibh; + if (ib_get_lnh(hdr) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + } else { + u8 l4; + + hdr_16b = &opah->opah; + l4 = hfi1_16B_get_l4(hdr_16b); + if (l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr_16b->u.oth; + else + ohdr = &hdr_16b->u.l.oth; + } + return ohdr; +} + +struct rvt_sge_state; + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */ +#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */ +#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define HFI1_IB_CFG_SPD 5 /* current Link spd */ +#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */ +#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */ +#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */ +#define HFI1_IB_CFG_VL_HIGH_LIMIT 19 +#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * HFI or Host Link States + * + * These describe the states the driver thinks the logical and physical + * states are in. Used as an argument to set_link_state(). Implemented + * as bits for easy multi-state checking. The actual state can only be + * one. + */ +#define __HLS_UP_INIT_BP 0 +#define __HLS_UP_ARMED_BP 1 +#define __HLS_UP_ACTIVE_BP 2 +#define __HLS_DN_DOWNDEF_BP 3 /* link down default */ +#define __HLS_DN_POLL_BP 4 +#define __HLS_DN_DISABLE_BP 5 +#define __HLS_DN_OFFLINE_BP 6 +#define __HLS_VERIFY_CAP_BP 7 +#define __HLS_GOING_UP_BP 8 +#define __HLS_GOING_OFFLINE_BP 9 +#define __HLS_LINK_COOLDOWN_BP 10 + +#define HLS_UP_INIT BIT(__HLS_UP_INIT_BP) +#define HLS_UP_ARMED BIT(__HLS_UP_ARMED_BP) +#define HLS_UP_ACTIVE BIT(__HLS_UP_ACTIVE_BP) +#define HLS_DN_DOWNDEF BIT(__HLS_DN_DOWNDEF_BP) /* link down default */ +#define HLS_DN_POLL BIT(__HLS_DN_POLL_BP) +#define HLS_DN_DISABLE BIT(__HLS_DN_DISABLE_BP) +#define HLS_DN_OFFLINE BIT(__HLS_DN_OFFLINE_BP) +#define HLS_VERIFY_CAP BIT(__HLS_VERIFY_CAP_BP) +#define HLS_GOING_UP BIT(__HLS_GOING_UP_BP) +#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP) +#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP) + +#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) +#define HLS_DOWN ~(HLS_UP) + +#define HLS_DEFAULT HLS_DN_POLL + +/* use this MTU size if none other is given */ +#define HFI1_DEFAULT_ACTIVE_MTU 10240 +/* use this MTU size as the default maximum */ +#define HFI1_DEFAULT_MAX_MTU 10240 +/* default partition key */ +#define DEFAULT_PKEY 0xffff + +/* + * Possible fabric manager config parameters for fm_{get,set}_table() + */ +#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */ +#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */ +#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */ +#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */ +#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */ +#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. + * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB + */ +#define HFI1_RCVCTRL_TAILUPD_ENB 0x01 +#define HFI1_RCVCTRL_TAILUPD_DIS 0x02 +#define HFI1_RCVCTRL_CTXT_ENB 0x04 +#define HFI1_RCVCTRL_CTXT_DIS 0x08 +#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10 +#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20 +#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define HFI1_RCVCTRL_PKEY_DIS 0x80 +#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400 +#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800 +#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000 +#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 +#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 +#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 +#define HFI1_RCVCTRL_URGENT_ENB 0x40000 +#define HFI1_RCVCTRL_URGENT_DIS 0x80000 + +/* partition enforcement flags */ +#define HFI1_PART_ENFORCE_IN 0x1 +#define HFI1_PART_ENFORCE_OUT 0x2 + +/* how often we check for synthetic counter wrap around */ +#define SYNTH_CNT_TIME 3 + +/* Counter flags */ +#define CNTR_NORMAL 0x0 /* Normal counters, just read register */ +#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */ +#define CNTR_DISABLED 0x2 /* Disable this counter */ +#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */ +#define CNTR_VL 0x8 /* Per VL counter */ +#define CNTR_SDMA 0x10 +#define CNTR_INVALID_VL -1 /* Specifies invalid VL */ +#define CNTR_MODE_W 0x0 +#define CNTR_MODE_R 0x1 + +/* VLs Supported/Operational */ +#define HFI1_MIN_VLS_SUPPORTED 1 +#define HFI1_MAX_VLS_SUPPORTED 8 + +#define HFI1_GUIDS_PER_PORT 5 +#define HFI1_PORT_GUID_INDEX 0 + +static inline void incr_cntr64(u64 *cntr) +{ + if (*cntr < (u64)-1LL) + (*cntr)++; +} + +#define MAX_NAME_SIZE 64 +struct hfi1_msix_entry { + enum irq_type type; + int irq; + void *arg; + cpumask_t mask; + struct irq_affinity_notify notify; +}; + +struct hfi1_msix_info { + /* lock to synchronize in_use_msix access */ + spinlock_t msix_lock; + DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS); + struct hfi1_msix_entry *msix_entries; + u16 max_requested; +}; + +/* per-SL CCA information */ +struct cca_timer { + struct hrtimer hrtimer; + struct hfi1_pportdata *ppd; /* read-only */ + int sl; /* read-only */ + u16 ccti; /* read/write - current value of CCTI */ +}; + +struct link_down_reason { + /* + * SMA-facing value. Should be set from .latest when + * HLS_UP_* -> HLS_DN_* transition actually occurs. + */ + u8 sma; + u8 latest; +}; + +enum { + LO_PRIO_TABLE, + HI_PRIO_TABLE, + MAX_PRIO_TABLE +}; + +struct vl_arb_cache { + /* protect vl arb cache */ + spinlock_t lock; + struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE]; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct hfi1_pportdata { + struct hfi1_ibport ibport_data; + + struct hfi1_devdata *dd; + + /* PHY support */ + struct qsfp_data qsfp_info; + /* Values for SI tuning of SerDes */ + u32 port_type; + u32 tx_preset_eq; + u32 tx_preset_noeq; + u32 rx_preset; + u8 local_atten; + u8 remote_atten; + u8 default_atten; + u8 max_power_class; + + /* did we read platform config from scratch registers? */ + bool config_from_scratch; + + /* GUIDs for this interface, in host order, guids[0] is a port guid */ + u64 guids[HFI1_GUIDS_PER_PORT]; + + /* GUID for peer interface, in host order */ + u64 neighbor_guid; + + /* up or down physical link state */ + u32 linkup; + + /* + * this address is mapped read-only into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + struct workqueue_struct *hfi1_wq; + struct workqueue_struct *link_wq; + + /* move out of interrupt context */ + struct work_struct link_vc_work; + struct work_struct link_up_work; + struct work_struct link_down_work; + struct work_struct sma_message_work; + struct work_struct freeze_work; + struct work_struct link_downgrade_work; + struct work_struct link_bounce_work; + struct delayed_work start_link_work; + /* host link state variables */ + struct mutex hls_lock; + u32 host_link_state; + + /* these are the "32 bit" regs */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + u32 current_egress_rate; /* units [10^6 bits/sec] */ + /* LID programmed for this instance */ + u32 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[MAX_PKEY_VALUES]; + u16 link_width_supported; + u16 link_width_downgrade_supported; + u16 link_speed_supported; + u16 link_width_enabled; + u16 link_width_downgrade_enabled; + u16 link_speed_enabled; + u16 link_width_active; + u16 link_width_downgrade_tx_active; + u16 link_width_downgrade_rx_active; + u16 link_speed_active; + u8 vls_supported; + u8 vls_operational; + u8 actual_vls_operational; + /* LID mask control */ + u8 lmc; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u32 port; /* IB port number and index into dd->pports - 1 */ + /* type of neighbor node */ + u8 neighbor_type; + u8 neighbor_normal; + u8 neighbor_fm_security; /* 1 if firmware checking is disabled */ + u8 neighbor_port_number; + u8 is_sm_config_started; + u8 offline_disabled_reason; + u8 is_active_optimize_enabled; + u8 driver_link_ready; /* driver ready for active link */ + u8 link_enabled; /* link enabled? */ + u8 linkinit_reason; + u8 local_tx_rate; /* rate given to 8051 firmware */ + u8 qsfp_retry_count; + + /* placeholders for IB MAD packet settings */ + u8 overrun_threshold; + u8 phy_error_threshold; + unsigned int is_link_down_queued; + + /* Used to override LED behavior for things like maintenance beaconing*/ + /* + * Alternates per phase of blink + * [0] holds LED off duration, [1] holds LED on duration + */ + unsigned long led_override_vals[2]; + u8 led_override_phase; /* LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + + u32 sm_trap_qp; + u32 sa_qp; + + /* + * cca_timer_lock protects access to the per-SL cca_timer + * structures (specifically the ccti member). + */ + spinlock_t cca_timer_lock ____cacheline_aligned_in_smp; + struct cca_timer cca_timer[OPA_MAX_SLS]; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX]; + + /* congestion entries, each entry corresponding to a SL */ + struct opa_congestion_setting_entry_shadow + congestion_entries[OPA_MAX_SLS]; + + /* + * cc_state_lock protects (write) access to the per-port + * struct cc_state. + */ + spinlock_t cc_state_lock ____cacheline_aligned_in_smp; + + struct cc_state __rcu *cc_state; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u32 cc_sl_control_map; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; + + /* + * begin congestion log related entries + * cc_log_lock protects all congestion log related data + */ + spinlock_t cc_log_lock ____cacheline_aligned_in_smp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + u16 threshold_event_counter; + struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS]; + int cc_log_idx; /* index for logging events */ + int cc_mad_idx; /* index for reporting events */ + /* end congestion log related entries */ + + struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE]; + + /* port relative counter buffer */ + u64 *cntrs; + /* port relative synthetic counter buffer */ + u64 *scntrs; + /* port_xmit_discards are synthesized from different egress errors */ + u64 port_xmit_discards; + u64 port_xmit_discards_vl[C_VL_COUNT]; + u64 port_xmit_constraint_errors; + u64 port_rcv_constraint_errors; + /* count of 'link_err' interrupts from DC */ + u64 link_downed; + /* number of times link retrained successfully */ + u64 link_up; + /* number of times a link unknown frame was reported */ + u64 unknown_frame_count; + /* port_ltp_crc_mode is returned in 'portinfo' MADs */ + u16 port_ltp_crc_mode; + /* port_crc_mode_enabled is the crc we support */ + u8 port_crc_mode_enabled; + /* mgmt_allowed is also returned in 'portinfo' MADs */ + u8 mgmt_allowed; + u8 part_enforce; /* partition enforcement flags */ + struct link_down_reason local_link_down_reason; + struct link_down_reason neigh_link_down_reason; + /* Value to be sent to link peer on LinkDown .*/ + u8 remote_link_down_reason; + /* Error events that will cause a port bounce. */ + u32 port_error_action; + struct work_struct linkstate_active_work; + /* Does this port need to prescan for FECNs */ + bool cc_prescan; + /* + * Sample sendWaitCnt & sendWaitVlCnt during link transition + * and counter request. + */ + u64 port_vl_xmit_wait_last[C_VL_COUNT + 1]; + u16 prev_link_width; + u64 vl_xmit_flit_cnt[C_VL_COUNT + 1]; +}; + +typedef void (*opcode_handler)(struct hfi1_packet *packet); +typedef void (*hfi1_make_req)(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); +extern const rhf_rcv_function_ptr normal_rhf_rcv_functions[]; +extern const rhf_rcv_function_ptr netdev_rhf_rcv_functions[]; + +/* return values for the RHF receive functions */ +#define RHF_RCV_CONTINUE 0 /* keep going */ +#define RHF_RCV_DONE 1 /* stop, this packet processed */ +#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */ + +struct rcv_array_data { + u16 ngroups; + u16 nctxt_extra; + u8 group_size; +}; + +struct per_vl_data { + u16 mtu; + struct send_context *sc; +}; + +/* 16 to directly index */ +#define PER_VL_SEND_CONTEXTS 16 + +struct err_info_rcvport { + u8 status_and_code; + u64 packet_flit1; + u64 packet_flit2; +}; + +struct err_info_constraint { + u8 status; + u16 pkey; + u32 slid; +}; + +struct hfi1_temp { + unsigned int curr; /* current temperature */ + unsigned int lo_lim; /* low temperature limit */ + unsigned int hi_lim; /* high temperature limit */ + unsigned int crit_lim; /* critical temperature limit */ + u8 triggers; /* temperature triggers */ +}; + +struct hfi1_i2c_bus { + struct hfi1_devdata *controlling_dd; /* current controlling device */ + struct i2c_adapter adapter; /* bus details */ + struct i2c_algo_bit_data algo; /* bus algorithm details */ + int num; /* bus number, 0 or 1 */ +}; + +/* common data between shared ASIC HFIs */ +struct hfi1_asic_data { + struct hfi1_devdata *dds[2]; /* back pointers */ + struct mutex asic_resource_mutex; + struct hfi1_i2c_bus *i2c_bus0; + struct hfi1_i2c_bus *i2c_bus1; +}; + +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 +#define NUM_MAP_REGS 32 + +/* Virtual NIC information */ +struct hfi1_vnic_data { + struct kmem_cache *txreq_cache; + u8 num_vports; +}; + +struct hfi1_vnic_vport_info; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a hfi1_pportdata struct. + */ +struct sdma_engine; +struct sdma_vl_map; + +#define BOARD_VERS_MAX 96 /* how long the version string can be */ +#define SERIAL_MAX 16 /* length of the serial number */ + +typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64); +struct hfi1_netdev_rx; +struct hfi1_devdata { + struct hfi1_ibdev verbs_dev; /* must be first */ + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev user_cdev; + struct cdev diag_cdev; + struct cdev ui_cdev; + struct device *user_device; + struct device *diag_device; + struct device *ui_device; + + /* first mapping up to RcvArray */ + u8 __iomem *kregbase1; + resource_size_t physaddr; + + /* second uncached mapping from RcvArray to pio send buffers */ + u8 __iomem *kregbase2; + /* for detecting offset above kregbase2 address */ + u32 base2_start; + + /* Per VL data. Enough for all VLs but not all elements are set/used. */ + struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; + /* send context data */ + struct send_context_info *send_contexts; + /* map hardware send contexts to software index */ + u8 *hw_to_sw; + /* spinlock for allocating and releasing send context resources */ + spinlock_t sc_lock; + /* lock for pio_map */ + spinlock_t pio_map_lock; + /* Send Context initialization lock. */ + spinlock_t sc_init_lock; + /* lock for sdma_map */ + spinlock_t sde_map_lock; + /* array of kernel send contexts */ + struct send_context **kernel_send_context; + /* array of vl maps */ + struct pio_vl_map __rcu *pio_map; + /* default flags to last descriptor */ + u64 default_desc1; + + /* fields common to all SDMA engines */ + + volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */ + dma_addr_t sdma_heads_phys; + void *sdma_pad_dma; /* DMA'ed by chip */ + dma_addr_t sdma_pad_phys; + /* for deallocation */ + size_t sdma_heads_size; + /* num used */ + u32 num_sdma; + /* array of engines sized by num_sdma */ + struct sdma_engine *per_sdma; + /* array of vl maps */ + struct sdma_vl_map __rcu *sdma_map; + /* SPC freeze waitqueue and variable */ + wait_queue_head_t sdma_unfreeze_wq; + atomic_t sdma_unfreeze_count; + + u32 lcb_access_count; /* count of LCB users */ + + /* common data between shared ASIC HFIs in this OS */ + struct hfi1_asic_data *asic_data; + + /* mem-mapped pointer to base of PIO buffers */ + void __iomem *piobase; + /* + * write-combining mem-mapped pointer to base of RcvArray + * memory. + */ + void __iomem *rcvarray_wc; + /* + * credit return base - a per-NUMA range of DMA address that + * the chip will use to update the per-context free counter + */ + struct credit_return_base *cr_base; + + /* send context numbers and sizes for each type */ + struct sc_config_sizes sc_sizes[SC_MAX]; + + char *boardname; /* human readable board info */ + + u64 ctx0_seq_drop; + + /* reset value */ + u64 z_int_counter; + u64 z_rcv_limit; + u64 z_send_schedule; + + u64 __percpu *send_schedule; + /* number of reserved contexts for netdev usage */ + u16 num_netdev_contexts; + /* number of receive contexts in use by the driver */ + u32 num_rcv_contexts; + /* number of pio send contexts in use by the driver */ + u32 num_send_contexts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + /* total number of available user/PSM contexts */ + u32 num_user_contexts; + /* base receive interrupt timeout, in CSR units */ + u32 rcv_intr_timeout_csr; + + spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ + spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ + spinlock_t uctxt_lock; /* protect rcd changes */ + struct mutex dc8051_lock; /* exclusive access to 8051 */ + struct workqueue_struct *update_cntr_wq; + struct work_struct update_cntr_work; + /* exclusive access to 8051 memory */ + spinlock_t dc8051_memlock; + int dc8051_timed_out; /* remember if the 8051 timed out */ + /* + * A page that will hold event notification bitmaps for all + * contexts. This page will be mapped into all processes. + */ + unsigned long *events; + /* + * per unit status, see also portdata statusp + * mapped read-only into user processes so they can get unit and + * IB link status cheaply + */ + struct hfi1_status *status; + + /* revision register shadow */ + u64 revision; + /* Base GUID for device (network order) */ + u64 base_guid; + + /* both sides of the PCIe link are gen3 capable */ + u8 link_gen3_capable; + u8 dc_shutdown; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + int node; /* home node of this chip */ + + /* save these PCI fields to restore after a reset */ + u32 pcibar0; + u32 pcibar1; + u32 pci_rom; + u16 pci_command; + u16 pcie_devctl; + u16 pcie_lnkctl; + u16 pcie_devctl2; + u32 pci_msix0; + u32 pci_tph2; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer serial number format + */ + u8 serial[SERIAL_MAX]; + /* human readable board version */ + u8 boardversion[BOARD_VERS_MAX]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from CceRevision */ + u8 majrev; + /* chip minor rev, from CceRevision */ + u8 minrev; + /* hardware ID */ + u8 hfi1_id; + /* implementation code */ + u8 icode; + /* vAU of this device */ + u8 vau; + /* vCU of this device */ + u8 vcu; + /* link credits of this device */ + u16 link_credits; + /* initial vl15 credits to use */ + u16 vl15_init; + + /* + * Cached value for vl15buf, read during verify cap interrupt. VL15 + * credits are to be kept at 0 and set when handling the link-up + * interrupt. This removes the possibility of receiving VL15 MAD + * packets before this HFI is ready. + */ + u16 vl15buf_cached; + + /* Misc small ints */ + u8 n_krcv_queues; + u8 qos_shift; + + u16 irev; /* implementation revision */ + u32 dc8051_ver; /* 8051 firmware version */ + + spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */ + struct platform_config platform_config; + struct platform_config_cache pcfg_cache; + + struct diag_client *diag_client; + + /* general interrupt: mask of handled interrupts */ + u64 gi_mask[CCE_NUM_INT_CSRS]; + + struct rcv_array_data rcv_entries; + + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + + /* + * 64 bit synthetic counters + */ + struct timer_list synth_stats_timer; + + /* MSI-X information */ + struct hfi1_msix_info msix_info; + + /* + * device counters + */ + char *cntrnames; + size_t cntrnameslen; + size_t ndevcntrs; + u64 *cntrs; + u64 *scntrs; + + /* + * remembered values for synthetic counters + */ + u64 last_tx; + u64 last_rx; + + /* + * per-port counters + */ + size_t nportcntrs; + char *portcntrnames; + size_t portcntrnameslen; + + struct err_info_rcvport err_info_rcvport; + struct err_info_constraint err_info_rcv_constraint; + struct err_info_constraint err_info_xmit_constraint; + + atomic_t drop_packet; + bool do_drop; + u8 err_info_uncorrectable; + u8 err_info_fmconfig; + + /* + * Software counters for the status bits defined by the + * associated error status registers + */ + u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS]; + u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS]; + u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS]; + u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS]; + u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS]; + u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS]; + u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS]; + + /* Software counter that spans all contexts */ + u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS]; + /* Software counter that spans all DMA engines */ + u64 sw_send_dma_eng_err_status_cnt[ + NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS]; + /* Software counter that aggregates all cce_err_status errors */ + u64 sw_cce_err_status_aggregate; + /* Software counter that aggregates all bypass packet rcv errors */ + u64 sw_rcv_bypass_packet_errors; + + /* Save the enabled LCB error bits */ + u64 lcb_err_en; + struct cpu_mask_set *comp_vect; + int *comp_vect_mappings; + u32 comp_vect_possible_cpus; + + /* + * Capability to have different send engines simply by changing a + * pointer value. + */ + send_routine process_pio_send ____cacheline_aligned_in_smp; + send_routine process_dma_send; + void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); + int (*process_vnic_dma_send)(struct hfi1_devdata *dd, u8 q_idx, + struct hfi1_vnic_vport_info *vinfo, + struct sk_buff *skb, u64 pbc, u8 plen); + /* hfi1_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct hfi1_pportdata *pport; + /* receive context data */ + struct hfi1_ctxtdata **rcd; + u64 __percpu *int_counter; + /* verbs tx opcode stats */ + struct hfi1_opcode_stats_perctx __percpu *tx_opstats; + /* device (not port) flags, basically device capabilities */ + u16 flags; + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes or VNIC */ + u8 first_dyn_alloc_ctxt; + /* adding a new field here would make it part of this cacheline */ + + /* seqlock for sc2vl */ + seqlock_t sc2vl_lock ____cacheline_aligned_in_smp; + u64 sc2vl[4]; + u64 __percpu *rcv_limit; + /* adding a new field here would make it part of this cacheline */ + + /* OUI comes from the HW. Used everywhere as 3 separate bytes. */ + u8 oui1; + u8 oui2; + u8 oui3; + + /* Timer and counter used to detect RcvBufOvflCnt changes */ + struct timer_list rcverr_timer; + + wait_queue_head_t event_queue; + + /* receive context tail dummy address */ + __le64 *rcvhdrtail_dummy_kvaddr; + dma_addr_t rcvhdrtail_dummy_dma; + + u32 rcv_ovfl_cnt; + /* Serialize ASPM enable/disable between multiple verbs contexts */ + spinlock_t aspm_lock; + /* Number of verbs contexts which have disabled ASPM */ + atomic_t aspm_disabled_cnt; + /* Keeps track of user space clients */ + refcount_t user_refcount; + /* Used to wait for outstanding user space clients before dev removal */ + struct completion user_comp; + + bool eprom_available; /* true if EPROM is available for this device */ + bool aspm_supported; /* Does HW support ASPM */ + bool aspm_enabled; /* ASPM state: enabled/disabled */ + struct rhashtable *sdma_rht; + + /* vnic data */ + struct hfi1_vnic_data vnic; + /* Lock to protect IRQ SRC register access */ + spinlock_t irq_src_lock; + int vnic_num_vports; + struct hfi1_netdev_rx *netdev_rx; + struct hfi1_affinity_node *affinity_entry; + + /* Keeps track of IPoIB RSM rule users */ + atomic_t ipoib_rsm_usr_num; +}; + +/* 8051 firmware version helper */ +#define dc8051_ver(a, b, c) ((a) << 16 | (b) << 8 | (c)) +#define dc8051_ver_maj(a) (((a) & 0xff0000) >> 16) +#define dc8051_ver_min(a) (((a) & 0x00ff00) >> 8) +#define dc8051_ver_patch(a) ((a) & 0x0000ff) + +/* f_put_tid types */ +#define PT_EXPECTED 0 +#define PT_EAGER 1 +#define PT_INVALID_FLUSH 2 +#define PT_INVALID 3 + +struct tid_rb_node; + +/* Private data for file operations */ +struct hfi1_filedata { + struct srcu_struct pq_srcu; + struct hfi1_devdata *dd; + struct hfi1_ctxtdata *uctxt; + struct hfi1_user_sdma_comp_q *cq; + /* update side lock for SRCU */ + spinlock_t pq_rcu_lock; + struct hfi1_user_sdma_pkt_q __rcu *pq; + u16 subctxt; + /* for cpu affinity; -1 if none */ + int rec_cpu_num; + u32 tid_n_pinned; + bool use_mn; + struct tid_rb_node **entry_to_rb; + spinlock_t tid_lock; /* protect tid_[limit,used] counters */ + u32 tid_limit; + u32 tid_used; + u32 *invalid_tids; + u32 invalid_tid_idx; + /* protect invalid_tids array and invalid_tid_idx */ + spinlock_t invalid_lock; +}; + +extern struct xarray hfi1_dev_table; +struct hfi1_devdata *hfi1_lookup(int unit); + +static inline unsigned long uctxt_offset(struct hfi1_ctxtdata *uctxt) +{ + return (uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) * + HFI1_MAX_SHARED_CTXTS; +} + +int hfi1_init(struct hfi1_devdata *dd, int reinit); +int hfi1_count_active_units(void); + +int hfi1_diag_add(struct hfi1_devdata *dd); +void hfi1_diag_remove(struct hfi1_devdata *dd); +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup); + +void handle_user_interrupt(struct hfi1_ctxtdata *rcd); + +int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd); +int hfi1_create_kctxts(struct hfi1_devdata *dd); +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **rcd); +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd); +void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, + struct hfi1_devdata *dd, u8 hw_pidx, u32 port); +void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd); +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd); +struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, + u16 ctxt); +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt); +int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); +int handle_receive_interrupt_napi_fp(struct hfi1_ctxtdata *rcd, int budget); +int handle_receive_interrupt_napi_sp(struct hfi1_ctxtdata *rcd, int budget); +void set_all_slowpath(struct hfi1_devdata *dd); + +extern const struct pci_device_id hfi1_pci_tbl[]; +void hfi1_make_ud_req_9B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +/* receive packet handler dispositions */ +#define RCV_PKT_OK 0x0 /* keep going */ +#define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */ +#define RCV_PKT_DONE 0x2 /* stop, no more packets detected */ + +/** + * hfi1_rcd_head - add accessor for rcd head + * @rcd: the context + */ +static inline u32 hfi1_rcd_head(struct hfi1_ctxtdata *rcd) +{ + return rcd->head; +} + +/** + * hfi1_set_rcd_head - add accessor for rcd head + * @rcd: the context + * @head: the new head + */ +static inline void hfi1_set_rcd_head(struct hfi1_ctxtdata *rcd, u32 head) +{ + rcd->head = head; +} + +/* calculate the current RHF address */ +static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd) +{ + return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->rhf_offset; +} + +/* return DMA_RTAIL configuration */ +static inline bool get_dma_rtail_setting(struct hfi1_ctxtdata *rcd) +{ + return !!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL); +} + +/** + * hfi1_seq_incr_wrap - wrapping increment for sequence + * @seq: the current sequence number + * + * Returns: the incremented seq + */ +static inline u8 hfi1_seq_incr_wrap(u8 seq) +{ + if (++seq > RHF_MAX_SEQ) + seq = 1; + return seq; +} + +/** + * hfi1_seq_cnt - return seq_cnt member + * @rcd: the receive context + * + * Return seq_cnt member + */ +static inline u8 hfi1_seq_cnt(struct hfi1_ctxtdata *rcd) +{ + return rcd->seq_cnt; +} + +/** + * hfi1_set_seq_cnt - return seq_cnt member + * @rcd: the receive context + * + * Return seq_cnt member + */ +static inline void hfi1_set_seq_cnt(struct hfi1_ctxtdata *rcd, u8 cnt) +{ + rcd->seq_cnt = cnt; +} + +/** + * last_rcv_seq - is last + * @rcd: the receive context + * @seq: sequence + * + * return true if last packet + */ +static inline bool last_rcv_seq(struct hfi1_ctxtdata *rcd, u32 seq) +{ + return seq != rcd->seq_cnt; +} + +/** + * rcd_seq_incr - increment context sequence number + * @rcd: the receive context + * @seq: the current sequence number + * + * Returns: true if the this was the last packet + */ +static inline bool hfi1_seq_incr(struct hfi1_ctxtdata *rcd, u32 seq) +{ + rcd->seq_cnt = hfi1_seq_incr_wrap(rcd->seq_cnt); + return last_rcv_seq(rcd, seq); +} + +/** + * get_hdrqentsize - return hdrq entry size + * @rcd: the receive context + */ +static inline u8 get_hdrqentsize(struct hfi1_ctxtdata *rcd) +{ + return rcd->rcvhdrqentsize; +} + +/** + * get_hdrq_cnt - return hdrq count + * @rcd: the receive context + */ +static inline u16 get_hdrq_cnt(struct hfi1_ctxtdata *rcd) +{ + return rcd->rcvhdrq_cnt; +} + +/** + * hfi1_is_slowpath - check if this context is slow path + * @rcd: the receive context + */ +static inline bool hfi1_is_slowpath(struct hfi1_ctxtdata *rcd) +{ + return rcd->do_interrupt == rcd->slow_handler; +} + +/** + * hfi1_is_fastpath - check if this context is fast path + * @rcd: the receive context + */ +static inline bool hfi1_is_fastpath(struct hfi1_ctxtdata *rcd) +{ + if (rcd->ctxt == HFI1_CTRL_CTXT) + return false; + + return rcd->do_interrupt == rcd->fast_handler; +} + +/** + * hfi1_set_fast - change to the fast handler + * @rcd: the receive context + */ +static inline void hfi1_set_fast(struct hfi1_ctxtdata *rcd) +{ + if (unlikely(!rcd)) + return; + if (unlikely(!hfi1_is_fastpath(rcd))) + rcd->do_interrupt = rcd->fast_handler; +} + +int hfi1_reset_device(int); + +void receive_interrupt_work(struct work_struct *work); + +/* extract service channel from header and rhf */ +static inline int hfi1_9B_get_sc5(struct ib_header *hdr, u64 rhf) +{ + return ib_get_sc(hdr) | ((!!(rhf_dc_info(rhf))) << 4); +} + +#define HFI1_JKEY_WIDTH 16 +#define HFI1_JKEY_MASK (BIT(16) - 1) +#define HFI1_ADMIN_JKEY_RANGE 32 + +/* + * J_KEYs are split and allocated in the following groups: + * 0 - 31 - users with administrator privileges + * 32 - 63 - kernel protocols using KDETH packets + * 64 - 65535 - all other users using KDETH packets + */ +static inline u16 generate_jkey(kuid_t uid) +{ + u16 jkey = from_kuid(current_user_ns(), uid) & HFI1_JKEY_MASK; + + if (capable(CAP_SYS_ADMIN)) + jkey &= HFI1_ADMIN_JKEY_RANGE - 1; + else if (jkey < 64) + jkey |= BIT(HFI1_JKEY_WIDTH - 1); + + return jkey; +} + +/* + * active_egress_rate + * + * returns the active egress rate in units of [10^6 bits/sec] + */ +static inline u32 active_egress_rate(struct hfi1_pportdata *ppd) +{ + u16 link_speed = ppd->link_speed_active; + u16 link_width = ppd->link_width_active; + u32 egress_rate; + + if (link_speed == OPA_LINK_SPEED_25G) + egress_rate = 25000; + else /* assume OPA_LINK_SPEED_12_5G */ + egress_rate = 12500; + + switch (link_width) { + case OPA_LINK_WIDTH_4X: + egress_rate *= 4; + break; + case OPA_LINK_WIDTH_3X: + egress_rate *= 3; + break; + case OPA_LINK_WIDTH_2X: + egress_rate *= 2; + break; + default: + /* assume IB_WIDTH_1X */ + break; + } + + return egress_rate; +} + +/* + * egress_cycles + * + * Returns the number of 'fabric clock cycles' to egress a packet + * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock + * rate is (approximately) 805 MHz, the units of the returned value + * are (1/805 MHz). + */ +static inline u32 egress_cycles(u32 len, u32 rate) +{ + u32 cycles; + + /* + * cycles is: + * + * (length) [bits] / (rate) [bits/sec] + * --------------------------------------------------- + * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec] + */ + + cycles = len * 8; /* bits */ + cycles *= 805; + cycles /= rate; + + return cycles; +} + +void set_link_ipg(struct hfi1_pportdata *ppd); +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, + u32 rqpn, u8 svc_type); +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u16 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh); +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u16 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); +typedef void (*hfi1_handle_cnp)(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u16 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); + +#define PKEY_CHECK_INVALID -1 +int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, + u8 sc5, int8_t s_pkey_index); + +#define PACKET_EGRESS_TIMEOUT 350 +static inline void pause_for_credit_return(struct hfi1_devdata *dd) +{ + /* Pause at least 1us, to ensure chip returns all credits */ + u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000; + + udelay(usec ? usec : 1); +} + +/** + * sc_to_vlt() - reverse lookup sc to vl + * @dd - devdata + * @sc5 - 5 bit sc + */ +static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5) +{ + unsigned seq; + u8 rval; + + if (sc5 >= OPA_MAX_SCS) + return (u8)(0xff); + + do { + seq = read_seqbegin(&dd->sc2vl_lock); + rval = *(((u8 *)dd->sc2vl) + sc5); + } while (read_seqretry(&dd->sc2vl_lock, seq)); + + return rval; +} + +#define PKEY_MEMBER_MASK 0x8000 +#define PKEY_LOW_15_MASK 0x7fff + +/* + * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the ingress partition key table), return 0 + * otherwise. Use the matching criteria for ingress partition keys + * specified in the OPAv1 spec., section 9.10.14. + */ +static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 ment = ent & PKEY_LOW_15_MASK; + + if (mkey == ment) { + /* + * If pkey[15] is clear (limited partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (!(pkey & PKEY_MEMBER_MASK)) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/* + * ingress_pkey_table_search - search the entire pkey table for + * an entry which matches 'pkey'. return 0 if a match is found, + * and 1 otherwise. + */ +static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) +{ + int i; + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } + return 1; +} + +/* + * ingress_pkey_table_fail - record a failure of ingress pkey validation, + * i.e., increment port_rcv_constraint_errors for the port, and record + * the 'error info' for this failure. + */ +static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, + u32 slid) +{ + struct hfi1_devdata *dd = ppd->dd; + + incr_cntr64(&ppd->port_rcv_constraint_errors); + if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) { + dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK; + dd->err_info_rcv_constraint.slid = slid; + dd->err_info_rcv_constraint.pkey = pkey; + } +} + +/* + * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx + * is a hint as to the best place in the partition key table to begin + * searching. This function should not be called on the data path because + * of performance reasons. On datapath pkey check is expected to be done + * by HW and rcv_pkey_check function should be called instead. + */ +static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u8 idx, u32 slid, bool force) +{ + if (!(force) && !(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* The most likely matching pkey has index 'idx' */ + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx])) + return 0; + + /* no match - try the whole table */ + if (!ingress_pkey_table_search(ppd, pkey)) + return 0; + +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* + * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. It only ensures pkey is vlid for QP0. This function + * should be called on the data path instead of ingress_pkey_check + * as on data path, pkey check is done by HW (except for QP0). + */ +static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + return 0; +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* MTU handling */ + +/* MTU enumeration, 256-4k match IB */ +#define OPA_MTU_0 0 +#define OPA_MTU_256 1 +#define OPA_MTU_512 2 +#define OPA_MTU_1024 3 +#define OPA_MTU_2048 4 +#define OPA_MTU_4096 5 + +u32 lrh_max_header_bytes(struct hfi1_devdata *dd); +int mtu_to_enum(u32 mtu, int default_if_bad); +u16 enum_to_mtu(int mtu); +static inline int valid_ib_mtu(unsigned int mtu) +{ + return mtu == 256 || mtu == 512 || + mtu == 1024 || mtu == 2048 || + mtu == 4096; +} + +static inline int valid_opa_max_mtu(unsigned int mtu) +{ + return mtu >= 2048 && + (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240); +} + +int set_mtu(struct hfi1_pportdata *ppd); + +int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc); +void hfi1_disable_after_error(struct hfi1_devdata *dd); +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit); +int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encode); + +int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t); +int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t); + +void set_up_vau(struct hfi1_devdata *dd, u8 vau); +void set_up_vl15(struct hfi1_devdata *dd, u16 vl15buf); +void reset_link_credits(struct hfi1_devdata *dd); +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); + +int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc); + +static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev) +{ + return container_of(dev, struct hfi1_devdata, verbs_dev); +} + +static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp) +{ + return container_of(ibp, struct hfi1_pportdata, ibport_data); +} + +static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi) +{ + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u32 port) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +static inline struct hfi1_ibport *rcd_to_iport(struct hfi1_ctxtdata *rcd) +{ + return &rcd->ppd->ibport_data; +} + +/** + * hfi1_may_ecn - Check whether FECN or BECN processing should be done + * @pkt: the packet to be evaluated + * + * Check whether the FECN or BECN bits in the packet's header are + * enabled, depending on packet type. + * + * This function only checks for FECN and BECN bits. Additional checks + * are done in the slowpath (hfi1_process_ecn_slowpath()) in order to + * ensure correct handling. + */ +static inline bool hfi1_may_ecn(struct hfi1_packet *pkt) +{ + bool fecn, becn; + + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + fecn = hfi1_16B_get_fecn(pkt->hdr); + becn = hfi1_16B_get_becn(pkt->hdr); + } else { + fecn = ib_bth_get_fecn(pkt->ohdr); + becn = ib_bth_get_becn(pkt->ohdr); + } + return fecn || becn; +} + +bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool prescan); +static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt) +{ + bool do_work; + + do_work = hfi1_may_ecn(pkt); + if (unlikely(do_work)) + return hfi1_process_ecn_slowpath(qp, pkt, false); + return false; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 ret; + + if (index >= ARRAY_SIZE(ppd->pkeys)) + ret = 0; + else + ret = ppd->pkeys[index]; + + return ret; +} + +/* + * Return the indexed GUID from the port GUIDs table. + */ +static inline __be64 get_sguid(struct hfi1_ibport *ibp, unsigned int index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + WARN_ON(index >= HFI1_GUIDS_PER_PORT); + return cpu_to_be64(ppd->guids[index]); +} + +/* + * Called by readers of cc_state only, must call under rcu_read_lock(). + */ +static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) +{ + return rcu_dereference(ppd->cc_state); +} + +/* + * Called by writers of cc_state only, must call under cc_state_lock. + */ +static inline +struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) +{ + return rcu_dereference_protected(ppd->cc_state, + lockdep_is_held(&ppd->cc_state_lock)); +} + +/* + * values for dd->flags (_device_ related flags) + */ +#define HFI1_INITTED 0x1 /* chip and driver up and initted */ +#define HFI1_PRESENT 0x2 /* chip accesses can be done */ +#define HFI1_FROZEN 0x4 /* chip in SPC freeze */ +#define HFI1_HAS_SDMA_TIMEOUT 0x8 +#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ +#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ +#define HFI1_SHUTDOWN 0x100 /* device is shutting down */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) + +/* ctxt_flag bit offsets */ + /* base context has not finished initializing */ +#define HFI1_CTXT_BASE_UNINIT 1 + /* base context initaliation failed */ +#define HFI1_CTXT_BASE_FAILED 2 + /* waiting for a packet to arrive */ +#define HFI1_CTXT_WAITING_RCV 3 + /* waiting for an urgent packet to arrive */ +#define HFI1_CTXT_WAITING_URG 4 + +/* free up any allocated data at closes */ +int hfi1_init_dd(struct hfi1_devdata *dd); +void hfi1_free_devdata(struct hfi1_devdata *dd); + +/* LED beaconing functions */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff); +void shutdown_led_override(struct hfi1_pportdata *ppd); + +#define HFI1_CREDIT_RETURN_RATE (100) + +/* + * The number of words for the KDETH protocol field. If this is + * larger then the actual field used, then part of the payload + * will be in the header. + * + * Optimally, we want this sized so that a typical case will + * use full cache lines. The typical local KDETH header would + * be: + * + * Bytes Field + * 8 LRH + * 12 BHT + * ?? KDETH + * 8 RHF + * --- + * 28 + KDETH + * + * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS + */ +#define DEFAULT_RCVHDRSIZE 9 + +/* + * Maximal header byte count: + * + * Bytes Field + * 8 LRH + * 40 GRH (optional) + * 12 BTH + * ?? KDETH + * 8 RHF + * --- + * 68 + KDETH + * + * We also want to maintain a cache line alignment to assist DMA'ing + * of the header bytes. Round up to a good size. + */ +#define DEFAULT_RCVHDR_ENTSIZE 32 + +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages); +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, + size_t npages, bool writable, struct page **pages); +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty); + +/** + * hfi1_rcvhdrtail_kvaddr - return tail kvaddr + * @rcd - the receive context + */ +static inline __le64 *hfi1_rcvhdrtail_kvaddr(const struct hfi1_ctxtdata *rcd) +{ + return (__le64 *)rcd->rcvhdrtail_kvaddr; +} + +static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + u64 *kv = (u64 *)hfi1_rcvhdrtail_kvaddr(rcd); + + if (kv) + *kv = 0ULL; +} + +static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32)le64_to_cpu(*hfi1_rcvhdrtail_kvaddr(rcd)); +} + +static inline bool hfi1_packet_present(struct hfi1_ctxtdata *rcd) +{ + if (likely(!rcd->rcvhdrtail_kvaddr)) { + u32 seq = rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))); + + return !last_rcv_seq(rcd, seq); + } + return hfi1_rcd_head(rcd) != get_rcvhdrtail(rcd); +} + +/* + * sysfs interface. + */ + +extern const char ib_hfi1_version[]; +extern const struct attribute_group ib_hfi1_attr_group; +extern const struct attribute_group *hfi1_attr_port_groups[]; + +int hfi1_device_create(struct hfi1_devdata *dd); +void hfi1_device_remove(struct hfi1_devdata *dd); + +int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd); +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd); +/* Hook for sysfs read of QSFP */ +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); + +int hfi1_pcie_init(struct hfi1_devdata *dd); +void hfi1_pcie_cleanup(struct pci_dev *pdev); +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); +void hfi1_pcie_ddcleanup(struct hfi1_devdata *); +int pcie_speeds(struct hfi1_devdata *dd); +int restore_pci_variables(struct hfi1_devdata *dd); +int save_pci_variables(struct hfi1_devdata *dd); +int do_pcie_gen3_transition(struct hfi1_devdata *dd); +void tune_pcie_caps(struct hfi1_devdata *dd); +int parse_platform_config(struct hfi1_devdata *dd); +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len); + +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +static inline void flush_wc(void) +{ + asm volatile("sfence" : : : "memory"); +} + +void handle_eflags(struct hfi1_packet *packet); +void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd); + +/* global module parameter variables */ +extern unsigned int hfi1_max_mtu; +extern unsigned int hfi1_cu; +extern unsigned int user_credit_return_threshold; +extern int num_user_contexts; +extern unsigned long n_krcvqs; +extern uint krcvqs[]; +extern int krcvqsset; +extern uint loopback; +extern uint quick_linkup; +extern uint rcv_intr_timeout; +extern uint rcv_intr_count; +extern uint rcv_intr_dynamic; +extern ushort link_crc_mask; + +extern struct mutex hfi1_mutex; + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +#define DRIVER_NAME "hfi1" +#define HFI1_USER_MINOR_BASE 0 +#define HFI1_TRACE_MINOR 127 +#define HFI1_NMINORS 255 + +#define PCI_VENDOR_ID_INTEL 0x8086 +#define PCI_DEVICE_ID_INTEL0 0x24f0 +#define PCI_DEVICE_ID_INTEL1 0x24f1 + +#define HFI1_PKT_USER_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) + +#define HFI1_PKT_KERNEL_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK) + +static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, + u16 ctxt_type) +{ + u64 base_sc_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sc_integrity = + SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK +#ifndef CONFIG_FAULT_INJECTION + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK +#endif + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (ctxt_type == SC_USER) + base_sc_integrity |= +#ifndef CONFIG_FAULT_INJECTION + SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK | +#endif + HFI1_PKT_USER_SC_INTEGRITY; + else if (ctxt_type != SC_KERNEL) + base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sc_integrity |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + + return base_sc_integrity; +} + +static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) +{ + u64 base_sdma_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sdma_integrity = + SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (!HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + + return base_sdma_integrity; +} + +#define dd_dev_emerg(dd, fmt, ...) \ + dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_err_ratelimited(dd, fmt, ...) \ + dev_err_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) + +#define dd_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_warn_ratelimited(dd, fmt, ...) \ + dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) + +#define dd_dev_info(dd, fmt, ...) \ + dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_info_ratelimited(dd, fmt, ...) \ + dev_info_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) + +#define dd_dev_dbg(dd, fmt, ...) \ + dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define hfi1_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct hfi1_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +/* in intr.c... */ +void hfi1_format_hwerrors(u64 hwerrs, + const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); + +#define USER_OPCODE_CHECK_VAL 0xC0 +#define USER_OPCODE_CHECK_MASK 0xC0 +#define OPCODE_CHECK_VAL_DISABLED 0x0 +#define OPCODE_CHECK_MASK_DISABLED 0x0 + +static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + dd->z_int_counter = get_all_cpu_total(dd->int_counter); + dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit); + dd->z_send_schedule = get_all_cpu_total(dd->send_schedule); + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.z_rc_acks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_acks); + ppd->ibport_data.rvp.z_rc_qacks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks); + } +} + +/* Control LED state */ +static inline void setextled(struct hfi1_devdata *dd, u32 on) +{ + if (on) + write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F); + else + write_csr(dd, DCC_CFG_LED_CNTRL, 0x10); +} + +/* return the i2c resource given the target */ +static inline u32 i2c_target(u32 target) +{ + return target ? CR_I2C2 : CR_I2C1; +} + +/* return the i2c chain chip resource that this HFI uses for QSFP */ +static inline u32 qsfp_resource(struct hfi1_devdata *dd) +{ + return i2c_target(dd->hfi1_id); +} + +/* Is this device integrated or discrete? */ +static inline bool is_integrated(struct hfi1_devdata *dd) +{ + return dd->pcidev->device == PCI_DEVICE_ID_INTEL1; +} + +/** + * hfi1_need_drop - detect need for drop + * @dd: - the device + * + * In some cases, the first packet needs to be dropped. + * + * Return true is the current packet needs to be dropped and false otherwise. + */ +static inline bool hfi1_need_drop(struct hfi1_devdata *dd) +{ + if (unlikely(dd->do_drop && + atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) == + DROP_PACKET_ON)) { + dd->do_drop = false; + return true; + } + return false; +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); + +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +static inline void hfi1_update_ah_attr(struct ib_device *ibdev, + struct rdma_ah_attr *attr) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid = rdma_ah_get_dlid(attr); + + /* + * Kernel clients may not have setup GRH information + * Set that here. + */ + ibp = to_iport(ibdev, rdma_ah_get_port_num(attr)); + ppd = ppd_from_ibp(ibp); + if ((((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (!(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))) || + (rdma_ah_get_make_grd(attr))) { + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_interface_id(attr, OPA_MAKE_ID(dlid)); + rdma_ah_set_subnet_prefix(attr, ibp->rvp.gid_prefix); + } +} + +/* + * hfi1_check_mcast- Check if the given lid is + * in the OPA multicast range. + * + * The LID might either reside in ah.dlid or might be + * in the GRH of the address handle as DGID if extended + * addresses are in use. + */ +static inline bool hfi1_check_mcast(u32 lid) +{ + return ((lid >= opa_get_mcast_base(OPA_MCAST_NR)) && + (lid != be32_to_cpu(OPA_LID_PERMISSIVE))); +} + +#define opa_get_lid(lid, format) \ + __opa_get_lid(lid, OPA_PORT_PACKET_FORMAT_##format) + +/* Convert a lid to a specific lid space */ +static inline u32 __opa_get_lid(u32 lid, u8 format) +{ + bool is_mcast = hfi1_check_mcast(lid); + + switch (format) { + case OPA_PORT_PACKET_FORMAT_8B: + case OPA_PORT_PACKET_FORMAT_10B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF0000); + return lid & 0xFFFFF; + case OPA_PORT_PACKET_FORMAT_16B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF00000); + return lid & 0xFFFFFF; + case OPA_PORT_PACKET_FORMAT_9B: + if (is_mcast) + return (lid - + opa_get_mcast_base(OPA_MCAST_NR) + + be16_to_cpu(IB_MULTICAST_LID_BASE)); + else + return lid & 0xFFFF; + default: + return lid; + } +} + +/* Return true if the given lid is the OPA 16B multicast range */ +static inline bool hfi1_is_16B_mcast(u32 lid) +{ + return ((lid >= + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && + (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); +} + +static inline void hfi1_make_opa_lid(struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + u32 dlid = rdma_ah_get_dlid(attr); + + /* Modify ah_attr.dlid to be in the 32 bit LID space. + * This is how the address will be laid out: + * Assuming MCAST_NR to be 4, + * 32 bit permissive LID = 0xFFFFFFFF + * Multicast LID range = 0xFFFFFFFE to 0xF0000000 + * Unicast LID range = 0xEFFFFFFF to 1 + * Invalid LID = 0 + */ + if (ib_is_opa_gid(&grh->dgid)) + dlid = opa_get_lid_from_gid(&grh->dgid); + else if ((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE))) + dlid = dlid - be16_to_cpu(IB_MULTICAST_LID_BASE) + + opa_get_mcast_base(OPA_MCAST_NR); + else if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) + dlid = be32_to_cpu(OPA_LID_PERMISSIVE); + + rdma_ah_set_dlid(attr, dlid); +} + +static inline u8 hfi1_get_packet_type(u32 lid) +{ + /* 9B if lid > 0xF0000000 */ + if (lid >= opa_get_mcast_base(OPA_MCAST_NR)) + return HFI1_PKT_TYPE_9B; + + /* 16B if lid > 0xC000 */ + if (lid >= opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 9B)) + return HFI1_PKT_TYPE_16B; + + return HFI1_PKT_TYPE_9B; +} + +static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) +{ + /* + * If there was an incoming 16B packet with permissive + * LIDs, OPA GIDs would have been programmed when those + * packets were received. A 16B packet will have to + * be sent in response to that packet. Return a 16B + * header type if that's the case. + */ + if (rdma_ah_get_dlid(attr) == be32_to_cpu(OPA_LID_PERMISSIVE)) + return (ib_is_opa_gid(&rdma_ah_read_grh(attr)->dgid)) ? + HFI1_PKT_TYPE_16B : HFI1_PKT_TYPE_9B; + + /* + * Return a 16B header type if either the destination + * or source lid is extended. + */ + if (hfi1_get_packet_type(rdma_ah_get_dlid(attr)) == HFI1_PKT_TYPE_16B) + return HFI1_PKT_TYPE_16B; + + return hfi1_get_packet_type(lid); +} + +static inline void hfi1_make_ext_grh(struct hfi1_packet *packet, + struct ib_grh *grh, u32 slid, + u32 dlid) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (!ibp) + return; + + grh->hop_limit = 1; + grh->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + if (slid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B)) + grh->sgid.global.interface_id = + OPA_MAKE_ID(be32_to_cpu(OPA_LID_PERMISSIVE)); + else + grh->sgid.global.interface_id = OPA_MAKE_ID(slid); + + /* + * Upper layers (like mad) may compare the dgid in the + * wc that is obtained here with the sgid_index in + * the wr. Since sgid_index in wr is always 0 for + * extended lids, set the dgid here to the default + * IB gid. + */ + grh->dgid.global.subnet_prefix = ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); +} + +static inline int hfi1_get_16b_padding(u32 hdr_size, u32 payload) +{ + return -(hdr_size + payload + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) & 0x7; +} + +static inline void hfi1_make_ib_hdr(struct ib_header *hdr, + u16 lrh0, u16 len, + u16 dlid, u16 slid) +{ + hdr->lrh[0] = cpu_to_be16(lrh0); + hdr->lrh[1] = cpu_to_be16(dlid); + hdr->lrh[2] = cpu_to_be16(len); + hdr->lrh[3] = cpu_to_be16(slid); +} + +static inline void hfi1_make_16b_hdr(struct hfi1_16b_header *hdr, + u32 slid, u32 dlid, + u16 len, u16 pkey, + bool becn, bool fecn, u8 l4, + u8 sc) +{ + u32 lrh0 = 0; + u32 lrh1 = 0x40000000; + u32 lrh2 = 0; + u32 lrh3 = 0; + + lrh0 = (lrh0 & ~OPA_16B_BECN_MASK) | (becn << OPA_16B_BECN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LEN_MASK) | (len << OPA_16B_LEN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LID_MASK) | (slid & OPA_16B_LID_MASK); + lrh1 = (lrh1 & ~OPA_16B_FECN_MASK) | (fecn << OPA_16B_FECN_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_SC_MASK) | (sc << OPA_16B_SC_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_LID_MASK) | (dlid & OPA_16B_LID_MASK); + lrh2 = (lrh2 & ~OPA_16B_SLID_MASK) | + ((slid >> OPA_16B_SLID_SHIFT) << OPA_16B_SLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_DLID_MASK) | + ((dlid >> OPA_16B_DLID_SHIFT) << OPA_16B_DLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_PKEY_MASK) | ((u32)pkey << OPA_16B_PKEY_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_L4_MASK) | l4; + + hdr->lrh[0] = lrh0; + hdr->lrh[1] = lrh1; + hdr->lrh[2] = lrh2; + hdr->lrh[3] = lrh3; +} +#endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c new file mode 100644 index 0000000000..6de37c5d7d --- /dev/null +++ b/drivers/infiniband/hw/hfi1/init.c @@ -0,0 +1,1980 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + * Copyright(c) 2021 Cornelis Networks. + */ + +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/vmalloc.h> +#include <linux/delay.h> +#include <linux/xarray.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/hrtimer.h> +#include <linux/bitmap.h> +#include <linux/numa.h> +#include <rdma/rdma_vt.h> + +#include "hfi.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "mad.h" +#include "sdma.h" +#include "debugfs.h" +#include "verbs.h" +#include "aspm.h" +#include "affinity.h" +#include "vnic.h" +#include "exp_rcv.h" +#include "netdev.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +/* + * min buffers we want to have per context, after driver + */ +#define HFI1_MIN_USER_CTXT_BUFCNT 7 + +#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ +#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ + +#define NUM_IB_PORTS 1 + +/* + * Number of user receive contexts we are configured to use (to allow for more + * pio buffers per ctxt, etc.) Zero means use one user context per CPU. + */ +int num_user_contexts = -1; +module_param_named(num_user_contexts, num_user_contexts, int, 0444); +MODULE_PARM_DESC( + num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)"); + +uint krcvqs[RXE_NUM_DATA_VL]; +int krcvqsset; +module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); + +/* computed based on above array */ +unsigned long n_krcvqs; + +static unsigned hfi1_rcvarr_split = 25; +module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); +MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); + +static uint eager_buffer_size = (8 << 20); /* 8MB */ +module_param(eager_buffer_size, uint, S_IRUGO); +MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB"); + +static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ +module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); + +static uint hfi1_hdrq_entsize = 32; +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); + +unsigned int user_credit_return_threshold = 33; /* default is 33% */ +module_param(user_credit_return_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); + +DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); + +static int hfi1_create_kctxt(struct hfi1_devdata *dd, + struct hfi1_pportdata *ppd) +{ + struct hfi1_ctxtdata *rcd; + int ret; + + /* Control context has to be always 0 */ + BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); + + ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); + if (ret < 0) { + dd_dev_err(dd, "Kernel receive context allocation failed\n"); + return ret; + } + + /* + * Set up the kernel context flags here and now because they use + * default values for all receive side memories. User contexts will + * be handled as they are created. + */ + rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + + /* Control context must use DMA_RTAIL */ + if (rcd->ctxt == HFI1_CTRL_CTXT) + rcd->flags |= HFI1_CAP_DMA_RTAIL; + rcd->fast_handler = get_dma_rtail_setting(rcd) ? + handle_receive_interrupt_dma_rtail : + handle_receive_interrupt_nodma_rtail; + + hfi1_set_seq_cnt(rcd, 1); + + rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); + if (!rcd->sc) { + dd_dev_err(dd, "Kernel send context allocation failed\n"); + return -ENOMEM; + } + hfi1_init_ctxt(rcd->sc); + + return 0; +} + +/* + * Create the receive context array and one or more kernel contexts + */ +int hfi1_create_kctxts(struct hfi1_devdata *dd) +{ + u16 i; + int ret; + + dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), + GFP_KERNEL, dd->node); + if (!dd->rcd) + return -ENOMEM; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { + ret = hfi1_create_kctxt(dd, dd->pport); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) + hfi1_free_ctxt(dd->rcd[i]); + + /* All the contexts should be freed, free the array */ + kfree(dd->rcd); + dd->rcd = NULL; + return ret; +} + +/* + * Helper routines for the receive context reference count (rcd and uctxt). + */ +static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) +{ + kref_init(&rcd->kref); +} + +/** + * hfi1_rcd_free - When reference is zero clean up. + * @kref: pointer to an initialized rcd data structure + * + */ +static void hfi1_rcd_free(struct kref *kref) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = + container_of(kref, struct hfi1_ctxtdata, kref); + + spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); + rcd->dd->rcd[rcd->ctxt] = NULL; + spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); + + hfi1_free_ctxtdata(rcd->dd, rcd); + + kfree(rcd); +} + +/** + * hfi1_rcd_put - decrement reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to put a reference after the init. + */ +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) +{ + if (rcd) + return kref_put(&rcd->kref, hfi1_rcd_free); + + return 0; +} + +/** + * hfi1_rcd_get - increment reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to get a reference after the init. + * + * Return : reflect kref_get_unless_zero(), which returns non-zero on + * increment, otherwise 0. + */ +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd) +{ + return kref_get_unless_zero(&rcd->kref); +} + +/** + * allocate_rcd_index - allocate an rcd index from the rcd array + * @dd: pointer to a valid devdata structure + * @rcd: rcd data structure to assign + * @index: pointer to index that is allocated + * + * Find an empty index in the rcd array, and assign the given rcd to it. + * If the array is full, we are EBUSY. + * + */ +static int allocate_rcd_index(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *rcd, u16 *index) +{ + unsigned long flags; + u16 ctxt; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt < dd->num_rcv_contexts) { + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + hfi1_rcd_init(rcd); + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + if (ctxt >= dd->num_rcv_contexts) + return -EBUSY; + + *index = ctxt; + + return 0; +} + +/** + * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the + * array + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given + * ctxt index is valid. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, + u16 ctxt) +{ + if (ctxt < dd->num_rcv_contexts) + return hfi1_rcd_get_by_index(dd, ctxt); + + return NULL; +} + +/** + * hfi1_rcd_get_by_index - get by index + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * We need to protect access to the rcd array. If access is needed to + * one or more index, get the protecting spinlock and then increment the + * kref. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = NULL; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd[ctxt]) { + rcd = dd->rcd[ctxt]; + if (!hfi1_rcd_get(rcd)) + rcd = NULL; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + return rcd; +} + +/* + * Common code for user and kernel context create and setup. + * NOTE: the initial kref is done here (hf1_rcd_init()). + */ +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **context) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ctxtdata *rcd; + unsigned kctxt_ngroups = 0; + u32 base; + + if (dd->rcv_entries.nctxt_extra > + dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt) + kctxt_ngroups = (dd->rcv_entries.nctxt_extra - + (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)); + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa); + if (rcd) { + u32 rcvtids, max_entries; + u16 ctxt; + int ret; + + ret = allocate_rcd_index(dd, rcd, &ctxt); + if (ret) { + *context = NULL; + kfree(rcd); + return ret; + } + + INIT_LIST_HEAD(&rcd->qp_wait_list); + hfi1_exp_tid_group_init(rcd); + rcd->ppd = ppd; + rcd->dd = dd; + rcd->numa_id = numa; + rcd->rcv_array_groups = dd->rcv_entries.ngroups; + rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; + rcd->slow_handler = handle_receive_interrupt; + rcd->do_interrupt = rcd->slow_handler; + rcd->msix_intr = CCE_NUM_MSIX_VECTORS; + + mutex_init(&rcd->exp_mutex); + spin_lock_init(&rcd->exp_lock); + INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); + + hfi1_cdbg(PROC, "setting up context %u", rcd->ctxt); + + /* + * Calculate the context's RcvArray entry starting point. + * We do this here because we have to take into account all + * the RcvArray entries that previous context would have + * taken and we have to account for any extra groups assigned + * to the static (kernel) or dynamic (vnic/user) contexts. + */ + if (ctxt < dd->first_dyn_alloc_ctxt) { + if (ctxt < kctxt_ngroups) { + base = ctxt * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else { + base = kctxt_ngroups + + (ctxt * dd->rcv_entries.ngroups); + } + } else { + u16 ct = ctxt - dd->first_dyn_alloc_ctxt; + + base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + + kctxt_ngroups); + if (ct < dd->rcv_entries.nctxt_extra) { + base += ct * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else { + base += dd->rcv_entries.nctxt_extra + + (ct * dd->rcv_entries.ngroups); + } + } + rcd->eager_base = base * dd->rcv_entries.group_size; + + rcd->rcvhdrq_cnt = rcvhdrcnt; + rcd->rcvhdrqentsize = hfi1_hdrq_entsize; + rcd->rhf_offset = + rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32); + /* + * Simple Eager buffer allocation: we have already pre-allocated + * the number of RcvArray entry groups. Each ctxtdata structure + * holds the number of groups for that context. + * + * To follow CSR requirements and maintain cacheline alignment, + * make sure all sizes and bases are multiples of group_size. + * + * The expected entry count is what is left after assigning + * eager. + */ + max_entries = rcd->rcv_array_groups * + dd->rcv_entries.group_size; + rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); + rcd->egrbufs.count = round_down(rcvtids, + dd->rcv_entries.group_size); + if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { + dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", + rcd->ctxt); + rcd->egrbufs.count = MAX_EAGER_ENTRIES; + } + hfi1_cdbg(PROC, + "ctxt%u: max Eager buffer RcvArray entries: %u", + rcd->ctxt, rcd->egrbufs.count); + + /* + * Allocate array that will hold the eager buffer accounting + * data. + * This will allocate the maximum possible buffer count based + * on the value of the RcvArray split parameter. + * The resulting value will be rounded down to the closest + * multiple of dd->rcv_entries.group_size. + */ + rcd->egrbufs.buffers = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.buffers), + GFP_KERNEL, numa); + if (!rcd->egrbufs.buffers) + goto bail; + rcd->egrbufs.rcvtids = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.rcvtids), + GFP_KERNEL, numa); + if (!rcd->egrbufs.rcvtids) + goto bail; + rcd->egrbufs.size = eager_buffer_size; + /* + * The size of the buffers programmed into the RcvArray + * entries needs to be big enough to handle the highest + * MTU supported. + */ + if (rcd->egrbufs.size < hfi1_max_mtu) { + rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); + hfi1_cdbg(PROC, + "ctxt%u: eager bufs size too small. Adjusting to %u", + rcd->ctxt, rcd->egrbufs.size); + } + rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; + + /* Applicable only for statically created kernel contexts */ + if (ctxt < dd->first_dyn_alloc_ctxt) { + rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), + GFP_KERNEL, numa); + if (!rcd->opstats) + goto bail; + + /* Initialize TID flow generations for the context */ + hfi1_kern_init_ctxt_generations(rcd); + } + + *context = rcd; + return 0; + } + +bail: + *context = NULL; + hfi1_free_ctxt(rcd); + return -ENOMEM; +} + +/** + * hfi1_free_ctxt - free context + * @rcd: pointer to an initialized rcd data structure + * + * This wrapper is the free function that matches hfi1_create_ctxtdata(). + * When a context is done being used (kernel or user), this function is called + * for the "final" put to match the kref init from hfi1_create_ctxtdata(). + * Other users of the context do a get/put sequence to make sure that the + * structure isn't removed while in use. + */ +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd) +{ + hfi1_rcd_put(rcd); +} + +/* + * Select the largest ccti value over all SLs to determine the intra- + * packet gap for the link. + * + * called with cca_timer_lock held (to protect access to cca_timer + * array), and rcu_read_lock() (to protect access to cc_state). + */ +void set_link_ipg(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + struct cc_state *cc_state; + int i; + u16 cce, ccti_limit, max_ccti = 0; + u16 shift, mult; + u64 src; + u32 current_egress_rate; /* Mbits /sec */ + u64 max_pkt_time; + /* + * max_pkt_time is the maximum packet egress time in units + * of the fabric clock period 1/(805 MHz). + */ + + cc_state = get_cc_state(ppd); + + if (!cc_state) + /* + * This should _never_ happen - rcu_read_lock() is held, + * and set_link_ipg() should not be called if cc_state + * is NULL. + */ + return; + + for (i = 0; i < OPA_MAX_SLS; i++) { + u16 ccti = ppd->cca_timer[i].ccti; + + if (ccti > max_ccti) + max_ccti = ccti; + } + + ccti_limit = cc_state->cct.ccti_limit; + if (max_ccti > ccti_limit) + max_ccti = ccti_limit; + + cce = cc_state->cct.entries[max_ccti].entry; + shift = (cce & 0xc000) >> 14; + mult = (cce & 0x3fff); + + current_egress_rate = active_egress_rate(ppd); + + max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); + + src = (max_pkt_time >> shift) * mult; + + src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; + src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; + + write_csr(dd, SEND_STATIC_RATE_CONTROL, src); +} + +static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) +{ + struct cca_timer *cca_timer; + struct hfi1_pportdata *ppd; + int sl; + u16 ccti_timer, ccti_min; + struct cc_state *cc_state; + unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + cca_timer = container_of(t, struct cca_timer, hrtimer); + ppd = cca_timer->ppd; + sl = cca_timer->sl; + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return HRTIMER_NORESTART; + } + + /* + * 1) decrement ccti for SL + * 2) calculate IPG for link (set_link_ipg()) + * 3) restart timer, unless ccti is at min value + */ + + ccti_min = cc_state->cong_setting.entries[sl].ccti_min; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + + spin_lock_irqsave(&ppd->cca_timer_lock, flags); + + if (cca_timer->ccti > ccti_min) { + cca_timer->ccti--; + set_link_ipg(ppd); + } + + if (cca_timer->ccti > ccti_min) { + unsigned long nsec = 1024 * ccti_timer; + /* ccti_timer is in units of 1.024 usec */ + hrtimer_forward_now(t, ns_to_ktime(nsec)); + ret = HRTIMER_RESTART; + } + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + rcu_read_unlock(); + return ret; +} + +/* + * Common code for initializing the physical port structure. + */ +void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, + struct hfi1_devdata *dd, u8 hw_pidx, u32 port) +{ + int i; + uint default_pkey_idx; + struct cc_state *cc_state; + + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + ppd->prev_link_width = LINK_WIDTH_DEFAULT; + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) { + ppd->port_vl_xmit_wait_last[i] = 0; + ppd->vl_xmit_flit_cnt[i] = 0; + } + + default_pkey_idx = 1; + + ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; + ppd->part_enforce |= HFI1_PART_ENFORCE_IN; + ppd->pkeys[0] = 0x8001; + + INIT_WORK(&ppd->link_vc_work, handle_verify_cap); + INIT_WORK(&ppd->link_up_work, handle_link_up); + INIT_WORK(&ppd->link_down_work, handle_link_down); + INIT_WORK(&ppd->freeze_work, handle_freeze); + INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); + INIT_WORK(&ppd->sma_message_work, handle_sma_message); + INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); + INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link); + INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); + INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); + + mutex_init(&ppd->hls_lock); + spin_lock_init(&ppd->qsfp_info.qsfp_lock); + + ppd->qsfp_info.ppd = ppd; + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; + + ppd->hfi1_wq = NULL; + + spin_lock_init(&ppd->cca_timer_lock); + + for (i = 0; i < OPA_MAX_SLS; i++) { + hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + ppd->cca_timer[i].ppd = ppd; + ppd->cca_timer[i].sl = i; + ppd->cca_timer[i].ccti = 0; + ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + } + + ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; + + spin_lock_init(&ppd->cc_state_lock); + spin_lock_init(&ppd->cc_log_lock); + cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); + RCU_INIT_POINTER(ppd->cc_state, cc_state); + if (!cc_state) + goto bail; + return; + +bail: + dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct hfi1_devdata *dd) +{ + return 0; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the hfi1_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_ctxtdata *rcd; + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS, rcd); + hfi1_rcd_put(rcd); + } + pio_send_control(dd, PSC_GLOBAL_DISABLE); + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + + return 0; +} + +static void enable_chip(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u32 rcvmask; + u16 i; + + /* enable PIO send */ + pio_send_control(dd, PSC_GLOBAL_ENABLE); + + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and initializes them. + */ + for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; + rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) + rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; + hfi1_rcvctrl(dd, rcvmask, rcd); + sc_enable(rcd->sc); + hfi1_rcd_put(rcd); + } +} + +/** + * create_workqueues - create per port workqueues + * @dd: the hfi1_ib device + */ +static int create_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->hfi1_wq) { + ppd->hfi1_wq = + alloc_workqueue( + "hfi%d_%d", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | + WQ_MEM_RECLAIM, + HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, + dd->unit, pidx); + if (!ppd->hfi1_wq) + goto wq_error; + } + if (!ppd->link_wq) { + /* + * Make the link workqueue single-threaded to enforce + * serialization. + */ + ppd->link_wq = + alloc_workqueue( + "hfi_link_%d_%d", + WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, + 1, /* max_active */ + dd->unit, pidx); + if (!ppd->link_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("alloc_workqueue failed for port %d\n", pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } + return -ENOMEM; +} + +/** + * destroy_workqueues - destroy per port workqueues + * @dd: the hfi1_ib device + */ +static void destroy_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } +} + +/** + * enable_general_intr() - Enable the IRQs that will be handled by the + * general interrupt handler. + * @dd: valid devdata + * + */ +static void enable_general_intr(struct hfi1_devdata *dd) +{ + set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); + set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); + set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); + set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); + set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); + set_intr_bits(dd, IS_DC_START, IS_DC_END, true); + set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); +} + +/** + * hfi1_init - do the actual initialization sequence on the chip + * @dd: the hfi1_ib device + * @reinit: re-initializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int hfi1_init(struct hfi1_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + unsigned long len; + u16 i; + struct hfi1_ctxtdata *rcd; + struct hfi1_pportdata *ppd; + + /* Set up send low level handlers */ + dd->process_pio_send = hfi1_verbs_send_pio; + dd->process_dma_send = hfi1_verbs_send_dma; + dd->pio_inline_send = pio_copy; + dd->process_vnic_dma_send = hfi1_vnic_send_dma; + + if (is_ax(dd)) { + atomic_set(&dd->drop_packet, DROP_PACKET_ON); + dd->do_drop = true; + } else { + atomic_set(&dd->drop_packet, DROP_PACKET_OFF); + dd->do_drop = false; + } + + /* make sure the link is not "up" */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + ppd->linkup = 0; + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* dd->rcd can be NULL if early initialization failed */ + for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; + + lastfail = hfi1_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = hfi1_setup_eagerbufs(rcd); + if (!lastfail) + lastfail = hfi1_kern_exp_rcv_init(rcd, reinit); + if (lastfail) { + dd_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + ret = lastfail; + } + /* enable IRQ */ + hfi1_rcd_put(rcd); + } + + /* Allocate enough memory for user event notification. */ + len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS * + sizeof(*dd->events)); + dd->events = vmalloc_user(len); + if (!dd->events) + dd_dev_err(dd, "Failed to allocate user events page\n"); + /* + * Allocate a page for device and port status. + * Page will be shared amongst all user processes. + */ + dd->status = vmalloc_user(PAGE_SIZE); + if (!dd->status) + dd_dev_err(dd, "Failed to allocate dev status page\n"); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (dd->status) + /* Currently, we only have one port */ + ppd->statusp = &dd->status->port; + + set_mtu(ppd); + } + + /* enable chip even if we have an error, so we can debug cause */ + enable_chip(dd); + +done: + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ret) { + /* enable all interrupts from the chip */ + enable_general_intr(dd); + init_qsfp_int(dd); + + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* + * start the serdes - must be after interrupts are + * enabled so we are notified when the link goes up + */ + lastfail = bringup_serdes(ppd); + if (lastfail) + dd_dev_info(dd, + "Failed to bring up port %u\n", + ppd->port); + + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (ppd->statusp) + *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + } + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +struct hfi1_devdata *hfi1_lookup(int unit) +{ + return xa_load(&hfi1_dev_table, unit); +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void stop_timers(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->led_override_timer.function) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + } +} + +/** + * shutdown_device - shut down a device + * @dd: the hfi1_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by hfi1_init(dd, 1) + */ +static void shutdown_device(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ctxtdata *rcd; + unsigned pidx; + int i; + + if (dd->flags & HFI1_SHUTDOWN) + return; + dd->flags |= HFI1_SHUTDOWN; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + ppd->linkup = 0; + if (ppd->statusp) + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + } + dd->flags &= ~HFI1_INITTED; + + /* mask and clean up interrupts */ + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + msix_clean_up_interrupts(dd); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_PKEY_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); + hfi1_rcd_put(rcd); + } + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_flush(dd->send_contexts[i].sc); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* disable all contexts */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + /* disable the send device */ + pio_send_control(dd, PSC_GLOBAL_DISABLE); + + shutdown_led_override(ppd); + + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. + */ + hfi1_quiet_serdes(ppd); + if (ppd->hfi1_wq) + flush_workqueue(ppd->hfi1_wq); + if (ppd->link_wq) + flush_workqueue(ppd->link_wq); + } + sdma_exit(dd); +} + +/** + * hfi1_free_ctxtdata - free a context's allocated data + * @dd: the hfi1_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * It should never change any chip state, or global driver state. + */ +void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + u32 e; + + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd), + rcd->rcvhdrq, rcd->rcvhdrq_dma); + rcd->rcvhdrq = NULL; + if (hfi1_rcvhdrtail_kvaddr(rcd)) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)hfi1_rcvhdrtail_kvaddr(rcd), + rcd->rcvhdrqtailaddr_dma); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + + /* all the RcvArray entries should have been cleared by now */ + kfree(rcd->egrbufs.rcvtids); + rcd->egrbufs.rcvtids = NULL; + + for (e = 0; e < rcd->egrbufs.alloced; e++) { + if (rcd->egrbufs.buffers[e].addr) + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[e].len, + rcd->egrbufs.buffers[e].addr, + rcd->egrbufs.buffers[e].dma); + } + kfree(rcd->egrbufs.buffers); + rcd->egrbufs.alloced = 0; + rcd->egrbufs.buffers = NULL; + + sc_free(rcd->sc); + rcd->sc = NULL; + + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); + kfree(rcd->opstats); + + rcd->subctxt_uregbase = NULL; + rcd->subctxt_rcvegrbuf = NULL; + rcd->subctxt_rcvhdr_base = NULL; + rcd->opstats = NULL; +} + +/* + * Release our hold on the shared asic data. If we are the last one, + * return the structure to be finalized outside the lock. Must be + * holding hfi1_dev_table lock. + */ +static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) +{ + struct hfi1_asic_data *ad; + int other; + + if (!dd->asic_data) + return NULL; + dd->asic_data->dds[dd->hfi1_id] = NULL; + other = dd->hfi1_id ? 0 : 1; + ad = dd->asic_data; + dd->asic_data = NULL; + /* return NULL if the other dd still has a link */ + return ad->dds[other] ? NULL : ad; +} + +static void finalize_asic_data(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad) +{ + clean_up_i2c(dd, ad); + kfree(ad); +} + +/** + * hfi1_free_devdata - cleans up and frees per-unit data structure + * @dd: pointer to a valid devdata structure + * + * It cleans up and frees all data structures set up by + * by hfi1_alloc_devdata(). + */ +void hfi1_free_devdata(struct hfi1_devdata *dd) +{ + struct hfi1_asic_data *ad; + unsigned long flags; + + xa_lock_irqsave(&hfi1_dev_table, flags); + __xa_erase(&hfi1_dev_table, dd->unit); + ad = release_asic_data(dd); + xa_unlock_irqrestore(&hfi1_dev_table, flags); + + finalize_asic_data(dd, ad); + free_platform_config(dd); + rcu_barrier(); /* wait for rcu callbacks to complete */ + free_percpu(dd->int_counter); + free_percpu(dd->rcv_limit); + free_percpu(dd->send_schedule); + free_percpu(dd->tx_opstats); + dd->int_counter = NULL; + dd->rcv_limit = NULL; + dd->send_schedule = NULL; + dd->tx_opstats = NULL; + kfree(dd->comp_vect); + dd->comp_vect = NULL; + if (dd->rcvhdrtail_dummy_kvaddr) + dma_free_coherent(&dd->pcidev->dev, sizeof(u64), + (void *)dd->rcvhdrtail_dummy_kvaddr, + dd->rcvhdrtail_dummy_dma); + dd->rcvhdrtail_dummy_kvaddr = NULL; + sdma_clean(dd, dd->num_sdma); + rvt_dealloc_device(&dd->verbs_dev.rdi); +} + +/** + * hfi1_alloc_devdata - Allocate our primary per-unit data structure. + * @pdev: Valid PCI device + * @extra: How many bytes to alloc past the default + * + * Must be done via verbs allocator, because the verbs cleanup process + * both does cleanup and free of the data structure. + * "extra" is for chip-specific data. + */ +static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, + size_t extra) +{ + struct hfi1_devdata *dd; + int ret, nports; + + /* extra is * number of ports */ + nports = extra / sizeof(struct hfi1_pportdata); + + dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, + nports); + if (!dd) + return ERR_PTR(-ENOMEM); + dd->num_pports = nports; + dd->pport = (struct hfi1_pportdata *)(dd + 1); + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b, + GFP_KERNEL); + if (ret < 0) { + dev_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); + /* + * If the BIOS does not have the NUMA node information set, select + * NUMA 0 so we get consistent performance. + */ + dd->node = pcibus_to_node(pdev->bus); + if (dd->node == NUMA_NO_NODE) { + dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n"); + dd->node = 0; + } + + /* + * Initialize all locks for the device. This needs to be as early as + * possible so locks are usable. + */ + spin_lock_init(&dd->sc_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->rcvctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->hfi1_diag_trans_lock); + spin_lock_init(&dd->sc_init_lock); + spin_lock_init(&dd->dc8051_memlock); + seqlock_init(&dd->sc2vl_lock); + spin_lock_init(&dd->sde_map_lock); + spin_lock_init(&dd->pio_map_lock); + mutex_init(&dd->dc8051_lock); + init_waitqueue_head(&dd->event_queue); + spin_lock_init(&dd->irq_src_lock); + + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + goto bail; + } + + dd->rcv_limit = alloc_percpu(u64); + if (!dd->rcv_limit) { + ret = -ENOMEM; + goto bail; + } + + dd->send_schedule = alloc_percpu(u64); + if (!dd->send_schedule) { + ret = -ENOMEM; + goto bail; + } + + dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); + if (!dd->tx_opstats) { + ret = -ENOMEM; + goto bail; + } + + dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); + if (!dd->comp_vect) { + ret = -ENOMEM; + goto bail; + } + + /* allocate dummy tail memory for all receive contexts */ + dd->rcvhdrtail_dummy_kvaddr = + dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64), + &dd->rcvhdrtail_dummy_dma, GFP_KERNEL); + if (!dd->rcvhdrtail_dummy_kvaddr) { + ret = -ENOMEM; + goto bail; + } + + atomic_set(&dd->ipoib_rsm_usr_num, 0); + return dd; + +bail: + hfi1_free_devdata(dd); + return ERR_PTR(ret); +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. + */ +void hfi1_disable_after_error(struct hfi1_devdata *dd) +{ + if (dd->flags & HFI1_INITTED) { + u32 pidx; + + dd->flags &= ~HFI1_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & HFI1_PRESENT) + set_link_state(ppd, HLS_DN_DISABLE); + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_HWERROR; +} + +static void remove_one(struct pci_dev *); +static int init_one(struct pci_dev *, const struct pci_device_id *); +static void shutdown_one(struct pci_dev *); + +#define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: " +#define PFX DRIVER_NAME ": " + +const struct pci_device_id hfi1_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); + +static struct pci_driver hfi1_pci_driver = { + .name = DRIVER_NAME, + .probe = init_one, + .remove = remove_one, + .shutdown = shutdown_one, + .id_table = hfi1_pci_tbl, + .err_handler = &hfi1_pci_err_handler, +}; + +static void __init compute_krcvqs(void) +{ + int i; + + for (i = 0; i < krcvqsset; i++) + n_krcvqs += krcvqs[i]; +} + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init hfi1_mod_init(void) +{ + int ret; + + ret = dev_init(); + if (ret) + goto bail; + + ret = node_affinity_init(); + if (ret) + goto bail; + + /* validate max MTU before any devices start */ + if (!valid_opa_max_mtu(hfi1_max_mtu)) { + pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", + hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); + hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; + } + /* valid CUs run from 1-128 in powers of 2 */ + if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) + hfi1_cu = 1; + /* valid credit return threshold is 0-100, variable is unsigned */ + if (user_credit_return_threshold > 100) + user_credit_return_threshold = 100; + + compute_krcvqs(); + /* + * sanitize receive interrupt count, time must wait until after + * the hardware type is known + */ + if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) + rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; + /* reject invalid combinations */ + if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { + pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); + rcv_intr_count = 1; + } + if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { + /* + * Avoid indefinite packet delivery by requiring a timeout + * if count is > 1. + */ + pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); + rcv_intr_timeout = 1; + } + if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { + /* + * The dynamic algorithm expects a non-zero timeout + * and a count > 1. + */ + pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); + rcv_intr_dynamic = 0; + } + + /* sanitize link CRC options */ + link_crc_mask &= SUPPORTED_CRCS; + + ret = opfn_init(); + if (ret < 0) { + pr_err("Failed to allocate opfn_wq"); + goto bail_dev; + } + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + hfi1_dbg_init(); + ret = pci_register_driver(&hfi1_pci_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + goto bail; /* all OK */ + +bail_dev: + hfi1_dbg_exit(); + dev_cleanup(); +bail: + return ret; +} + +module_init(hfi1_mod_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. + */ +static void __exit hfi1_mod_cleanup(void) +{ + pci_unregister_driver(&hfi1_pci_driver); + opfn_exit(); + node_affinity_destroy_all(); + hfi1_dbg_exit(); + + WARN_ON(!xa_empty(&hfi1_dev_table)); + dispose_firmware(); /* asymmetric with obtain_firmware() */ + dev_cleanup(); +} + +module_exit(hfi1_mod_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct hfi1_devdata *dd) +{ + int ctxt; + int pidx; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd = &dd->pport[pidx]; + struct cc_state *cc_state; + int i; + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; + + for (i = 0; i < OPA_MAX_SLS; i++) + hrtimer_cancel(&ppd->cca_timer[i].hrtimer); + + spin_lock(&ppd->cc_state_lock); + cc_state = get_cc_state_protected(ppd); + RCU_INIT_POINTER(ppd->cc_state, NULL); + spin_unlock(&ppd->cc_state_lock); + + if (cc_state) + kfree_rcu(cc_state, rcu); + } + + free_credit_return(dd); + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + */ + for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; + + if (rcd) { + hfi1_free_ctxt_rcv_groups(rcd); + hfi1_free_ctxt(rcd); + } + } + + kfree(dd->rcd); + dd->rcd = NULL; + + free_pio_map(dd); + /* must follow rcv context free - need to remove rcv's hooks */ + for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) + sc_free(dd->send_contexts[ctxt].sc); + dd->num_send_contexts = 0; + kfree(dd->send_contexts); + dd->send_contexts = NULL; + kfree(dd->hw_to_sw); + dd->hw_to_sw = NULL; + kfree(dd->boardname); + vfree(dd->events); + vfree(dd->status); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void postinit_cleanup(struct hfi1_devdata *dd) +{ + hfi1_start_cleanup(dd); + hfi1_comp_vectors_clean_up(dd); + hfi1_dev_affinity_clean_up(dd); + + hfi1_pcie_ddcleanup(dd); + hfi1_pcie_cleanup(dd->pcidev); + + cleanup_device_data(dd); + + hfi1_free_devdata(dd); +} + +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret = 0, j, pidx, initfail; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + + /* First, lock the non-writable module parameters */ + HFI1_CAP_LOCK(); + + /* Validate dev ids */ + if (!(ent->device == PCI_DEVICE_ID_INTEL0 || + ent->device == PCI_DEVICE_ID_INTEL1)) { + dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + goto bail; + } + + /* Allocate the dd so we can get to work */ + dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + goto bail; + } + + /* Validate some global module parameters */ + ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt); + if (ret) + goto bail; + + /* use the encoding function as a sanitization check */ + if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { + dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); + ret = -EINVAL; + goto bail; + } + + /* The receive eager buffer size must be set before the receive + * contexts are created. + * + * Set the eager buffer size. Validate that it falls in a range + * allowed by the hardware - all powers of 2 between the min and + * max. The maximum valid MTU is within the eager buffer range + * so we do not need to cap the max_mtu by an eager buffer size + * setting. + */ + if (eager_buffer_size) { + if (!is_power_of_2(eager_buffer_size)) + eager_buffer_size = + roundup_pow_of_two(eager_buffer_size); + eager_buffer_size = + clamp_val(eager_buffer_size, + MIN_EAGER_BUFFER * 8, + MAX_EAGER_BUFFER_TOTAL); + dd_dev_info(dd, "Eager buffer size %u\n", + eager_buffer_size); + } else { + dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); + ret = -EINVAL; + goto bail; + } + + /* restrict value of hfi1_rcvarr_split */ + hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); + + ret = hfi1_pcie_init(dd); + if (ret) + goto bail; + + /* + * Do device-specific initialization, function table setup, dd + * allocation, etc. + */ + ret = hfi1_init_dd(dd); + if (ret) + goto clean_bail; /* error already printed */ + + ret = create_workqueues(dd); + if (ret) + goto clean_bail; + + /* do the generic initialization */ + initfail = hfi1_init(dd, 0); + + ret = hfi1_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!initfail && !ret) { + dd->flags |= HFI1_INITTED; + /* create debufs files after init and ib register */ + hfi1_dbg_ibdev_init(&dd->verbs_dev); + } + + j = hfi1_device_create(dd); + if (j) + dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + + if (initfail || ret) { + msix_clean_up_interrupts(dd); + stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + hfi1_quiet_serdes(dd->pport + pidx); + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } + if (!j) + hfi1_device_remove(dd); + if (!ret) + hfi1_unregister_ib_device(dd); + postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; /* everything already cleaned */ + } + + sdma_start(dd); + + return 0; + +clean_bail: + hfi1_pcie_cleanup(pdev); +bail: + return ret; +} + +static void wait_for_clients(struct hfi1_devdata *dd) +{ + /* + * Remove the device init value and complete the device if there is + * no clients or wait for active clients to finish. + */ + if (refcount_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + wait_for_completion(&dd->user_comp); +} + +static void remove_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + /* close debugfs files before ib unregister */ + hfi1_dbg_ibdev_exit(&dd->verbs_dev); + + /* remove the /dev hfi1 interface */ + hfi1_device_remove(dd); + + /* wait for existing user space clients to finish */ + wait_for_clients(dd); + + /* unregister from IB core */ + hfi1_unregister_ib_device(dd); + + /* free netdev data */ + hfi1_free_rx(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + shutdown_device(dd); + destroy_workqueues(dd); + + stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + postinit_cleanup(dd); +} + +static void shutdown_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + shutdown_device(dd); +} + +/** + * hfi1_create_rcvhdrq - create a receive header queue + * @dd: the hfi1_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + unsigned amt; + + if (!rcd->rcvhdrq) { + amt = rcvhdrq_size(rcd); + + rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt, + &rcd->rcvhdrq_dma, + GFP_KERNEL); + + if (!rcd->rcvhdrq) { + dd_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || + HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) { + rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, + PAGE_SIZE, + &rcd->rcvhdrqtailaddr_dma, + GFP_KERNEL); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + } + } + + set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize, + rcd->rcvhdrq_cnt); + + return 0; + +bail_free: + dd_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_dma); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * hfi1_setup_eagerbufs - llocate eager buffers, both kernel and user + * contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 max_entries, egrtop, alloced_bytes = 0; + u16 order, idx = 0; + int ret = 0; + u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); + + /* + * The minimum size of the eager buffers is a groups of MTU-sized + * buffers. + * The global eager_buffer_size parameter is checked against the + * theoretical lower limit of the value. Here, we check against the + * MTU. + */ + if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size)) + rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size; + /* + * If using one-pkt-per-egr-buffer, lower the eager buffer + * size to the max MTU (page-aligned). + */ + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) + rcd->egrbufs.rcvtid_size = round_mtu; + + /* + * Eager buffers sizes of 1MB or less require smaller TID sizes + * to satisfy the "multiple of 8 RcvArray entries" requirement. + */ + if (rcd->egrbufs.size <= (1 << 20)) + rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, + rounddown_pow_of_two(rcd->egrbufs.size / 8)); + + while (alloced_bytes < rcd->egrbufs.size && + rcd->egrbufs.alloced < rcd->egrbufs.count) { + rcd->egrbufs.buffers[idx].addr = + dma_alloc_coherent(&dd->pcidev->dev, + rcd->egrbufs.rcvtid_size, + &rcd->egrbufs.buffers[idx].dma, + GFP_KERNEL); + if (rcd->egrbufs.buffers[idx].addr) { + rcd->egrbufs.buffers[idx].len = + rcd->egrbufs.rcvtid_size; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = + rcd->egrbufs.buffers[idx].addr; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma = + rcd->egrbufs.buffers[idx].dma; + rcd->egrbufs.alloced++; + alloced_bytes += rcd->egrbufs.rcvtid_size; + idx++; + } else { + u32 new_size, i, j; + u64 offset = 0; + + /* + * Fail the eager buffer allocation if: + * - we are already using the lowest acceptable size + * - we are using one-pkt-per-egr-buffer (this implies + * that we are accepting only one size) + */ + if (rcd->egrbufs.rcvtid_size == round_mtu || + !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { + dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", + rcd->ctxt); + ret = -ENOMEM; + goto bail_rcvegrbuf_phys; + } + + new_size = rcd->egrbufs.rcvtid_size / 2; + + /* + * If the first attempt to allocate memory failed, don't + * fail everything but continue with the next lower + * size. + */ + if (idx == 0) { + rcd->egrbufs.rcvtid_size = new_size; + continue; + } + + /* + * Re-partition already allocated buffers to a smaller + * size. + */ + rcd->egrbufs.alloced = 0; + for (i = 0, j = 0, offset = 0; j < idx; i++) { + if (i >= rcd->egrbufs.count) + break; + rcd->egrbufs.rcvtids[i].dma = + rcd->egrbufs.buffers[j].dma + offset; + rcd->egrbufs.rcvtids[i].addr = + rcd->egrbufs.buffers[j].addr + offset; + rcd->egrbufs.alloced++; + if ((rcd->egrbufs.buffers[j].dma + offset + + new_size) == + (rcd->egrbufs.buffers[j].dma + + rcd->egrbufs.buffers[j].len)) { + j++; + offset = 0; + } else { + offset += new_size; + } + } + rcd->egrbufs.rcvtid_size = new_size; + } + } + rcd->egrbufs.numbufs = idx; + rcd->egrbufs.size = alloced_bytes; + + hfi1_cdbg(PROC, + "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB", + rcd->ctxt, rcd->egrbufs.alloced, + rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); + + /* + * Set the contexts rcv array head update threshold to the closest + * power of 2 (so we can use a mask instead of modulo) below half + * the allocated entries. + */ + rcd->egrbufs.threshold = + rounddown_pow_of_two(rcd->egrbufs.alloced / 2); + /* + * Compute the expected RcvArray entry base. This is done after + * allocating the eager buffers in order to maximize the + * expected RcvArray entries for the context. + */ + max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size; + egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size); + rcd->expected_count = max_entries - egrtop; + if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2) + rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; + + rcd->expected_base = rcd->eager_base + egrtop; + hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u", + rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, + rcd->eager_base, rcd->expected_base); + + if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { + hfi1_cdbg(PROC, + "ctxt%u: current Eager buffer size is invalid %u", + rcd->ctxt, rcd->egrbufs.rcvtid_size); + ret = -EINVAL; + goto bail_rcvegrbuf_phys; + } + + for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { + hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, + rcd->egrbufs.rcvtids[idx].dma, order); + cond_resched(); + } + + return 0; + +bail_rcvegrbuf_phys: + for (idx = 0; idx < rcd->egrbufs.alloced && + rcd->egrbufs.buffers[idx].addr; + idx++) { + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[idx].len, + rcd->egrbufs.buffers[idx].addr, + rcd->egrbufs.buffers[idx].dma); + rcd->egrbufs.buffers[idx].addr = NULL; + rcd->egrbufs.buffers[idx].dma = 0; + rcd->egrbufs.buffers[idx].len = 0; + } + + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c new file mode 100644 index 0000000000..70376e6dba --- /dev/null +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/bitmap.h> + +#include "hfi.h" +#include "common.h" +#include "sdma.h" + +#define LINK_UP_DELAY 500 /* in microseconds */ + +static void set_mgmt_allowed(struct hfi1_pportdata *ppd) +{ + u32 frame; + struct hfi1_devdata *dd = ppd->dd; + + if (ppd->neighbor_type == NEIGHBOR_TYPE_HFI) { + ppd->mgmt_allowed = 1; + } else { + read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame); + ppd->mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) + & MGMT_ALLOWED_MASK; + } +} + +/* + * Our neighbor has indicated that we are allowed to act as a fabric + * manager, so place the full management partition key in the second + * (0-based) pkey array position. Note that we should already have + * the limited management partition key in array element 1, and also + * that the port is not yet up when add_full_mgmt_pkey() is invoked. + */ +static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* Sanity check - ppd->pkeys[2] should be 0, or already initialized */ + if (!((ppd->pkeys[2] == 0) || (ppd->pkeys[2] == FULL_MGMT_P_KEY))) + dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n", + __func__, ppd->pkeys[2], FULL_MGMT_P_KEY); + ppd->pkeys[2] = FULL_MGMT_P_KEY; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(ppd->dd, ppd->port); +} + +/** + * format_hwmsg - format a single hwerror message + * @msg: message buffer + * @msgl: length of message buffer + * @hwmsg: message to add to message buffer + */ +static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * hfi1_format_hwerrors - format hardware error messages for display + * @hwerrs: hardware errors bit vector + * @hwerrmsgs: hardware error descriptions + * @nhwerrmsgs: number of hwerrmsgs + * @msg: message buffer + * @msgl: message buffer length + */ +void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct hfi1_devdata *dd = ppd->dd; + + /* + * Only call ib_dispatch_event() if the IB device has been + * registered. HFI1_INITED is set iff the driver has successfully + * registered with the IB core. + */ + if (!(dd->flags & HFI1_INITTED)) + return; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +/** + * handle_linkup_change - finish linkup/down state changes + * @dd: valid device + * @linkup: link state information + * + * Handle a linkup or link down notification. + * The HW needs time to finish its link up state change. Give it that chance. + * + * This is called outside an interrupt. + * + */ +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + enum ib_event_type ev; + + if (!(ppd->linkup ^ !!linkup)) + return; /* no change, nothing to do */ + + if (linkup) { + /* + * Quick linkup and all link up on the simulator does not + * trigger or implement: + * - VerifyCap interrupt + * - VerifyCap frames + * But rather moves directly to LinkUp. + * + * Do the work of the VerifyCap interrupt handler, + * handle_verify_cap(), but do not try moving the state to + * LinkUp as we are already there. + * + * NOTE: This uses this device's vAU, vCU, and vl15_init for + * the remote values. Both sides must be using the values. + */ + if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + set_up_vau(dd, dd->vau); + set_up_vl15(dd, dd->vl15_init); + assign_remote_cm_au_table(dd, dd->vcu); + } + + ppd->neighbor_guid = + read_csr(dd, DC_DC8051_STS_REMOTE_GUID); + ppd->neighbor_type = + read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & + DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; + ppd->neighbor_port_number = + read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + ppd->neighbor_fm_security = + read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) & + DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK; + dd_dev_info(dd, + "Neighbor Guid %llx, Type %d, Port Num %d\n", + ppd->neighbor_guid, ppd->neighbor_type, + ppd->neighbor_port_number); + + /* HW needs LINK_UP_DELAY to settle, give it that chance */ + udelay(LINK_UP_DELAY); + + /* + * 'MgmtAllowed' information, which is exchanged during + * LNI, is available at this point. + */ + set_mgmt_allowed(ppd); + + if (ppd->mgmt_allowed) + add_full_mgmt_pkey(ppd); + + /* physical link went up */ + ppd->linkup = 1; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* link widths are not available until the link is fully up */ + get_linkup_link_widths(ppd); + + } else { + /* physical link went down */ + ppd->linkup = 0; + + /* clear HW details of the previous connection */ + ppd->actual_vls_operational = 0; + reset_link_credits(dd); + + /* freeze after a link down to guarantee a clean egress */ + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN); + + ev = IB_EVENT_PORT_ERR; + + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT); + + /* if we are down, the neighbor is down */ + ppd->neighbor_normal = 0; + + /* notify IB of the link change */ + signal_ib_event(ppd, ev); + } +} + +/* + * Handle receive or urgent interrupts for user contexts. This means a user + * process was waiting for a packet to arrive, and didn't want to poll. + */ +void handle_user_interrupt(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (bitmap_empty(rcd->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) + goto done; + + if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) { + wake_up_interruptible(&rcd->wait); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd); + } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG, + &rcd->event_flags)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } +done: + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c new file mode 100644 index 0000000000..1114898026 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "iowait.h" +#include "trace_iowait.h" + +/* 1 priority == 16 starve_cnt */ +#define IOWAIT_PRIORITY_STARVE_SHIFT 4 + +void iowait_set_flag(struct iowait *wait, u32 flag) +{ + trace_hfi1_iowait_set(wait, flag); + set_bit(flag, &wait->flags); +} + +bool iowait_flag_set(struct iowait *wait, u32 flag) +{ + return test_bit(flag, &wait->flags); +} + +inline void iowait_clear_flag(struct iowait *wait, u32 flag) +{ + trace_hfi1_iowait_clear(wait, flag); + clear_bit(flag, &wait->flags); +} + +/* + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)) +{ + int i; + + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; + wait->init_priority = init_priority; + wait->flags = 0; + for (i = 0; i < IOWAIT_SES; i++) { + wait->wait[i].iow = wait; + INIT_LIST_HEAD(&wait->wait[i].tx_head); + if (i == IOWAIT_IB_SE) + INIT_WORK(&wait->wait[i].iowork, func); + else + INIT_WORK(&wait->wait[i].iowork, tidfunc); + } +} + +/** + * iowait_cancel_work - cancel all work in iowait + * @w: the iowait struct + */ +void iowait_cancel_work(struct iowait *w) +{ + cancel_work_sync(&iowait_get_ib_work(w)->iowork); + /* Make sure that the iowork for TID RDMA is used */ + if (iowait_get_tid_work(w)->iowork.func) + cancel_work_sync(&iowait_get_tid_work(w)->iowork); +} + +/** + * iowait_set_work_flag - set work flag based on leg + * @w: the iowait work struct + */ +int iowait_set_work_flag(struct iowait_work *w) +{ + if (w == &w->iow->wait[IOWAIT_IB_SE]) { + iowait_set_flag(w->iow, IOWAIT_PENDING_IB); + return IOWAIT_IB_SE; + } + iowait_set_flag(w->iow, IOWAIT_PENDING_TID); + return IOWAIT_TID_SE; +} + +/** + * iowait_priority_update_top - update the top priority entry + * @w: the iowait struct + * @top: a pointer to the top priority entry + * @idx: the index of the current iowait in an array + * @top_idx: the array index for the iowait entry that has the top priority + * + * This function is called to compare the priority of a given + * iowait with the given top priority entry. The top index will + * be returned. + */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx) +{ + u8 cnt, tcnt; + + /* Convert priority into starve_cnt and compare the total.*/ + cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt; + tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + + top->starved_cnt; + if (cnt > tcnt) + return idx; + else + return top_idx; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h new file mode 100644 index 0000000000..4df0700cba --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -0,0 +1,457 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#ifndef _HFI1_IOWAIT_H +#define _HFI1_IOWAIT_H + +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/sched.h> + +#include "sdma_txreq.h" + +/* + * typedef (*restart_t)() - restart callback + * @work: pointer to work structure + */ +typedef void (*restart_t)(struct work_struct *work); + +#define IOWAIT_PENDING_IB 0x0 +#define IOWAIT_PENDING_TID 0x1 + +/* + * A QP can have multiple Send Engines (SEs). + * + * The current use case is for supporting a TID RDMA + * packet build/xmit mechanism independent from verbs. + */ +#define IOWAIT_SES 2 +#define IOWAIT_IB_SE 0 +#define IOWAIT_TID_SE 1 + +struct sdma_txreq; +struct sdma_engine; +/** + * @iowork: the work struct + * @tx_head: list of prebuilt packets + * @iow: the parent iowait structure + * + * This structure is the work item (process) specific + * details associated with the each of the two SEs of the + * QP. + * + * The workstruct and the queued TXs are unique to each + * SE. + */ +struct iowait; +struct iowait_work { + struct work_struct iowork; + struct list_head tx_head; + struct iowait *iow; +}; + +/** + * @list: used to add/insert into QP/PQ wait lists + * @tx_head: overflow list of sdma_txreq's + * @sleep: no space callback + * @wakeup: space callback wakeup + * @sdma_drained: sdma count drained + * @init_priority: callback to manipulate priority + * @lock: lock protected head of wait queue + * @iowork: workqueue overhead + * @wait_dma: wait for sdma_busy == 0 + * @wait_pio: wait for pio_busy == 0 + * @sdma_busy: # of packets in flight + * @count: total number of descriptors in tx_head'ed list + * @tx_limit: limit for overflow queuing + * @tx_count: number of tx entry's in tx_head'ed list + * @flags: wait flags (one per QP) + * @wait: SE array for multiple legs + * + * This is to be embedded in user's state structure + * (QP or PQ). + * + * The sleep and wakeup members are a + * bit misnamed. They do not strictly + * speaking sleep or wake up, but they + * are callbacks for the ULP to implement + * what ever queuing/dequeuing of + * the embedded iowait and its containing struct + * when a resource shortage like SDMA ring space + * or PIO credit space is seen. + * + * Both potentially have locks help + * so sleeping is not allowed and it is not + * supported to submit txreqs from the wakeup + * call directly because of lock conflicts. + * + * The wait_dma member along with the iow + * + * The lock field is used by waiters to record + * the seqlock_t that guards the list head. + * Waiters explicity know that, but the destroy + * code that unwaits QPs does not. + */ +struct iowait { + struct list_head list; + int (*sleep)( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent + ); + void (*wakeup)(struct iowait *wait, int reason); + void (*sdma_drained)(struct iowait *wait); + void (*init_priority)(struct iowait *wait); + seqlock_t *lock; + wait_queue_head_t wait_dma; + wait_queue_head_t wait_pio; + atomic_t sdma_busy; + atomic_t pio_busy; + u32 count; + u32 tx_limit; + u32 tx_count; + u8 starved_cnt; + u8 priority; + unsigned long flags; + struct iowait_work wait[IOWAIT_SES]; +}; + +#define SDMA_AVAIL_REASON 0 + +void iowait_set_flag(struct iowait *wait, u32 flag); +bool iowait_flag_set(struct iowait *wait, u32 flag); +void iowait_clear_flag(struct iowait *wait, u32 flag); + +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)); + +/** + * iowait_schedule() - schedule the default send engine work + * @wait: wait struct to schedule + * @wq: workqueue for schedule + * @cpu: cpu + */ +static inline bool iowait_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); +} + +/** + * iowait_tid_schedule - schedule the tid SE + * @wait: the iowait structure + * @wq: the work queue + * @cpu: the cpu + */ +static inline bool iowait_tid_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork); +} + +/** + * iowait_sdma_drain() - wait for DMAs to drain + * + * @wait: iowait structure + * + * This will delay until the iowait sdmas have + * completed. + */ +static inline void iowait_sdma_drain(struct iowait *wait) +{ + wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy)); +} + +/** + * iowait_sdma_pending() - return sdma pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_sdma_pending(struct iowait *wait) +{ + return atomic_read(&wait->sdma_busy); +} + +/** + * iowait_sdma_inc - note sdma io pending + * @wait: iowait structure + */ +static inline void iowait_sdma_inc(struct iowait *wait) +{ + atomic_inc(&wait->sdma_busy); +} + +/** + * iowait_sdma_add - add count to pending + * @wait: iowait structure + */ +static inline void iowait_sdma_add(struct iowait *wait, int count) +{ + atomic_add(count, &wait->sdma_busy); +} + +/** + * iowait_sdma_dec - note sdma complete + * @wait: iowait structure + */ +static inline int iowait_sdma_dec(struct iowait *wait) +{ + if (!wait) + return 0; + return atomic_dec_and_test(&wait->sdma_busy); +} + +/** + * iowait_pio_drain() - wait for pios to drain + * + * @wait: iowait structure + * + * This will delay until the iowait pios have + * completed. + */ +static inline void iowait_pio_drain(struct iowait *wait) +{ + wait_event_timeout(wait->wait_pio, + !atomic_read(&wait->pio_busy), + HZ); +} + +/** + * iowait_pio_pending() - return pio pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_pio_pending(struct iowait *wait) +{ + return atomic_read(&wait->pio_busy); +} + +/** + * iowait_pio_inc - note pio pending + * @wait: iowait structure + */ +static inline void iowait_pio_inc(struct iowait *wait) +{ + atomic_inc(&wait->pio_busy); +} + +/** + * iowait_pio_dec - note pio complete + * @wait: iowait structure + */ +static inline int iowait_pio_dec(struct iowait *wait) +{ + if (!wait) + return 0; + return atomic_dec_and_test(&wait->pio_busy); +} + +/** + * iowait_drain_wakeup() - trigger iowait_drain() waiter + * + * @wait: iowait structure + * + * This will trigger any waiters. + */ +static inline void iowait_drain_wakeup(struct iowait *wait) +{ + wake_up(&wait->wait_dma); + wake_up(&wait->wait_pio); + if (wait->sdma_drained) + wait->sdma_drained(wait); +} + +/** + * iowait_get_txhead() - get packet off of iowait list + * + * @wait: iowait_work structure + */ +static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&wait->tx_head)) { + tx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + } + return tx; +} + +static inline u16 iowait_get_desc(struct iowait_work *w) +{ + u16 num_desc = 0; + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + num_desc = tx->num_desc; + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; + } + return num_desc; +} + +static inline u32 iowait_get_all_desc(struct iowait *w) +{ + u32 num_desc = 0; + + num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]); + num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]); + return num_desc; +} + +static inline void iowait_update_priority(struct iowait_work *w) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; + } +} + +static inline void iowait_update_all_priority(struct iowait *w) +{ + iowait_update_priority(&w->wait[IOWAIT_IB_SE]); + iowait_update_priority(&w->wait[IOWAIT_TID_SE]); +} + +static inline void iowait_init_priority(struct iowait *w) +{ + w->priority = 0; + if (w->init_priority) + w->init_priority(w); +} + +static inline void iowait_get_priority(struct iowait *w) +{ + iowait_init_priority(w); + iowait_update_all_priority(w); +} + +/** + * iowait_queue - Put the iowait on a wait queue + * @pkts_sent: have some packets been sent before queuing? + * @w: the iowait struct + * @wait_head: the wait queue + * + * This function is called to insert an iowait struct into a + * wait queue after a resource (eg, sdma descriptor or pio + * buffer) is run out. + */ +static inline void iowait_queue(bool pkts_sent, struct iowait *w, + struct list_head *wait_head) +{ + /* + * To play fair, insert the iowait at the tail of the wait queue if it + * has already sent some packets; Otherwise, put it at the head. + * However, if it has priority packets to send, also put it at the + * head. + */ + if (pkts_sent) + w->starved_cnt = 0; + else + w->starved_cnt++; + + if (w->priority > 0 || !pkts_sent) + list_add(&w->list, wait_head); + else + list_add_tail(&w->list, wait_head); +} + +/** + * iowait_starve_clear - clear the wait queue's starve count + * @pkts_sent: have some packets been sent? + * @w: the iowait struct + * + * This function is called to clear the starve count. If no + * packets have been sent, the starve count will not be cleared. + */ +static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w) +{ + if (pkts_sent) + w->starved_cnt = 0; +} + +/* Update the top priority index */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx); + +/** + * iowait_packet_queued() - determine if a packet is queued + * @wait: the iowait_work structure + */ +static inline bool iowait_packet_queued(struct iowait_work *wait) +{ + return !list_empty(&wait->tx_head); +} + +/** + * inc_wait_count - increment wait counts + * @w: the log work struct + * @n: the count + */ +static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n) +{ + if (!w) + return; + w->iow->tx_count++; + w->iow->count += n; +} + +/** + * iowait_get_tid_work - return iowait_work for tid SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_tid_work(struct iowait *w) +{ + return &w->wait[IOWAIT_TID_SE]; +} + +/** + * iowait_get_ib_work - return iowait_work for ib SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_ib_work(struct iowait *w) +{ + return &w->wait[IOWAIT_IB_SE]; +} + +/** + * iowait_ioww_to_iow - return iowait given iowait_work + * @w: the iowait_work struct + */ +static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w) +{ + if (likely(w)) + return w->iow; + return NULL; +} + +void iowait_cancel_work(struct iowait *w); +int iowait_set_work_flag(struct iowait_work *w); + +#endif diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h new file mode 100644 index 0000000000..aec60d4888 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for IPOIB functionality + */ + +#ifndef HFI1_IPOIB_H +#define HFI1_IPOIB_H + +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/atomic.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/if_infiniband.h> + +#include "hfi.h" +#include "iowait.h" +#include "netdev.h" + +#include <rdma/ib_verbs.h> + +#define HFI1_IPOIB_ENTROPY_SHIFT 24 + +#define HFI1_IPOIB_TXREQ_NAME_LEN 32 + +#define HFI1_IPOIB_PSEUDO_LEN 20 +#define HFI1_IPOIB_ENCAP_LEN 4 + +struct hfi1_ipoib_dev_priv; + +union hfi1_ipoib_flow { + u16 as_int; + struct { + u8 tx_queue; + u8 sc5; + } __attribute__((__packed__)); +}; + +/** + * struct ipoib_txreq - IPOIB transmit descriptor + * @txreq: sdma transmit request + * @sdma_hdr: 9b ib headers + * @sdma_status: status returned by sdma engine + * @complete: non-zero implies complete + * @priv: ipoib netdev private data + * @txq: txq on which skb was output + * @skb: skb to send + */ +struct ipoib_txreq { + struct sdma_txreq txreq; + struct hfi1_sdma_header *sdma_hdr; + int sdma_status; + int complete; + struct hfi1_ipoib_dev_priv *priv; + struct hfi1_ipoib_txq *txq; + struct sk_buff *skb; +}; + +/** + * struct hfi1_ipoib_circ_buf - List of items to be processed + * @items: ring of items each a power of two size + * @max_items: max items + 1 that the ring can contain + * @shift: log2 of size for getting txreq + * @sent_txreqs: count of txreqs posted to sdma + * @tail: ring tail + * @stops: count of stops of queue + * @ring_full: ring has been filled + * @no_desc: descriptor shortage seen + * @complete_txreqs: count of txreqs completed by sdma + * @head: ring head + */ +struct hfi1_ipoib_circ_buf { + void *items; + u32 max_items; + u32 shift; + /* consumer cache line */ + u64 ____cacheline_aligned_in_smp sent_txreqs; + u32 avail; + u32 tail; + atomic_t stops; + atomic_t ring_full; + atomic_t no_desc; + /* producer cache line */ + u64 ____cacheline_aligned_in_smp complete_txreqs; + u32 head; +}; + +/** + * struct hfi1_ipoib_txq - IPOIB per Tx queue information + * @priv: private pointer + * @sde: sdma engine + * @tx_list: tx request list + * @sent_txreqs: count of txreqs posted to sdma + * @flow: tracks when list needs to be flushed for a flow change + * @q_idx: ipoib Tx queue index + * @pkts_sent: indicator packets have been sent from this queue + * @wait: iowait structure + * @napi: pointer to tx napi interface + * @tx_ring: ring of ipoib txreqs to be reaped by napi callback + */ +struct hfi1_ipoib_txq { + struct napi_struct napi; + struct hfi1_ipoib_dev_priv *priv; + struct sdma_engine *sde; + struct list_head tx_list; + union hfi1_ipoib_flow flow; + u8 q_idx; + bool pkts_sent; + struct iowait wait; + + struct hfi1_ipoib_circ_buf ____cacheline_aligned_in_smp tx_ring; +}; + +struct hfi1_ipoib_dev_priv { + struct hfi1_devdata *dd; + struct net_device *netdev; + struct ib_device *device; + struct hfi1_ipoib_txq *txqs; + const struct net_device_ops *netdev_ops; + struct rvt_qp *qp; + u32 qkey; + u16 pkey; + u16 pkey_index; + u8 port_num; +}; + +/* hfi1 ipoib rdma netdev's private data structure */ +struct hfi1_ipoib_rdma_netdev { + struct rdma_netdev rn; /* keep this first */ + /* followed by device private data */ + struct hfi1_ipoib_dev_priv dev_priv; +}; + +static inline struct hfi1_ipoib_dev_priv * +hfi1_ipoib_priv(const struct net_device *dev) +{ + return &((struct hfi1_ipoib_rdma_netdev *)netdev_priv(dev))->dev_priv; +} + +int hfi1_ipoib_send(struct net_device *dev, + struct sk_buff *skb, + struct ib_ah *address, + u32 dqpn); + +int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv); +void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv); + +int hfi1_ipoib_rxq_init(struct net_device *dev); +void hfi1_ipoib_rxq_deinit(struct net_device *dev); + +void hfi1_ipoib_napi_tx_enable(struct net_device *dev); +void hfi1_ipoib_napi_tx_disable(struct net_device *dev); + +struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, + int size, void *data); + +int hfi1_ipoib_rn_get_params(struct ib_device *device, + u32 port_num, + enum rdma_netdev_t type, + struct rdma_netdev_alloc_params *params); + +void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q); + +#endif /* _IPOIB_H */ diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c new file mode 100644 index 0000000000..5d814afdf7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for ipoib functionality + */ + +#include "ipoib.h" +#include "hfi.h" + +static u32 qpn_from_mac(const u8 *mac_arr) +{ + return (u32)mac_arr[1] << 16 | mac_arr[2] << 8 | mac_arr[3]; +} + +static int hfi1_ipoib_dev_init(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int ret; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = priv->netdev_ops->ndo_init(dev); + if (ret) + goto out_ret; + + ret = hfi1_netdev_add_data(priv->dd, + qpn_from_mac(priv->netdev->dev_addr), + dev); + if (ret < 0) { + priv->netdev_ops->ndo_uninit(dev); + goto out_ret; + } + + return 0; +out_ret: + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; +} + +static void hfi1_ipoib_dev_uninit(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + free_percpu(dev->tstats); + dev->tstats = NULL; + + hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); + + priv->netdev_ops->ndo_uninit(dev); +} + +static int hfi1_ipoib_dev_open(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int ret; + + ret = priv->netdev_ops->ndo_open(dev); + if (!ret) { + struct hfi1_ibport *ibp = to_iport(priv->device, + priv->port_num); + struct rvt_qp *qp; + u32 qpn = qpn_from_mac(priv->netdev->dev_addr); + + rcu_read_lock(); + qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn); + if (!qp) { + rcu_read_unlock(); + priv->netdev_ops->ndo_stop(dev); + return -EINVAL; + } + rvt_get_qp(qp); + priv->qp = qp; + rcu_read_unlock(); + + hfi1_netdev_enable_queues(priv->dd); + hfi1_ipoib_napi_tx_enable(dev); + } + + return ret; +} + +static int hfi1_ipoib_dev_stop(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + if (!priv->qp) + return 0; + + hfi1_ipoib_napi_tx_disable(dev); + hfi1_netdev_disable_queues(priv->dd); + + rvt_put_qp(priv->qp); + priv->qp = NULL; + + return priv->netdev_ops->ndo_stop(dev); +} + +static const struct net_device_ops hfi1_ipoib_netdev_ops = { + .ndo_init = hfi1_ipoib_dev_init, + .ndo_uninit = hfi1_ipoib_dev_uninit, + .ndo_open = hfi1_ipoib_dev_open, + .ndo_stop = hfi1_ipoib_dev_stop, + .ndo_get_stats64 = dev_get_tstats64, +}; + +static int hfi1_ipoib_mcast_attach(struct net_device *dev, + struct ib_device *device, + union ib_gid *mgid, + u16 mlid, + int set_qkey, + u32 qkey) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr); + struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num); + struct rvt_qp *qp; + int ret = -EINVAL; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn); + if (qp) { + rvt_get_qp(qp); + rcu_read_unlock(); + if (set_qkey) + priv->qkey = qkey; + + /* attach QP to multicast group */ + ret = ib_attach_mcast(&qp->ibqp, mgid, mlid); + rvt_put_qp(qp); + } else { + rcu_read_unlock(); + } + + return ret; +} + +static int hfi1_ipoib_mcast_detach(struct net_device *dev, + struct ib_device *device, + union ib_gid *mgid, + u16 mlid) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr); + struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num); + struct rvt_qp *qp; + int ret = -EINVAL; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn); + if (qp) { + rvt_get_qp(qp); + rcu_read_unlock(); + ret = ib_detach_mcast(&qp->ibqp, mgid, mlid); + rvt_put_qp(qp); + } else { + rcu_read_unlock(); + } + return ret; +} + +static void hfi1_ipoib_netdev_dtor(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + hfi1_ipoib_txreq_deinit(priv); + hfi1_ipoib_rxq_deinit(priv->netdev); + + free_percpu(dev->tstats); + dev->tstats = NULL; +} + +static void hfi1_ipoib_set_id(struct net_device *dev, int id) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + priv->pkey_index = (u16)id; + ib_query_pkey(priv->device, + priv->port_num, + priv->pkey_index, + &priv->pkey); +} + +static int hfi1_ipoib_setup_rn(struct ib_device *device, + u32 port_num, + struct net_device *netdev, + void *param) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + struct rdma_netdev *rn = netdev_priv(netdev); + struct hfi1_ipoib_dev_priv *priv; + int rc; + + rn->send = hfi1_ipoib_send; + rn->tx_timeout = hfi1_ipoib_tx_timeout; + rn->attach_mcast = hfi1_ipoib_mcast_attach; + rn->detach_mcast = hfi1_ipoib_mcast_detach; + rn->set_id = hfi1_ipoib_set_id; + rn->hca = device; + rn->port_num = port_num; + rn->mtu = netdev->mtu; + + priv = hfi1_ipoib_priv(netdev); + priv->dd = dd; + priv->netdev = netdev; + priv->device = device; + priv->port_num = port_num; + priv->netdev_ops = netdev->netdev_ops; + + ib_query_pkey(device, port_num, priv->pkey_index, &priv->pkey); + + rc = hfi1_ipoib_txreq_init(priv); + if (rc) { + dd_dev_err(dd, "IPoIB netdev TX init - failed(%d)\n", rc); + return rc; + } + + rc = hfi1_ipoib_rxq_init(netdev); + if (rc) { + dd_dev_err(dd, "IPoIB netdev RX init - failed(%d)\n", rc); + hfi1_ipoib_txreq_deinit(priv); + return rc; + } + + netdev->netdev_ops = &hfi1_ipoib_netdev_ops; + + netdev->priv_destructor = hfi1_ipoib_netdev_dtor; + netdev->needs_free_netdev = true; + + return 0; +} + +int hfi1_ipoib_rn_get_params(struct ib_device *device, + u32 port_num, + enum rdma_netdev_t type, + struct rdma_netdev_alloc_params *params) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + + if (type != RDMA_NETDEV_IPOIB) + return -EOPNOTSUPP; + + if (!HFI1_CAP_IS_KSET(AIP) || !dd->num_netdev_contexts) + return -EOPNOTSUPP; + + if (!port_num || port_num > dd->num_pports) + return -EINVAL; + + params->sizeof_priv = sizeof(struct hfi1_ipoib_rdma_netdev); + params->txqs = dd->num_sdma; + params->rxqs = dd->num_netdev_contexts; + params->param = NULL; + params->initialize_rdma_netdev = hfi1_ipoib_setup_rn; + + return 0; +} diff --git a/drivers/infiniband/hw/hfi1/ipoib_rx.c b/drivers/infiniband/hw/hfi1/ipoib_rx.c new file mode 100644 index 0000000000..629691a572 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_rx.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +#include "netdev.h" +#include "ipoib.h" + +#define HFI1_IPOIB_SKB_PAD ((NET_SKB_PAD) + (NET_IP_ALIGN)) + +static void copy_ipoib_buf(struct sk_buff *skb, void *data, int size) +{ + skb_checksum_none_assert(skb); + skb->protocol = *((__be16 *)data); + + skb_put_data(skb, data, size); + skb->mac_header = HFI1_IPOIB_PSEUDO_LEN; + skb_pull(skb, HFI1_IPOIB_ENCAP_LEN); +} + +static struct sk_buff *prepare_frag_skb(struct napi_struct *napi, int size) +{ + struct sk_buff *skb; + int skb_size = SKB_DATA_ALIGN(size + HFI1_IPOIB_SKB_PAD); + void *frag; + + skb_size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + skb_size = SKB_DATA_ALIGN(skb_size); + frag = napi_alloc_frag(skb_size); + + if (unlikely(!frag)) + return napi_alloc_skb(napi, size); + + skb = build_skb(frag, skb_size); + + if (unlikely(!skb)) { + skb_free_frag(frag); + return NULL; + } + + skb_reserve(skb, HFI1_IPOIB_SKB_PAD); + return skb; +} + +struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, + int size, void *data) +{ + struct napi_struct *napi = &rxq->napi; + int skb_size = size + HFI1_IPOIB_ENCAP_LEN; + struct sk_buff *skb; + + /* + * For smaller(4k + skb overhead) allocations we will go using + * napi cache. Otherwise we will try to use napi frag cache. + */ + if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE)) + skb = napi_alloc_skb(napi, skb_size); + else + skb = prepare_frag_skb(napi, skb_size); + + if (unlikely(!skb)) + return NULL; + + copy_ipoib_buf(skb, data, size); + + return skb; +} + +int hfi1_ipoib_rxq_init(struct net_device *netdev) +{ + struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev); + struct hfi1_devdata *dd = ipoib_priv->dd; + int ret; + + ret = hfi1_netdev_rx_init(dd); + if (ret) + return ret; + + hfi1_init_aip_rsm(dd); + + return ret; +} + +void hfi1_ipoib_rxq_deinit(struct net_device *netdev) +{ + struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev); + struct hfi1_devdata *dd = ipoib_priv->dd; + + hfi1_deinit_aip_rsm(dd); + hfi1_netdev_rx_destroy(dd); +} diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c new file mode 100644 index 0000000000..e7d8313302 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -0,0 +1,868 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for IPOIB SDMA functionality + */ + +#include <linux/log2.h> +#include <linux/circ_buf.h> + +#include "sdma.h" +#include "verbs.h" +#include "trace_ibhdrs.h" +#include "ipoib.h" +#include "trace_tx.h" + +/* Add a convenience helper */ +#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1)) +#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) +#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) + +struct ipoib_txparms { + struct hfi1_devdata *dd; + struct rdma_ah_attr *ah_attr; + struct hfi1_ibport *ibp; + struct hfi1_ipoib_txq *txq; + union hfi1_ipoib_flow flow; + u32 dqpn; + u8 hdr_dwords; + u8 entropy; +}; + +static struct ipoib_txreq * +hfi1_txreq_from_idx(struct hfi1_ipoib_circ_buf *r, u32 idx) +{ + return (struct ipoib_txreq *)(r->items + (idx << r->shift)); +} + +static u32 hfi1_ipoib_txreqs(const u64 sent, const u64 completed) +{ + return sent - completed; +} + +static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq) +{ + return hfi1_ipoib_txreqs(txq->tx_ring.sent_txreqs, + txq->tx_ring.complete_txreqs); +} + +static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq) +{ + trace_hfi1_txq_stop(txq); + if (atomic_inc_return(&txq->tx_ring.stops) == 1) + netif_stop_subqueue(txq->priv->netdev, txq->q_idx); +} + +static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq) +{ + trace_hfi1_txq_wake(txq); + if (atomic_dec_and_test(&txq->tx_ring.stops)) + netif_wake_subqueue(txq->priv->netdev, txq->q_idx); +} + +static uint hfi1_ipoib_ring_hwat(struct hfi1_ipoib_txq *txq) +{ + return min_t(uint, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items - 1); +} + +static uint hfi1_ipoib_ring_lwat(struct hfi1_ipoib_txq *txq) +{ + return min_t(uint, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items) >> 1; +} + +static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) +{ + ++txq->tx_ring.sent_txreqs; + if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) && + !atomic_xchg(&txq->tx_ring.ring_full, 1)) { + trace_hfi1_txq_full(txq); + hfi1_ipoib_stop_txq(txq); + } +} + +static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) +{ + struct net_device *dev = txq->priv->netdev; + + /* If shutting down just return as queue state is irrelevant */ + if (unlikely(dev->reg_state != NETREG_REGISTERED)) + return; + + /* + * When the queue has been drained to less than half full it will be + * restarted. + * The size of the txreq ring is fixed at initialization. + * The tx queue len can be adjusted upward while the interface is + * running. + * The tx queue len can be large enough to overflow the txreq_ring. + * Use the minimum of the current tx_queue_len or the rings max txreqs + * to protect against ring overflow. + */ + if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) && + atomic_xchg(&txq->tx_ring.ring_full, 0)) { + trace_hfi1_txq_xmit_unstopped(txq); + hfi1_ipoib_wake_txq(txq); + } +} + +static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) +{ + struct hfi1_ipoib_dev_priv *priv = tx->txq->priv; + + if (likely(!tx->sdma_status)) { + dev_sw_netstats_tx_add(priv->netdev, 1, tx->skb->len); + } else { + ++priv->netdev->stats.tx_errors; + dd_dev_warn(priv->dd, + "%s: Status = 0x%x pbc 0x%llx txq = %d sde = %d\n", + __func__, tx->sdma_status, + le64_to_cpu(tx->sdma_hdr->pbc), tx->txq->q_idx, + tx->txq->sde->this_idx); + } + + napi_consume_skb(tx->skb, budget); + tx->skb = NULL; + sdma_txclean(priv->dd, &tx->txreq); +} + +static void hfi1_ipoib_drain_tx_ring(struct hfi1_ipoib_txq *txq) +{ + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; + int i; + struct ipoib_txreq *tx; + + for (i = 0; i < tx_ring->max_items; i++) { + tx = hfi1_txreq_from_idx(tx_ring, i); + tx->complete = 0; + dev_kfree_skb_any(tx->skb); + tx->skb = NULL; + sdma_txclean(txq->priv->dd, &tx->txreq); + } + tx_ring->head = 0; + tx_ring->tail = 0; + tx_ring->complete_txreqs = 0; + tx_ring->sent_txreqs = 0; + tx_ring->avail = hfi1_ipoib_ring_hwat(txq); +} + +static int hfi1_ipoib_poll_tx_ring(struct napi_struct *napi, int budget) +{ + struct hfi1_ipoib_txq *txq = + container_of(napi, struct hfi1_ipoib_txq, napi); + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; + u32 head = tx_ring->head; + u32 max_tx = tx_ring->max_items; + int work_done; + struct ipoib_txreq *tx = hfi1_txreq_from_idx(tx_ring, head); + + trace_hfi1_txq_poll(txq); + for (work_done = 0; work_done < budget; work_done++) { + /* See hfi1_ipoib_sdma_complete() */ + if (!smp_load_acquire(&tx->complete)) + break; + tx->complete = 0; + trace_hfi1_tx_produce(tx, head); + hfi1_ipoib_free_tx(tx, budget); + head = CIRC_NEXT(head, max_tx); + tx = hfi1_txreq_from_idx(tx_ring, head); + } + tx_ring->complete_txreqs += work_done; + + /* Finished freeing tx items so store the head value. */ + smp_store_release(&tx_ring->head, head); + + hfi1_ipoib_check_queue_stopped(txq); + + if (work_done < budget) + napi_complete_done(napi, work_done); + + return work_done; +} + +static void hfi1_ipoib_sdma_complete(struct sdma_txreq *txreq, int status) +{ + struct ipoib_txreq *tx = container_of(txreq, struct ipoib_txreq, txreq); + + trace_hfi1_txq_complete(tx->txq); + tx->sdma_status = status; + /* see hfi1_ipoib_poll_tx_ring */ + smp_store_release(&tx->complete, 1); + napi_schedule_irqoff(&tx->txq->napi); +} + +static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx, + struct ipoib_txparms *txp) +{ + struct hfi1_devdata *dd = txp->dd; + struct sdma_txreq *txreq = &tx->txreq; + struct sk_buff *skb = tx->skb; + int ret = 0; + int i; + + if (skb_headlen(skb)) { + ret = sdma_txadd_kvaddr(dd, txreq, skb->data, skb_headlen(skb)); + if (unlikely(ret)) + return ret; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ret = sdma_txadd_page(dd, + txreq, + skb_frag_page(frag), + frag->bv_offset, + skb_frag_size(frag), + NULL, NULL, NULL); + if (unlikely(ret)) + break; + } + + return ret; +} + +static int hfi1_ipoib_build_tx_desc(struct ipoib_txreq *tx, + struct ipoib_txparms *txp) +{ + struct hfi1_devdata *dd = txp->dd; + struct sdma_txreq *txreq = &tx->txreq; + struct hfi1_sdma_header *sdma_hdr = tx->sdma_hdr; + u16 pkt_bytes = + sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2) + tx->skb->len; + int ret; + + ret = sdma_txinit(txreq, 0, pkt_bytes, hfi1_ipoib_sdma_complete); + if (unlikely(ret)) + return ret; + + /* add pbc + headers */ + ret = sdma_txadd_kvaddr(dd, + txreq, + sdma_hdr, + sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2)); + if (unlikely(ret)) + return ret; + + /* add the ulp payload */ + return hfi1_ipoib_build_ulp_payload(tx, txp); +} + +static void hfi1_ipoib_build_ib_tx_headers(struct ipoib_txreq *tx, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_dev_priv *priv = tx->txq->priv; + struct hfi1_sdma_header *sdma_hdr = tx->sdma_hdr; + struct sk_buff *skb = tx->skb; + struct hfi1_pportdata *ppd = ppd_from_ibp(txp->ibp); + struct rdma_ah_attr *ah_attr = txp->ah_attr; + struct ib_other_headers *ohdr; + struct ib_grh *grh; + u16 dwords; + u16 slid; + u16 dlid; + u16 lrh0; + u32 bth0; + u32 sqpn = (u32)(priv->netdev->dev_addr[1] << 16 | + priv->netdev->dev_addr[2] << 8 | + priv->netdev->dev_addr[3]); + u16 payload_dwords; + u8 pad_cnt; + + pad_cnt = -skb->len & 3; + + /* Includes ICRC */ + payload_dwords = ((skb->len + pad_cnt) >> 2) + SIZE_OF_CRC; + + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + txp->hdr_dwords = 7; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &sdma_hdr->hdr.ibh.u.l.grh; + txp->hdr_dwords += + hfi1_make_grh(txp->ibp, + grh, + rdma_ah_read_grh(ah_attr), + txp->hdr_dwords - LRH_9B_DWORDS, + payload_dwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &sdma_hdr->hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &sdma_hdr->hdr.ibh.u.oth; + } + + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + lrh0 |= (txp->flow.sc5 & 0xf) << 12; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + + /* Includes ICRC */ + dwords = txp->hdr_dwords + payload_dwords; + + /* Build the lrh */ + sdma_hdr->hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&sdma_hdr->hdr.ibh, lrh0, dwords, dlid, slid); + + /* Build the bth */ + bth0 = (IB_OPCODE_UD_SEND_ONLY << 24) | (pad_cnt << 20) | priv->pkey; + + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(txp->dqpn); + ohdr->bth[2] = cpu_to_be32(mask_psn((u32)txp->txq->tx_ring.sent_txreqs)); + + /* Build the deth */ + ohdr->u.ud.deth[0] = cpu_to_be32(priv->qkey); + ohdr->u.ud.deth[1] = cpu_to_be32((txp->entropy << + HFI1_IPOIB_ENTROPY_SHIFT) | sqpn); + + /* Construct the pbc. */ + sdma_hdr->pbc = + cpu_to_le64(create_pbc(ppd, + ib_is_sc5(txp->flow.sc5) << + PBC_DC_INFO_SHIFT, + 0, + sc_to_vlt(priv->dd, txp->flow.sc5), + dwords - SIZE_OF_CRC + + (sizeof(sdma_hdr->pbc) >> 2))); +} + +static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev, + struct sk_buff *skb, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + struct hfi1_ipoib_txq *txq = txp->txq; + struct ipoib_txreq *tx; + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; + u32 tail = tx_ring->tail; + int ret; + + if (unlikely(!tx_ring->avail)) { + u32 head; + + if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq)) + /* This shouldn't happen with a stopped queue */ + return ERR_PTR(-ENOMEM); + /* See hfi1_ipoib_poll_tx_ring() */ + head = smp_load_acquire(&tx_ring->head); + tx_ring->avail = + min_t(u32, hfi1_ipoib_ring_hwat(txq), + CIRC_CNT(head, tail, tx_ring->max_items)); + } else { + tx_ring->avail--; + } + tx = hfi1_txreq_from_idx(tx_ring, tail); + trace_hfi1_txq_alloc_tx(txq); + + /* so that we can test if the sdma descriptors are there */ + tx->txreq.num_desc = 0; + tx->txq = txq; + tx->skb = skb; + INIT_LIST_HEAD(&tx->txreq.list); + + hfi1_ipoib_build_ib_tx_headers(tx, txp); + + ret = hfi1_ipoib_build_tx_desc(tx, txp); + if (likely(!ret)) { + if (txq->flow.as_int != txp->flow.as_int) { + txq->flow.tx_queue = txp->flow.tx_queue; + txq->flow.sc5 = txp->flow.sc5; + txq->sde = + sdma_select_engine_sc(priv->dd, + txp->flow.tx_queue, + txp->flow.sc5); + trace_hfi1_flow_switch(txq); + } + + return tx; + } + + sdma_txclean(priv->dd, &tx->txreq); + + return ERR_PTR(ret); +} + +static int hfi1_ipoib_submit_tx_list(struct net_device *dev, + struct hfi1_ipoib_txq *txq) +{ + int ret; + u16 count_out; + + ret = sdma_send_txlist(txq->sde, + iowait_get_ib_work(&txq->wait), + &txq->tx_list, + &count_out); + if (likely(!ret) || ret == -EBUSY || ret == -ECOMM) + return ret; + + dd_dev_warn(txq->priv->dd, "cannot send skb tx list, err %d.\n", ret); + + return ret; +} + +static int hfi1_ipoib_flush_tx_list(struct net_device *dev, + struct hfi1_ipoib_txq *txq) +{ + int ret = 0; + + if (!list_empty(&txq->tx_list)) { + /* Flush the current list */ + ret = hfi1_ipoib_submit_tx_list(dev, txq); + + if (unlikely(ret)) + if (ret != -EBUSY) + ++dev->stats.tx_carrier_errors; + } + + return ret; +} + +static int hfi1_ipoib_submit_tx(struct hfi1_ipoib_txq *txq, + struct ipoib_txreq *tx) +{ + int ret; + + ret = sdma_send_txreq(txq->sde, + iowait_get_ib_work(&txq->wait), + &tx->txreq, + txq->pkts_sent); + if (likely(!ret)) { + txq->pkts_sent = true; + iowait_starve_clear(txq->pkts_sent, &txq->wait); + } + + return ret; +} + +static int hfi1_ipoib_send_dma_single(struct net_device *dev, + struct sk_buff *skb, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_txq *txq = txp->txq; + struct hfi1_ipoib_circ_buf *tx_ring; + struct ipoib_txreq *tx; + int ret; + + tx = hfi1_ipoib_send_dma_common(dev, skb, txp); + if (IS_ERR(tx)) { + int ret = PTR_ERR(tx); + + dev_kfree_skb_any(skb); + + if (ret == -ENOMEM) + ++dev->stats.tx_errors; + else + ++dev->stats.tx_carrier_errors; + + return NETDEV_TX_OK; + } + + tx_ring = &txq->tx_ring; + trace_hfi1_tx_consume(tx, tx_ring->tail); + /* consume tx */ + smp_store_release(&tx_ring->tail, CIRC_NEXT(tx_ring->tail, tx_ring->max_items)); + ret = hfi1_ipoib_submit_tx(txq, tx); + if (likely(!ret)) { +tx_ok: + trace_sdma_output_ibhdr(txq->priv->dd, + &tx->sdma_hdr->hdr, + ib_is_sc5(txp->flow.sc5)); + hfi1_ipoib_check_queue_depth(txq); + return NETDEV_TX_OK; + } + + txq->pkts_sent = false; + + if (ret == -EBUSY || ret == -ECOMM) + goto tx_ok; + + /* mark complete and kick napi tx */ + smp_store_release(&tx->complete, 1); + napi_schedule(&tx->txq->napi); + + ++dev->stats.tx_carrier_errors; + + return NETDEV_TX_OK; +} + +static int hfi1_ipoib_send_dma_list(struct net_device *dev, + struct sk_buff *skb, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_txq *txq = txp->txq; + struct hfi1_ipoib_circ_buf *tx_ring; + struct ipoib_txreq *tx; + + /* Has the flow change ? */ + if (txq->flow.as_int != txp->flow.as_int) { + int ret; + + trace_hfi1_flow_flush(txq); + ret = hfi1_ipoib_flush_tx_list(dev, txq); + if (unlikely(ret)) { + if (ret == -EBUSY) + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + } + tx = hfi1_ipoib_send_dma_common(dev, skb, txp); + if (IS_ERR(tx)) { + int ret = PTR_ERR(tx); + + dev_kfree_skb_any(skb); + + if (ret == -ENOMEM) + ++dev->stats.tx_errors; + else + ++dev->stats.tx_carrier_errors; + + return NETDEV_TX_OK; + } + + tx_ring = &txq->tx_ring; + trace_hfi1_tx_consume(tx, tx_ring->tail); + /* consume tx */ + smp_store_release(&tx_ring->tail, CIRC_NEXT(tx_ring->tail, tx_ring->max_items)); + list_add_tail(&tx->txreq.list, &txq->tx_list); + + hfi1_ipoib_check_queue_depth(txq); + + trace_sdma_output_ibhdr(txq->priv->dd, + &tx->sdma_hdr->hdr, + ib_is_sc5(txp->flow.sc5)); + + if (!netdev_xmit_more()) + (void)hfi1_ipoib_flush_tx_list(dev, txq); + + return NETDEV_TX_OK; +} + +static u8 hfi1_ipoib_calc_entropy(struct sk_buff *skb) +{ + if (skb_transport_header_was_set(skb)) { + u8 *hdr = (u8 *)skb_transport_header(skb); + + return (hdr[0] ^ hdr[1] ^ hdr[2] ^ hdr[3]); + } + + return (u8)skb_get_queue_mapping(skb); +} + +int hfi1_ipoib_send(struct net_device *dev, + struct sk_buff *skb, + struct ib_ah *address, + u32 dqpn) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + struct ipoib_txparms txp; + struct rdma_netdev *rn = netdev_priv(dev); + + if (unlikely(skb->len > rn->mtu + HFI1_IPOIB_ENCAP_LEN)) { + dd_dev_warn(priv->dd, "packet len %d (> %d) too long to send, dropping\n", + skb->len, + rn->mtu + HFI1_IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + txp.dd = priv->dd; + txp.ah_attr = &ibah_to_rvtah(address)->attr; + txp.ibp = to_iport(priv->device, priv->port_num); + txp.txq = &priv->txqs[skb_get_queue_mapping(skb)]; + txp.dqpn = dqpn; + txp.flow.sc5 = txp.ibp->sl_to_sc[rdma_ah_get_sl(txp.ah_attr)]; + txp.flow.tx_queue = (u8)skb_get_queue_mapping(skb); + txp.entropy = hfi1_ipoib_calc_entropy(skb); + + if (netdev_xmit_more() || !list_empty(&txp.txq->tx_list)) + return hfi1_ipoib_send_dma_list(dev, skb, &txp); + + return hfi1_ipoib_send_dma_single(dev, skb, &txp); +} + +/* + * hfi1_ipoib_sdma_sleep - ipoib sdma sleep function + * + * This function gets called from sdma_send_txreq() when there are not enough + * sdma descriptors available to send the packet. It adds Tx queue's wait + * structure to sdma engine's dmawait list to be woken up when descriptors + * become available. + */ +static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent) +{ + struct hfi1_ipoib_txq *txq = + container_of(wait->iow, struct hfi1_ipoib_txq, wait); + + write_seqlock(&sde->waitlock); + + if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED)) { + if (sdma_progress(sde, seq, txreq)) { + write_sequnlock(&sde->waitlock); + return -EAGAIN; + } + + if (list_empty(&txreq->list)) + /* came from non-list submit */ + list_add_tail(&txreq->list, &txq->tx_list); + if (list_empty(&txq->wait.list)) { + struct hfi1_ibport *ibp = &sde->ppd->ibport_data; + + if (!atomic_xchg(&txq->tx_ring.no_desc, 1)) { + trace_hfi1_txq_queued(txq); + hfi1_ipoib_stop_txq(txq); + } + ibp->rvp.n_dmawait++; + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } + + write_sequnlock(&sde->waitlock); + return -EBUSY; + } + + write_sequnlock(&sde->waitlock); + return -EINVAL; +} + +/* + * hfi1_ipoib_sdma_wakeup - ipoib sdma wakeup function + * + * This function gets called when SDMA descriptors becomes available and Tx + * queue's wait structure was previously added to sdma engine's dmawait list. + */ +static void hfi1_ipoib_sdma_wakeup(struct iowait *wait, int reason) +{ + struct hfi1_ipoib_txq *txq = + container_of(wait, struct hfi1_ipoib_txq, wait); + + trace_hfi1_txq_wakeup(txq); + if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED)) + iowait_schedule(wait, system_highpri_wq, WORK_CPU_UNBOUND); +} + +static void hfi1_ipoib_flush_txq(struct work_struct *work) +{ + struct iowait_work *ioww = + container_of(work, struct iowait_work, iowork); + struct iowait *wait = iowait_ioww_to_iow(ioww); + struct hfi1_ipoib_txq *txq = + container_of(wait, struct hfi1_ipoib_txq, wait); + struct net_device *dev = txq->priv->netdev; + + if (likely(dev->reg_state == NETREG_REGISTERED) && + likely(!hfi1_ipoib_flush_tx_list(dev, txq))) + if (atomic_xchg(&txq->tx_ring.no_desc, 0)) + hfi1_ipoib_wake_txq(txq); +} + +int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) +{ + struct net_device *dev = priv->netdev; + u32 tx_ring_size, tx_item_size; + struct hfi1_ipoib_circ_buf *tx_ring; + int i, j; + + /* + * Ring holds 1 less than tx_ring_size + * Round up to next power of 2 in order to hold at least tx_queue_len + */ + tx_ring_size = roundup_pow_of_two(dev->tx_queue_len + 1); + tx_item_size = roundup_pow_of_two(sizeof(struct ipoib_txreq)); + + priv->txqs = kcalloc_node(dev->num_tx_queues, + sizeof(struct hfi1_ipoib_txq), + GFP_KERNEL, + priv->dd->node); + if (!priv->txqs) + return -ENOMEM; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + struct ipoib_txreq *tx; + + tx_ring = &txq->tx_ring; + iowait_init(&txq->wait, + 0, + hfi1_ipoib_flush_txq, + NULL, + hfi1_ipoib_sdma_sleep, + hfi1_ipoib_sdma_wakeup, + NULL, + NULL); + txq->priv = priv; + txq->sde = NULL; + INIT_LIST_HEAD(&txq->tx_list); + atomic_set(&txq->tx_ring.stops, 0); + atomic_set(&txq->tx_ring.ring_full, 0); + atomic_set(&txq->tx_ring.no_desc, 0); + txq->q_idx = i; + txq->flow.tx_queue = 0xff; + txq->flow.sc5 = 0xff; + txq->pkts_sent = false; + + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), + priv->dd->node); + + txq->tx_ring.items = + kvzalloc_node(array_size(tx_ring_size, tx_item_size), + GFP_KERNEL, priv->dd->node); + if (!txq->tx_ring.items) + goto free_txqs; + + txq->tx_ring.max_items = tx_ring_size; + txq->tx_ring.shift = ilog2(tx_item_size); + txq->tx_ring.avail = hfi1_ipoib_ring_hwat(txq); + tx_ring = &txq->tx_ring; + for (j = 0; j < tx_ring_size; j++) { + hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr = + kzalloc_node(sizeof(*tx->sdma_hdr), + GFP_KERNEL, priv->dd->node); + if (!hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr) + goto free_txqs; + } + + netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring); + } + + return 0; + +free_txqs: + for (i--; i >= 0; i--) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + netif_napi_del(&txq->napi); + tx_ring = &txq->tx_ring; + for (j = 0; j < tx_ring_size; j++) + kfree(hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr); + kvfree(tx_ring->items); + } + + kfree(priv->txqs); + priv->txqs = NULL; + return -ENOMEM; +} + +static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq) +{ + struct sdma_txreq *txreq; + struct sdma_txreq *txreq_tmp; + + list_for_each_entry_safe(txreq, txreq_tmp, &txq->tx_list, list) { + struct ipoib_txreq *tx = + container_of(txreq, struct ipoib_txreq, txreq); + + list_del(&txreq->list); + sdma_txclean(txq->priv->dd, &tx->txreq); + dev_kfree_skb_any(tx->skb); + tx->skb = NULL; + txq->tx_ring.complete_txreqs++; + } + + if (hfi1_ipoib_used(txq)) + dd_dev_warn(txq->priv->dd, + "txq %d not empty found %u requests\n", + txq->q_idx, + hfi1_ipoib_txreqs(txq->tx_ring.sent_txreqs, + txq->tx_ring.complete_txreqs)); +} + +void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv) +{ + int i, j; + + for (i = 0; i < priv->netdev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; + + iowait_cancel_work(&txq->wait); + iowait_sdma_drain(&txq->wait); + hfi1_ipoib_drain_tx_list(txq); + netif_napi_del(&txq->napi); + hfi1_ipoib_drain_tx_ring(txq); + for (j = 0; j < tx_ring->max_items; j++) + kfree(hfi1_txreq_from_idx(tx_ring, j)->sdma_hdr); + kvfree(tx_ring->items); + } + + kfree(priv->txqs); + priv->txqs = NULL; +} + +void hfi1_ipoib_napi_tx_enable(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + napi_enable(&txq->napi); + } +} + +void hfi1_ipoib_napi_tx_disable(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + napi_disable(&txq->napi); + hfi1_ipoib_drain_tx_ring(txq); + } +} + +void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + struct hfi1_ipoib_txq *txq = &priv->txqs[q]; + + dd_dev_info(priv->dd, "timeout txq %p q %u stopped %u stops %d no_desc %d ring_full %d\n", + txq, q, + __netif_subqueue_stopped(dev, txq->q_idx), + atomic_read(&txq->tx_ring.stops), + atomic_read(&txq->tx_ring.no_desc), + atomic_read(&txq->tx_ring.ring_full)); + dd_dev_info(priv->dd, "sde %p engine %u\n", + txq->sde, + txq->sde ? txq->sde->this_idx : 0); + dd_dev_info(priv->dd, "flow %x\n", txq->flow.as_int); + dd_dev_info(priv->dd, "sent %llu completed %llu used %llu\n", + txq->tx_ring.sent_txreqs, txq->tx_ring.complete_txreqs, + hfi1_ipoib_used(txq)); + dd_dev_info(priv->dd, "tx_queue_len %u max_items %u\n", + dev->tx_queue_len, txq->tx_ring.max_items); + dd_dev_info(priv->dd, "head %u tail %u\n", + txq->tx_ring.head, txq->tx_ring.tail); + dd_dev_info(priv->dd, "wait queued %u\n", + !list_empty(&txq->wait.list)); + dd_dev_info(priv->dd, "tx_list empty %u\n", + list_empty(&txq->tx_list)); +} + diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c new file mode 100644 index 0000000000..e5e783c458 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -0,0 +1,4896 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015-2018 Intel Corporation. + */ + +#include <linux/net.h> +#include <rdma/opa_addr.h> +#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ + / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) + +#include "hfi.h" +#include "mad.h" +#include "trace.h" +#include "qp.h" +#include "vnic.h" + +/* the reset value from the FM is supposed to be 0xffff, handle both */ +#define OPA_LINK_WIDTH_RESET_OLD 0x0fff +#define OPA_LINK_WIDTH_RESET 0xffff + +struct trap_node { + struct list_head list; + struct opa_mad_notice_attr data; + __be64 tid; + int len; + u32 retry; + u8 in_use; + u8 repress; +}; + +static int smp_length_check(u32 data_size, u32 request_len) +{ + if (unlikely(request_len < data_size)) + return -EINVAL; + + return 0; +} + +static int reply(struct ib_mad_hdr *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static inline void clear_opa_smp_data(struct opa_smp *smp) +{ + void *data = opa_get_smp_data(smp); + size_t size = opa_get_smp_data_size(smp); + + memset(data, 0, size); +} + +static u16 hfi1_lookup_pkey_value(struct hfi1_ibport *ibp, int pkey_idx) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (pkey_idx < ARRAY_SIZE(ppd->pkeys)) + return ppd->pkeys[pkey_idx]; + + return 0; +} + +void hfi1_event_pkey_change(struct hfi1_devdata *dd, u32 port) +{ + struct ib_event event; + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); +} + +/* + * If the port is down, clean up all pending traps. We need to be careful + * with the given trap, because it may be queued. + */ +static void cleanup_traps(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct trap_node *node, *q; + unsigned long flags; + struct list_head trap_list; + int i; + + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + list_replace_init(&ibp->rvp.trap_lists[i].list, &trap_list); + ibp->rvp.trap_lists[i].list_len = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + /* + * Remove all items from the list, freeing all the non-given + * traps. + */ + list_for_each_entry_safe(node, q, &trap_list, list) { + list_del(&node->list); + if (node != trap) + kfree(node); + } + } + + /* + * If this wasn't on one of the lists it would not be freed. If it + * was on the list, it is now safe to free. + */ + kfree(trap); +} + +static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, + struct trap_node *trap) +{ + struct trap_node *node; + struct trap_list *trap_list; + unsigned long flags; + unsigned long timeout; + int found = 0; + unsigned int queue_id; + static int trap_count; + + queue_id = trap->data.generic_type & 0x0F; + if (queue_id >= RVT_MAX_TRAP_LISTS) { + trap_count++; + pr_err_ratelimited("hfi1: Invalid trap 0x%0x dropped. Total dropped: %d\n", + trap->data.generic_type, trap_count); + kfree(trap); + return NULL; + } + + /* + * Since the retry (handle timeout) does not remove a trap request + * from the list, all we have to do is compare the node. + */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + trap_list = &ibp->rvp.trap_lists[queue_id]; + + list_for_each_entry(node, &trap_list->list, list) { + if (node == trap) { + node->retry++; + found = 1; + break; + } + } + + /* If it is not on the list, add it, limited to RVT-MAX_TRAP_LEN. */ + if (!found) { + if (trap_list->list_len < RVT_MAX_TRAP_LEN) { + trap_list->list_len++; + list_add_tail(&trap->list, &trap_list->list); + } else { + pr_warn_ratelimited("hfi1: Maximum trap limit reached for 0x%0x traps\n", + trap->data.generic_type); + kfree(trap); + } + } + + /* + * Next check to see if there is a timer pending. If not, set it up + * and get the first trap from the list. + */ + node = NULL; + if (!timer_pending(&ibp->rvp.trap_timer)) { + /* + * o14-2 + * If the time out is set we have to wait until it expires + * before the trap can be sent. + * This should be > RVT_TRAP_TIMEOUT + */ + timeout = (RVT_TRAP_TIMEOUT * + (1UL << ibp->rvp.subnet_timeout)) / 1000; + mod_timer(&ibp->rvp.trap_timer, + jiffies + usecs_to_jiffies(timeout)); + node = list_first_entry(&trap_list->list, struct trap_node, + list); + node->in_use = 1; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return node; +} + +static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, + struct opa_smp *smp) +{ + struct trap_list *trap_list; + struct trap_node *trap; + unsigned long flags; + int i; + + if (smp->attr_id != IB_SMP_ATTR_NOTICE) + return; + + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + trap_list = &ibp->rvp.trap_lists[i]; + trap = list_first_entry_or_null(&trap_list->list, + struct trap_node, list); + if (trap && trap->tid == smp->tid) { + if (trap->in_use) { + trap->repress = 1; + } else { + trap_list->list_len--; + list_del(&trap->list); + kfree(trap); + } + break; + } + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); +} + +static void hfi1_update_sm_ah_attr(struct hfi1_ibport *ibp, + struct rdma_ah_attr *attr, u32 dlid) +{ + rdma_ah_set_dlid(attr, dlid); + rdma_ah_set_port_num(attr, ppd_from_ibp(ibp)->port); + if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + grh->sgid_index = 0; + grh->hop_limit = 1; + grh->dgid.global.subnet_prefix = + ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = OPA_MAKE_ID(dlid); + } +} + +static int hfi1_modify_qp0_ah(struct hfi1_ibport *ibp, + struct rvt_ah *ah, u32 dlid) +{ + struct rdma_ah_attr attr; + struct rvt_qp *qp0; + int ret = -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.type = ah->ibah.type; + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ret = rdma_modify_ah(&ah->ibah, &attr); + rcu_read_unlock(); + return ret; +} + +static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid) +{ + struct rdma_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ppd(ppd); + u32 port_num = ppd->port; + + memset(&attr, 0, sizeof(attr)); + attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = rdma_create_ah(qp0->ibqp.pd, &attr, 0); + rcu_read_unlock(); + return ah; +} + +static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct opa_smp *smp; + unsigned long flags; + int pkey_idx; + u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; + + agent = ibp->rvp.send_agent; + if (!agent) { + cleanup_traps(ibp, trap); + return; + } + + /* o14-3.2.1 */ + if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) { + cleanup_traps(ibp, trap); + return; + } + + /* Add the trap to the list if necessary and see if we can send it */ + trap = check_and_add_trap(ibp, trap); + if (!trap) + return; + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n", + __func__, hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + + send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0, + IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = OPA_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = OPA_SM_CLASS_VERSION; + smp->method = IB_MGMT_METHOD_TRAP; + + /* Only update the transaction ID for new traps (o13-5). */ + if (trap->tid == 0) { + ibp->rvp.tid++; + /* make sure that tid != 0 */ + if (ibp->rvp.tid == 0) + ibp->rvp.tid++; + trap->tid = cpu_to_be64(ibp->rvp.tid); + } + smp->tid = trap->tid; + + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + + memcpy(smp->route.lid.data, &trap->data, trap->len); + + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (!ibp->rvp.sm_ah) { + if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); + if (IS_ERR(ah)) { + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; + } + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); + } else { + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; + } + } else { + send_buf->ah = &ibp->rvp.sm_ah->ibah; + } + + /* + * If the trap was repressed while things were getting set up, don't + * bother sending it. This could happen for a retry. + */ + if (trap->repress) { + list_del(&trap->list); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + kfree(trap); + ib_free_send_mad(send_buf); + return; + } + + trap->in_use = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (ib_post_send_mad(send_buf, NULL)) + ib_free_send_mad(send_buf); +} + +void hfi1_handle_trap_timer(struct timer_list *t) +{ + struct hfi1_ibport *ibp = from_timer(ibp, t, rvp.trap_timer); + struct trap_node *trap = NULL; + unsigned long flags; + int i; + + /* Find the trap with the highest priority */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; !trap && i < RVT_MAX_TRAP_LISTS; i++) { + trap = list_first_entry_or_null(&ibp->rvp.trap_lists[i].list, + struct trap_node, list); + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (trap) + send_trap(ibp, trap); +} + +static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid) +{ + struct trap_node *trap; + + trap = kzalloc(sizeof(*trap), GFP_ATOMIC); + if (!trap) + return NULL; + + INIT_LIST_HEAD(&trap->list); + trap->data.generic_type = type; + trap->data.prod_type_lsb = IB_NOTICE_PROD_CA; + trap->data.trap_num = trap_num; + trap->data.issuer_lid = cpu_to_be32(lid); + + return trap; +} + +/* + * Send a bad P_Key trap (ch. 14.3.8). + */ +void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, u32 lid1, u32 lid2) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + ibp->rvp.n_pkt_drops++; + ibp->rvp.pkey_violations++; + + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_P_KEY, + lid); + if (!trap) + return; + + /* Send violation trap */ + trap->data.ntc_257_258.lid1 = cpu_to_be32(lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(lid2); + trap->data.ntc_257_258.key = cpu_to_be32(key); + trap->data.ntc_257_258.sl = sl << 3; + trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); + trap->data.ntc_257_258.qp2 = cpu_to_be32(qp2); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). + */ +static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_M_KEY, + lid); + if (!trap) + return; + + /* Send violation trap */ + trap->data.ntc_256.lid = trap->data.issuer_lid; + trap->data.ntc_256.method = mad->method; + trap->data.ntc_256.attr_id = mad->attr_id; + trap->data.ntc_256.attr_mod = mad->attr_mod; + trap->data.ntc_256.mkey = mkey; + if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + trap->data.ntc_256.dr_slid = dr_slid; + trap->data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path)) { + trap->data.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path); + } + trap->data.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(trap->data.ntc_256.dr_rtn_path, return_path, + hop_cnt); + } + + trap->len = sizeof(trap->data); + + send_trap(ibp, trap); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u32 port_num) +{ + struct trap_node *trap; + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; + + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + trap->data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_INFO, OPA_TRAP_CHANGE_SYSGUID, + lid); + if (!trap) + return; + + trap->data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + trap->data.ntc_145.lid = trap->data.issuer_lid; + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). + */ +void hfi1_node_desc_chg(struct hfi1_ibport *ibp) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; + + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.change_flags = + cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u32 port, u32 *resp_len, u32 max_len) +{ + struct opa_node_description *nd; + + if (am || smp_length_check(sizeof(*nd), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + nd = (struct opa_node_description *)data; + + memcpy(nd->data, ibdev->node_desc, sizeof(nd->data)); + + if (resp_len) + *resp_len += sizeof(*nd); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct opa_node_info *ni; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 pidx = port - 1; /* IB number port from 1, hw from 0 */ + + ni = (struct opa_node_info *)data; + + /* GUID 0 is illegal */ + if (am || pidx >= dd->num_pports || ibdev->node_guid == 0 || + smp_length_check(sizeof(*ni), max_len) || + get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ni->port_guid = get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX); + ni->base_version = OPA_MGMT_BASE_VERSION; + ni->class_version = OPA_SM_CLASS_VERSION; + ni->node_type = 1; /* channel adapter */ + ni->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + ni->system_image_guid = ib_hfi1_sys_image_guid; + ni->node_guid = ibdev->node_guid; + ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + ni->device_id = cpu_to_be16(dd->pcidev->device); + ni->revision = cpu_to_be32(dd->minrev); + ni->local_port_num = port; + ni->vendor_id[0] = dd->oui1; + ni->vendor_id[1] = dd->oui2; + ni->vendor_id[2] = dd->oui3; + + if (resp_len) + *resp_len += sizeof(*ni); + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u32 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 pidx = port - 1; /* IB number port from 1, hw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + ibdev->node_guid == 0 || + get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + nip->port_guid = get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX); + nip->base_version = OPA_MGMT_BASE_VERSION; + nip->class_version = OPA_SM_CLASS_VERSION; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_hfi1_sys_image_guid; + nip->node_guid = ibdev->node_guid; + nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->pcidev->device); + nip->revision = cpu_to_be32(dd->minrev); + nip->local_port_num = port; + nip->vendor_id[0] = dd->oui1; + nip->vendor_id[1] = dd->oui2; + nip->vendor_id[2] = dd->oui3; + + return reply((struct ib_mad_hdr *)smp); +} + +static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w); +} + +static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w); +} + +static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s); +} + +static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + int mad_flags, __be64 mkey, __be32 dr_slid, + u8 return_path[], u8 hop_cnt) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->rvp.mkey_lease_timeout && + time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->rvp.mkey_lease_timeout = 0; + ibp->rvp.mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 || + ibp->rvp.mkey == mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->rvp.mkey_lease_timeout && + (mad->method == IB_MGMT_METHOD_GET || + mad->method == IB_MGMT_METHOD_SET || + mad->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->rvp.mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (mad->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->rvp.mkeyprot < 2) + break; + fallthrough; + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->rvp.mkey_violations != 0xFFFF) + ++ibp->rvp.mkey_violations; + if (!ibp->rvp.mkey_lease_timeout && + ibp->rvp.mkey_lease_period) + ibp->rvp.mkey_lease_timeout = jiffies + + ibp->rvp.mkey_lease_period * HZ; + /* Generate a trap notice. */ + bad_mkey(ibp, mad, mkey, dr_slid, return_path, + hop_cnt); + ret = 1; + } + } + + return ret; +} + +/* + * The SMA caches reads from LCB registers in case the LCB is unavailable. + * (The LCB is unavailable in certain link states, for example.) + */ +struct lcb_datum { + u32 off; + u64 val; +}; + +static struct lcb_datum lcb_cache[] = { + { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 }, +}; + +static int write_lcb_cache(u32 off, u64 val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + lcb_cache[i].val = val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +static int read_lcb_cache(u32 off, u64 *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + *val = lcb_cache[i].val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +void read_ltp_rtt(struct hfi1_devdata *dd) +{ + u64 reg; + + if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, ®)) + dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__); + else + write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg); +} + +static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + int i; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct opa_port_info *pi = (struct opa_port_info *)data; + u8 mtu; + u8 credit_rate; + u8 is_beaconing_active; + u32 state; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 buffer_units; + u64 tmp = 0; + + if (num_ports != 1 || smp_length_check(sizeof(*pi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + pi->lid = cpu_to_be32(ppd->lid); + + /* Only return the mkey if the protection field allows it. */ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->rvp.mkey != smp->mkey && + ibp->rvp.mkeyprot == 1)) + pi->mkey = ibp->rvp.mkey; + + pi->subnet_prefix = ibp->rvp.gid_prefix; + pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid); + pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); + pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp); + pi->sa_qp = cpu_to_be32(ppd->sa_qp); + + pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled); + pi->link_width.supported = cpu_to_be16(ppd->link_width_supported); + pi->link_width.active = cpu_to_be16(ppd->link_width_active); + + pi->link_width_downgrade.supported = + cpu_to_be16(ppd->link_width_downgrade_supported); + pi->link_width_downgrade.enabled = + cpu_to_be16(ppd->link_width_downgrade_enabled); + pi->link_width_downgrade.tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + pi->link_width_downgrade.rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + + pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported); + pi->link_speed.active = cpu_to_be16(ppd->link_speed_active); + pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled); + + state = driver_lstate(ppd); + + if (start_of_sm_config && (state == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + + pi->port_phys_conf = (ppd->port_type & 0xf); + + pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + pi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6; + pi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason; + + pi->port_states.portphysstate_portstate = + (driver_pstate(ppd) << 4) | state; + + pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; + + memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu)); + for (i = 0; i < ppd->vls_supported; i++) { + mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU); + if ((i % 2) == 0) + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4); + else + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu; + } + /* don't forget VL 15 */ + mtu = mtu_to_enum(dd->vld[15].mtu, 2048); + pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu; + pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL; + pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS); + pi->partenforce_filterraw |= + (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON); + if (ppd->part_enforce & HFI1_PART_ENFORCE_IN) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN; + if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT; + pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations); + pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations); + + pi->vl.cap = ppd->vls_supported; + pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit); + pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP); + pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP); + + pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout; + + pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 | + OPA_PORT_LINK_MODE_OPA << 5 | + OPA_PORT_LINK_MODE_OPA); + + pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode); + + pi->port_mode = cpu_to_be16( + ppd->is_active_optimize_enabled ? + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0); + + pi->port_packet_format.supported = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B | + OPA_PORT_PACKET_FORMAT_16B); + pi->port_packet_format.enabled = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B | + OPA_PORT_PACKET_FORMAT_16B); + + /* flit_control.interleave is (OPA V1, version .76): + * bits use + * ---- --- + * 2 res + * 2 DistanceSupported + * 2 DistanceEnabled + * 5 MaxNextLevelTxEnabled + * 5 MaxNestLevelRxSupported + * + * HFI supports only "distance mode 1" (see OPA V1, version .76, + * section 9.6.2), so set DistanceSupported, DistanceEnabled + * to 0x1. + */ + pi->flit_control.interleave = cpu_to_be16(0x1400); + + pi->link_down_reason = ppd->local_link_down_reason.sma; + pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma; + pi->port_error_action = cpu_to_be32(ppd->port_error_action); + pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096); + + /* 32.768 usec. response time (guessing) */ + pi->resptimevalue = 3; + + pi->local_port_num = port; + + /* buffer info for FM */ + pi->overall_buffer_space = cpu_to_be16(dd->link_credits); + + pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid); + pi->neigh_port_num = ppd->neighbor_port_number; + pi->port_neigh_mode = + (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) | + (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) | + (ppd->neighbor_fm_security ? + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0); + + /* HFIs shall always return VL15 credits to their + * neighbor in a timely manner, without any credit return pacing. + */ + credit_rate = 0; + buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC; + buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK; + buffer_units |= (credit_rate << 6) & + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE; + buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; + pi->buffer_units = cpu_to_be32(buffer_units); + + pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); + pi->collectivemask_multicastmask = ((OPA_COLLECTIVE_NR & 0x7) + << 3 | (OPA_MCAST_NR & 0x7)); + + /* HFI supports a replay buffer 128 LTPs in size */ + pi->replay_depth.buffer = 0x80; + /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp); + + /* + * this counter is 16 bits wide, but the replay_depth.wire + * variable is only 8 bits + */ + if (tmp > 0xff) + tmp = 0xff; + pi->replay_depth.wire = tmp; + + if (resp_len) + *resp_len += sizeof(struct opa_port_info); + + return reply((struct ib_mad_hdr *)smp); +} + +/** + * get_pkeys - return the PKEY table + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct hfi1_devdata *dd, u32 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd = dd->pport + port - 1; + + memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys)); + + return 0; +} + +static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_req = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + __be16 *p; + u16 *q; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + size_t size; + + if (n_blocks_req == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_req); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16); + + if (smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (start_block + n_blocks_req > n_blocks_avail || + n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; " + "avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_req, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + p = (__be16 *)data; + q = (u16 *)data; + /* get the real pkeys if we are requesting the first block */ + if (start_block == 0) { + get_pkeys(dd, port, q); + for (i = 0; i < npkeys; i++) + p[i] = cpu_to_be16(q[i]); + if (resp_len) + *resp_len += size; + } else { + smp->status |= IB_SMP_INVALID_FIELD; + } + return reply((struct ib_mad_hdr *)smp); +} + +enum { + HFI_TRANSITION_DISALLOWED, + HFI_TRANSITION_IGNORED, + HFI_TRANSITION_ALLOWED, + HFI_TRANSITION_UNDEFINED, +}; + +/* + * Use shortened names to improve readability of + * {logical,physical}_state_transitions + */ +enum { + __D = HFI_TRANSITION_DISALLOWED, + __I = HFI_TRANSITION_IGNORED, + __A = HFI_TRANSITION_ALLOWED, + __U = HFI_TRANSITION_UNDEFINED, +}; + +/* + * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are + * represented in physical_state_transitions. + */ +#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1) + +/* + * Within physical_state_transitions, rows represent "old" states, + * columns "new" states, and physical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 6-4). + */ +static const struct { + u8 allowed[__N_PHYSTATES][__N_PHYSTATES]; +} physical_state_transitions = { + { + /* 2 3 4 5 6 7 8 9 10 11 */ + /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D }, + /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A }, + /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D }, + /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D }, + /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D }, + /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I }, + } +}; + +/* + * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented + * logical_state_transitions + */ + +#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1) + +/* + * Within logical_state_transitions rows represent "old" states, + * columns "new" states, and logical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 9-12). + */ +static const struct { + u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES]; +} logical_state_transitions = { + { + /* 1 2 3 4 5 */ + /* 1 */ { __I, __D, __D, __D, __U}, + /* 2 */ { __D, __I, __A, __D, __U}, + /* 3 */ { __D, __D, __I, __A, __U}, + /* 4 */ { __D, __D, __I, __I, __U}, + /* 5 */ { __U, __U, __U, __U, __U}, + } +}; + +static int logical_transition_allowed(int old, int new) +{ + if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER || + new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) { + pr_warn("invalid logical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORT_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into logical_state_transitions */ + old -= IB_PORT_DOWN; + new -= IB_PORT_DOWN; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return logical_state_transitions.allowed[old][new]; +} + +static int physical_transition_allowed(int old, int new) +{ + if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX || + new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) { + pr_warn("invalid physical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORTPHYSSTATE_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into physical_state_transitions */ + old -= IB_PORTPHYSSTATE_POLLING; + new -= IB_PORTPHYSSTATE_POLLING; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return physical_state_transitions.allowed[old][new]; +} + +static int port_states_transition_allowed(struct hfi1_pportdata *ppd, + u32 logical_new, u32 physical_new) +{ + u32 physical_old = driver_pstate(ppd); + u32 logical_old = driver_lstate(ppd); + int ret, logical_allowed, physical_allowed; + + ret = logical_transition_allowed(logical_old, logical_new); + logical_allowed = ret; + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid logical state transition %s -> %s\n", + opa_lstate_name(logical_old), + opa_lstate_name(logical_new)); + return ret; + } + + ret = physical_transition_allowed(physical_old, physical_new); + physical_allowed = ret; + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid physical state transition %s -> %s\n", + opa_pstate_name(physical_old), + opa_pstate_name(physical_new)); + return ret; + } + + if (logical_allowed == HFI_TRANSITION_IGNORED && + physical_allowed == HFI_TRANSITION_IGNORED) + return HFI_TRANSITION_IGNORED; + + /* + * A change request of Physical Port State from + * 'Offline' to 'Polling' should be ignored. + */ + if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) && + (physical_new == IB_PORTPHYSSTATE_POLLING)) + return HFI_TRANSITION_IGNORED; + + /* + * Either physical_allowed or logical_allowed is + * HFI_TRANSITION_ALLOWED. + */ + return HFI_TRANSITION_ALLOWED; +} + +static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, + u32 logical_state, u32 phys_state, int local_mad) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 link_state; + int ret; + + ret = port_states_transition_allowed(ppd, logical_state, phys_state); + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + /* error message emitted above */ + smp->status |= IB_SMP_INVALID_FIELD; + return 0; + } + + if (ret == HFI_TRANSITION_IGNORED) + return 0; + + if ((phys_state != IB_PORTPHYSSTATE_NOP) && + !(logical_state == IB_PORT_DOWN || + logical_state == IB_PORT_NOP)){ + pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n", + logical_state, phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + /* + * Logical state changes are summarized in OPAv1g1 spec., + * Table 9-12; physical state changes are summarized in + * OPAv1g1 spec., Table 6.4. + */ + switch (logical_state) { + case IB_PORT_NOP: + if (phys_state == IB_PORTPHYSSTATE_NOP) + break; + fallthrough; + case IB_PORT_DOWN: + if (phys_state == IB_PORTPHYSSTATE_NOP) { + link_state = HLS_DN_DOWNDEF; + } else if (phys_state == IB_PORTPHYSSTATE_POLLING) { + link_state = HLS_DN_POLL; + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE, + 0, OPA_LINKDOWN_REASON_FM_BOUNCE); + } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) { + link_state = HLS_DN_DISABLE; + } else { + pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n", + phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + if ((link_state == HLS_DN_POLL || + link_state == HLS_DN_DOWNDEF)) { + /* + * Going to poll. No matter what the current state, + * always move offline first, then tune and start the + * link. This correctly handles a FM link bounce and + * a link enable. Going offline is a no-op if already + * offline. + */ + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } else { + set_link_state(ppd, link_state); + } + if (link_state == HLS_DN_DISABLE && + (ppd->offline_disabled_reason > + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || + ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED); + /* + * Don't send a reply if the response would be sent + * through the disabled port. + */ + if (link_state == HLS_DN_DISABLE && !local_mad) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + break; + case IB_PORT_ARMED: + ret = set_link_state(ppd, HLS_UP_ARMED); + if (!ret) + send_idle_sma(dd, SMA_IDLE_ARM); + break; + case IB_PORT_ACTIVE: + if (ppd->neighbor_normal) { + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret == 0) + send_idle_sma(dd, SMA_IDLE_ACTIVE); + } else { + pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n"); + smp->status |= IB_SMP_INVALID_FIELD; + } + break; + default: + pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n", + logical_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + return 0; +} + +/* + * subn_set_opa_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + */ +static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len, int local_mad) +{ + struct opa_port_info *pi = (struct opa_port_info *)data; + struct ib_event event; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u8 clientrereg; + unsigned long flags; + u32 smlid; + u32 lid; + u8 ls_old, ls_new, ps_new; + u8 vls; + u8 msl; + u8 crc_enabled; + u16 lse, lwe, mtu; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + int ret, i, invalid = 0, call_set_mtu = 0; + int call_link_downgrade_policy = 0; + + if (num_ports != 1 || + smp_length_check(sizeof(*pi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + lid = be32_to_cpu(pi->lid); + if (lid & 0xFF000000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", lid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + + + smlid = be32_to_cpu(pi->sm_lid); + if (smlid & 0xFF000000) { + pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + + clientrereg = (pi->clientrereg_subnettimeout & + OPA_PI_MASK_CLIENT_REREGISTER); + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ls_old = driver_lstate(ppd); + + ibp->rvp.mkey = pi->mkey; + if (ibp->rvp.gid_prefix != pi->subnet_prefix) { + ibp->rvp.gid_prefix = pi->subnet_prefix; + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } + ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); + + /* Must be a valid unicast LID address. */ + if ((lid == 0 && ls_old > IB_PORT_INIT) || + (hfi1_is_16B_mcast(lid))) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", + lid); + } else if (ppd->lid != lid || + ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) { + if (ppd->lid != lid) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT); + hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + + if (HFI1_PORT_GUID_INDEX + 1 < HFI1_GUIDS_PER_PORT) { + /* Manufacture GID from LID to support extended + * addresses + */ + ppd->guids[HFI1_PORT_GUID_INDEX + 1] = + be64_to_cpu(OPA_MAKE_ID(lid)); + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } + } + + msl = pi->smsl & OPA_PI_MASK_SMSL; + if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON) + ppd->linkinit_reason = + (pi->partenforce_filterraw & + OPA_PI_MASK_LINKINIT_REASON); + + /* Must be a valid unicast LID address. */ + if ((smlid == 0 && ls_old > IB_PORT_INIT) || + (hfi1_is_16B_mcast(smlid))) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); + } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { + pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid); + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (ibp->rvp.sm_ah) { + if (smlid != ibp->rvp.sm_lid) + hfi1_modify_qp0_ah(ibp, ibp->rvp.sm_ah, smlid); + if (msl != ibp->rvp.sm_sl) + rdma_ah_set_sl(&ibp->rvp.sm_ah->attr, msl); + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_lid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + if (pi->link_down_reason == 0) { + ppd->local_link_down_reason.sma = 0; + ppd->local_link_down_reason.latest = 0; + } + + if (pi->neigh_link_down_reason == 0) { + ppd->neigh_link_down_reason.sma = 0; + ppd->neigh_link_down_reason.latest = 0; + } + + ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp); + ppd->sa_qp = be32_to_cpu(pi->sa_qp); + + ppd->port_error_action = be32_to_cpu(pi->port_error_action); + lwe = be16_to_cpu(pi->link_width.enabled); + if (lwe) { + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if ((lwe & ~ppd->link_width_supported) == 0) + set_link_width_enabled(ppd, lwe); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + lwe = be16_to_cpu(pi->link_width_downgrade.enabled); + /* LWD.E is always applied - 0 means "disabled" */ + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) { + set_link_width_downgrade_enabled(ppd, + ppd-> + link_width_downgrade_supported + ); + } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) { + /* only set and apply if something changed */ + if (lwe != ppd->link_width_downgrade_enabled) { + set_link_width_downgrade_enabled(ppd, lwe); + call_link_downgrade_policy = 1; + } + } else { + smp->status |= IB_SMP_INVALID_FIELD; + } + lse = be16_to_cpu(pi->link_speed.enabled); + if (lse) { + if (lse & be16_to_cpu(pi->link_speed.supported)) + set_link_speed_enabled(ppd, lse); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->rvp.mkeyprot = + (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6; + ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT, + ibp->rvp.vl_high_limit); + + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + for (i = 0; i < ppd->vls_supported; i++) { + if ((i % 2) == 0) + mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >> + 4) & 0xF); + else + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] & + 0xF); + if (mtu == 0xffff) { + pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n", + mtu, + (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF); + smp->status |= IB_SMP_INVALID_FIELD; + mtu = hfi1_max_mtu; /* use a valid MTU */ + } + if (dd->vld[i].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl %d from %d to %d\n", + i, dd->vld[i].mtu, mtu); + dd->vld[i].mtu = mtu; + call_set_mtu++; + } + } + /* As per OPAV1 spec: VL15 must support and be configured + * for operation with a 2048 or larger MTU. + */ + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF); + if (mtu < 2048 || mtu == 0xffff) + mtu = 2048; + if (dd->vld[15].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl 15 from %d to %d\n", + dd->vld[15].mtu, mtu); + dd->vld[15].mtu = mtu; + call_set_mtu++; + } + if (call_set_mtu) + set_mtu(ppd); + + /* Set operational VLs */ + vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL; + if (vls) { + if (vls > ppd->vls_supported) { + pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n", + pi->operational_vls); + smp->status |= IB_SMP_INVALID_FIELD; + } else { + if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS, + vls) == -EINVAL) + smp->status |= IB_SMP_INVALID_FIELD; + } + } + + if (pi->mkey_violations == 0) + ibp->rvp.mkey_violations = 0; + + if (pi->pkey_violations == 0) + ibp->rvp.pkey_violations = 0; + + if (pi->qkey_violations == 0) + ibp->rvp.qkey_violations = 0; + + ibp->rvp.subnet_timeout = + pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT; + + crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode); + crc_enabled >>= 4; + crc_enabled &= 0xf; + + if (crc_enabled != 0) + ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled); + + ppd->is_active_optimize_enabled = + !!(be16_to_cpu(pi->port_mode) + & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE); + + ls_new = pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_STATE; + ps_new = (pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4; + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) { + invalid = 1; + smp->status |= IB_SMP_INVALID_FIELD; + } + } + } + + /* Handle CLIENT_REREGISTER event b/c SM asked us for it */ + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + + if (!invalid) { + ret = set_port_states(ppd, smp, ls_new, ps_new, local_mad); + if (ret) + return ret; + } + + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len, + max_len); + + /* restore re-reg bit per o14-12.2.1 */ + pi->clientrereg_subnettimeout |= clientrereg; + + /* + * Apply the new link downgrade policy. This may result in a link + * bounce. Do this after everything else so things are settled. + * Possible problem: if setting the port state above fails, then + * the policy change is not applied. + */ + if (call_link_downgrade_policy) + apply_link_downgrade_policy(ppd, 0); + + return ret; + +get_only: + return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len, + max_len); +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct hfi1_devdata *dd, u32 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd; + int i; + int changed = 0; + int update_includes_mgmt_partition = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + /* + * If the update does not include the management pkey, don't do it. + */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (pkeys[i] == LIM_MGMT_P_KEY) { + update_includes_mgmt_partition = 1; + break; + } + } + + if (!update_includes_mgmt_partition) + return 1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = ppd->pkeys[i]; + + if (key == okey) + continue; + /* + * The SM gives us the complete PKey table. We have + * to ensure that we put the PKeys in the matching + * slots. + */ + ppd->pkeys[i] = key; + changed = 1; + } + + if (changed) { + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(dd, port); + } + + return 0; +} + +static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_sent = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + u16 *p = (u16 *)data; + __be16 *q = (__be16 *)data; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + u32 size = 0; + + if (n_blocks_sent == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %u; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_sent); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + size = sizeof(u16) * (n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE); + + if (smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (start_block + n_blocks_sent > n_blocks_avail || + n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_sent, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++) + p[i] = be16_to_cpu(q[i]); + + if (start_block == 0 && set_pkeys(dd, port, p) != 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len, + max_len); +} + +#define ILLEGAL_VL 12 +/* + * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except + * for SC15, which must map to VL15). If we don't remap things this + * way it is possible for VL15 counters to increment when we try to + * send on a SC which is mapped to an invalid VL. + * When getting the table convert ILLEGAL_VL back to VL15. + */ +static void filter_sc2vlt(void *data, bool set) +{ + int i; + u8 *pd = data; + + for (i = 0; i < OPA_MAX_SCS; i++) { + if (i == 15) + continue; + + if (set) { + if ((pd[i] & 0x1f) == 0xf) + pd[i] = ILLEGAL_VL; + } else { + if ((pd[i] & 0x1f) == ILLEGAL_VL) + pd[i] = 0xf; + } + } +} + +static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = data; + + filter_sc2vlt(data, true); + + write_csr(dd, SEND_SC2VLT0, *val++); + write_csr(dd, SEND_SC2VLT1, *val++); + write_csr(dd, SEND_SC2VLT2, *val++); + write_csr(dd, SEND_SC2VLT3, *val++); + write_seqlock_irq(&dd->sc2vl_lock); + memcpy(dd->sc2vl, data, sizeof(dd->sc2vl)); + write_sequnlock_irq(&dd->sc2vl_lock); + return 0; +} + +static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = (u64 *)data; + + *val++ = read_csr(dd, SEND_SC2VLT0); + *val++ = read_csr(dd, SEND_SC2VLT1); + *val++ = read_csr(dd, SEND_SC2VLT2); + *val++ = read_csr(dd, SEND_SC2VLT3); + + filter_sc2vlt((u64 *)data, false); + return 0; +} + +static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */ + unsigned i; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) + *p++ = ibp->sl_to_sc[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); + int i; + u8 sc; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) { + sc = *p++; + if (ibp->sl_to_sc[i] != sc) { + ibp->sl_to_sc[i] = sc; + + /* Put all stale qps into error state */ + hfi1_error_port_qps(ibp, i); + } + } + + return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */ + unsigned i; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + *p++ = ibp->sc_to_sl[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + size_t size = ARRAY_SIZE(ibp->sc_to_sl); + u8 *p = data; + int i; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + ibp->sc_to_sl[i] = *p++; + + return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *)data; + size_t size = 4 * sizeof(u64); + + if (n_blocks != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + get_sc2vlt_tables(dd, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + int async_update = OPA_AM_ASYNC(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *)data; + struct hfi1_pportdata *ppd; + int lstate; + /* + * set_sc2vlt_tables writes the information contained in *data + * to four 64-bit registers SendSC2VLt[0-3]. We need to make + * sure *max_len is not greater than the total size of the four + * SendSC2VLt[0-3] registers. + */ + size_t size = 4 * sizeof(u64); + + if (n_blocks != 1 || async_update || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + /* + * it's known that async_update is 0 by this point, but include + * the explicit check for clarity + */ + if (!async_update && + (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + set_sc2vlt_tables(dd, vp); + + return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *)data; + int size = sizeof(struct sc2vlnt); + + if (n_blocks != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + fm_get_table(ppd, FM_TBL_SC2VLNT, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *)data; + int lstate; + int size = sizeof(struct sc2vlnt); + + if (n_blocks != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + fm_set_table(ppd, FM_TBL_SC2VLNT, vp); + + return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len, max_len); +} + +static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 lstate; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; + + if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + lstate = driver_lstate(ppd); + + if (start_of_sm_config && (lstate == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + + psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + psi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + psi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason; + + psi->port_states.portphysstate_portstate = + (driver_pstate(ppd) << 4) | (lstate & 0xf); + psi->link_width_downgrade_tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + psi->link_width_downgrade_rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + if (resp_len) + *resp_len += sizeof(struct opa_port_state_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len, int local_mad) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 ls_old; + u8 ls_new, ps_new; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; + int ret, invalid = 0; + + if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + ls_old = driver_lstate(ppd); + + ls_new = port_states_to_logical_state(&psi->port_states); + ps_new = port_states_to_phys_state(&psi->port_states); + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) { + invalid = 1; + smp->status |= IB_SMP_INVALID_FIELD; + } + } + } + + if (!invalid) { + ret = set_port_states(ppd, smp, ls_new, ps_new, local_mad); + if (ret) + return ret; + } + + return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 addr = OPA_AM_CI_ADDR(am); + u32 len = OPA_AM_CI_LEN(am) + 1; + int ret; + + if (dd->pport->port_type != PORT_TYPE_QSFP || + smp_length_check(len, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + +#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */ +#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1) +#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK) + + /* + * check that addr is within spec, and + * addr and (addr + len - 1) are on the same "page" + */ + if (addr >= 4096 || + (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ret = get_cable_info(dd, port, addr, len, data); + + if (ret == -ENODEV) { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + return reply((struct ib_mad_hdr *)smp); + } + + /* The address range for the CableInfo SMA query is wider than the + * memory available on the QSFP cable. We want to return a valid + * response, albeit zeroed out, for address ranges beyond available + * memory but that are within the CableInfo query spec + */ + if (ret < 0 && ret != -ERANGE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (resp_len) + *resp_len += len; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, u32 *resp_len, + u32 max_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *)data; + int size = sizeof(struct buffer_control); + + if (num_ports != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p); + trace_bct_get(dd, p); + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, u32 *resp_len, + u32 max_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *)data; + + if (num_ports != 1 || smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + ppd = dd->pport + (port - 1); + trace_bct_set(dd, p); + if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + int size = 256; + + if (num_ports != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + case OPA_VLARB_PREEMPT_ELEMENTS: + fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p); + break; + case OPA_VLARB_PREEMPT_MATRIX: + fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p); + break; + default: + pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + size = 0; + break; + } + + if (size > 0 && resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + int size = 256; + + if (num_ports != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + /* + * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX + * can be changed from the default values + */ + case OPA_VLARB_PREEMPT_ELEMENTS: + case OPA_VLARB_PREEMPT_MATRIX: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + break; + default: + pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len, + max_len); +} + +struct opa_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 data[2024]; +} __packed; + +struct opa_port_status_req { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; +}; + +#define VL_MASK_ALL 0x00000000000080ffUL + +struct opa_port_status_rsp { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + /* Error counters */ + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + + u8 link_quality_indicator; /* 5res, 3bit */ + u8 res2[6]; + struct _vls_pctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + __be64 port_vl_xmit_discards; + } vls[]; /* real array size defined by # bits set in vl_select_mask */ +}; + +enum counter_selects { + CS_PORT_XMIT_DATA = (1 << 31), + CS_PORT_RCV_DATA = (1 << 30), + CS_PORT_XMIT_PKTS = (1 << 29), + CS_PORT_RCV_PKTS = (1 << 28), + CS_PORT_MCAST_XMIT_PKTS = (1 << 27), + CS_PORT_MCAST_RCV_PKTS = (1 << 26), + CS_PORT_XMIT_WAIT = (1 << 25), + CS_SW_PORT_CONGESTION = (1 << 24), + CS_PORT_RCV_FECN = (1 << 23), + CS_PORT_RCV_BECN = (1 << 22), + CS_PORT_XMIT_TIME_CONG = (1 << 21), + CS_PORT_XMIT_WASTED_BW = (1 << 20), + CS_PORT_XMIT_WAIT_DATA = (1 << 19), + CS_PORT_RCV_BUBBLE = (1 << 18), + CS_PORT_MARK_FECN = (1 << 17), + CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16), + CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15), + CS_PORT_XMIT_DISCARDS = (1 << 14), + CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13), + CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12), + CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11), + CS_PORT_RCV_ERRORS = (1 << 10), + CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9), + CS_FM_CONFIG_ERRORS = (1 << 8), + CS_LINK_ERROR_RECOVERY = (1 << 7), + CS_LINK_DOWNED = (1 << 6), + CS_UNCORRECTABLE_ERRORS = (1 << 5), +}; + +struct opa_clear_port_status { + __be64 port_select_mask[4]; + __be32 counter_select_mask; +}; + +struct opa_aggregate { + __be16 attr_id; + __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */ + __be32 attr_mod; + u8 data[]; +}; + +#define MSK_LLI 0x000000f0 +#define MSK_LLI_SFT 4 +#define MSK_LER 0x0000000f +#define MSK_LER_SFT 0 +#define ADD_LLI 8 +#define ADD_LER 2 + +/* Request contains first three fields, response contains those plus the rest */ +struct opa_port_data_counters_msg { + __be64 port_select_mask[4]; + __be32 vl_select_mask; + __be32 resolution; + + /* Response fields follow */ + struct _port_dctrs { + u8 port_number; + u8 reserved2[3]; + __be32 link_quality_indicator; /* 29res, 3bit */ + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + + __be64 port_error_counter_summary; + /* Sum of error counts/port */ + + struct _vls_dctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + } vls[]; + /* array size defined by #bits set in vl_select_mask*/ + } port; +}; + +struct opa_port_error_counters64_msg { + /* + * Request contains first two fields, response contains the + * whole magilla + */ + __be64 port_select_mask[4]; + __be32 vl_select_mask; + + /* Response-only fields follow */ + __be32 reserved1; + struct _port_ectrs { + u8 port_number; + u8 reserved2[7]; + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + u8 reserved3[7]; + struct _vls_ectrs { + __be64 port_vl_xmit_discards; + } vls[]; + /* array size defined by #bits set in vl_select_mask */ + } port; +}; + +struct opa_port_error_info_msg { + __be64 port_select_mask[4]; + __be32 error_info_select_mask; + __be32 reserved1; + struct _port_ei { + u8 port_number; + u8 reserved2[7]; + + /* PortRcvErrorInfo */ + struct { + u8 status_and_code; + union { + u8 raw[17]; + struct { + /* EI1to12 format */ + u8 packet_flit1[8]; + u8 packet_flit2[8]; + u8 remaining_flit_bits12; + } ei1to12; + struct { + u8 packet_bytes[8]; + u8 remaining_flit_bits; + } ei13; + } ei; + u8 reserved3[6]; + } __packed port_rcv_ei; + + /* ExcessiveBufferOverrunInfo */ + struct { + u8 status_and_sc; + u8 reserved4[7]; + } __packed excessive_buffer_overrun_ei; + + /* PortXmitConstraintErrorInfo */ + struct { + u8 status; + u8 reserved5; + __be16 pkey; + __be32 slid; + } __packed port_xmit_constraint_ei; + + /* PortRcvConstraintErrorInfo */ + struct { + u8 status; + u8 reserved6; + __be16 pkey; + __be32 slid; + } __packed port_rcv_constraint_ei; + + /* PortRcvSwitchRelayErrorInfo */ + struct { + u8 status_and_code; + u8 reserved7[3]; + __u32 error_info; + } __packed port_rcv_switch_relay_ei; + + /* UncorrectableErrorInfo */ + struct { + u8 status_and_code; + u8 reserved8; + } __packed uncorrectable_ei; + + /* FMConfigErrorInfo */ + struct { + u8 status_and_code; + u8 error_info; + } __packed fm_config_ei; + __u32 reserved9; + } port; +}; + +/* opa_port_error_info_msg error_info_select_mask bit definitions */ +enum error_info_selects { + ES_PORT_RCV_ERROR_INFO = (1 << 31), + ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30), + ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29), + ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28), + ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27), + ES_UNCORRECTABLE_ERROR_INFO = (1 << 26), + ES_FM_CONFIG_ERROR_INFO = (1 << 25) +}; + +static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u32 *resp_len) +{ + struct opa_class_port_info *p = + (struct opa_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->base_version = OPA_MGMT_BASE_VERSION; + p->class_version = OPA_SM_CLASS_VERSION; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->cap_mask2_resp_time = cpu_to_be32(18); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)pmp); +} + +static void a0_portstatus(struct hfi1_pportdata *ppd, + struct opa_port_status_rsp *rsp) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + u64 sum_vl_xmit_wait = 0; + unsigned long vl_all_mask = VL_MASK_ALL; + + for_each_set_bit(vl, &vl_all_mask, BITS_PER_LONG) { + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +/** + * tx_link_width - convert link width bitmask to integer + * value representing actual link width. + * @link_width: width of active link + * @return: return index of the bit set in link_width var + * + * The function convert and return the index of bit set + * that indicate the current link width. + */ +u16 tx_link_width(u16 link_width) +{ + int n = LINK_WIDTH_DEFAULT; + u16 tx_width = n; + + while (link_width && n) { + if (link_width & (1 << (n - 1))) { + tx_width = n; + break; + } + n--; + } + + return tx_width; +} + +/** + * get_xmit_wait_counters - Convert HFI 's SendWaitCnt/SendWaitVlCnt + * counter in unit of TXE cycle times to flit times. + * @ppd: info of physical Hfi port + * @link_width: width of active link + * @link_speed: speed of active link + * @vl: represent VL0-VL7, VL15 for PortVLXmitWait counters request + * and if vl value is C_VL_COUNT, it represent SendWaitCnt + * counter request + * @return: return SendWaitCnt/SendWaitVlCnt counter value per vl. + * + * Convert SendWaitCnt/SendWaitVlCnt counter from TXE cycle times to + * flit times. Call this function to samples these counters. This + * function will calculate for previous state transition and update + * current state at end of function using ppd->prev_link_width and + * ppd->port_vl_xmit_wait_last to port_vl_xmit_wait_curr and link_width. + */ +u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, + u16 link_width, u16 link_speed, int vl) +{ + u64 port_vl_xmit_wait_curr; + u64 delta_vl_xmit_wait; + u64 xmit_wait_val; + + if (vl > C_VL_COUNT) + return 0; + if (vl < C_VL_COUNT) + port_vl_xmit_wait_curr = + read_port_cntr(ppd, C_TX_WAIT_VL, vl); + else + port_vl_xmit_wait_curr = + read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL); + + xmit_wait_val = + port_vl_xmit_wait_curr - + ppd->port_vl_xmit_wait_last[vl]; + delta_vl_xmit_wait = + convert_xmit_counter(xmit_wait_val, + ppd->prev_link_width, + link_speed); + + ppd->vl_xmit_flit_cnt[vl] += delta_vl_xmit_wait; + ppd->port_vl_xmit_wait_last[vl] = port_vl_xmit_wait_curr; + ppd->prev_link_width = link_width; + + return ppd->vl_xmit_flit_cnt[vl]; +} + +static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u32 port, u32 *resp_len) +{ + struct opa_port_status_req *req = + (struct opa_port_status_req *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_port_status_rsp *rsp; + unsigned long vl_select_mask = be32_to_cpu(req->vl_select_mask); + unsigned long vl; + size_t response_data_size; + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u32 port_num = req->port_num; + u8 num_vls = hweight64(vl_select_mask); + struct _vls_pctrs *vlinfo; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + int vfi; + u64 tmp, tmp2; + u16 link_width; + u16 link_speed; + + response_data_size = struct_size(rsp, vls, num_vls); + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE; + return reply((struct ib_mad_hdr *)pmp); + } + + if (nports != 1 || (port_num && port_num != port) || + num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + memset(pmp->data, 0, sizeof(pmp->data)); + + rsp = (struct opa_port_status_rsp *)pmp->data; + if (port_num) + rsp->port_num = port_num; + else + rsp->port_num = port; + + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + + hfi1_read_link_quality(dd, &rsp->link_quality_indicator); + + rsp->vl_select_mask = cpu_to_be32((u32)vl_select_mask); + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); + /* + * Convert PortXmitWait counter from TXE cycle times + * to flit times. + */ + link_width = + tx_link_width(ppd->link_width_downgrade_tx_active); + link_speed = get_link_speed(ppd->link_speed_active); + rsp->port_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, C_VL_COUNT)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + + /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */ + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + + vlinfo = &rsp->vls[0]; + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl)); + rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + /* + * Convert PortVlXmitWait counter from TXE cycle + * times to flit times. + */ + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); + vlinfo++; + vfi++; + } + + a0_portstatus(ppd, rsp); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static u64 get_error_counter_summary(struct ib_device *ibdev, u32 port, + u8 res_lli, u8 res_ler) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u64 error_counter_summary = 0, tmp; + + error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL); + /* port_rcv_switch_relay_errors is 0 for HFIs */ + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL); + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL); + /* local link integrity must be right-shifted by the lli resolution */ + error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL) >> res_lli); + /* link error recovery must b right-shifted by the ler resolution */ + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); + error_counter_summary += (tmp >> res_ler); + error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL); + /* ppd->link_downed is a 32-bit value */ + error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + /* this is an 8-bit quantity */ + error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff; + + return error_counter_summary; +} + +static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + u64 sum_vl_xmit_wait = 0; + unsigned long vl_all_mask = VL_MASK_ALL; + + for_each_set_bit(vl, &vl_all_mask, BITS_PER_LONG) { + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +static void pma_get_opa_port_dctrs(struct ib_device *ibdev, + struct _port_dctrs *rsp) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); +} + +static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u32 port, u32 *resp_len) +{ + struct opa_port_data_counters_msg *req = + (struct opa_port_data_counters_msg *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct _port_dctrs *rsp; + struct _vls_dctrs *vlinfo; + size_t response_data_size; + u32 num_ports; + u8 lq, num_vls; + u8 res_lli, res_ler; + u64 port_mask; + u32 port_num; + unsigned long vl; + unsigned long vl_select_mask; + int vfi; + u16 link_width; + u16 link_speed; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + vl_select_mask = be32_to_cpu(req->vl_select_mask); + res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT; + res_lli = res_lli ? res_lli + ADD_LLI : 0; + res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT; + res_ler = res_ler ? res_ler + ADD_LER : 0; + + if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = struct_size(req, port.vls, num_vls); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = &req->port; + memset(rsp, 0, sizeof(*rsp)); + + rsp->port_number = port; + /* + * Note that link_quality_indicator is a 32 bit quantity in + * 'datacounters' queries (as opposed to 'portinfo' queries, + * where it's a byte). + */ + hfi1_read_link_quality(dd, &lq); + rsp->link_quality_indicator = cpu_to_be32((u32)lq); + pma_get_opa_port_dctrs(ibdev, rsp); + + /* + * Convert PortXmitWait counter from TXE + * cycle times to flit times. + */ + link_width = + tx_link_width(ppd->link_width_downgrade_tx_active); + link_speed = get_link_speed(ppd->link_speed_active); + rsp->port_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, C_VL_COUNT)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_error_counter_summary = + cpu_to_be64(get_error_counter_summary(ibdev, port, + res_lli, res_ler)); + + vlinfo = &rsp->vls[0]; + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_data = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + /* + * Convert PortVlXmitWait counter from TXE + * cycle times to flit times. + */ + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + /* rsp->port_vl_xmit_time_cong is 0 for HFIs */ + /* rsp->port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? + * does this differ from rsp->vls[vfi].port_vl_xmit_wait + */ + /*rsp->vls[vfi].port_vl_mark_fecn = + * cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + * + offset)); + */ + vlinfo++; + vfi++; + } + + a0_datacounters(ppd, rsp); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u32 port) +{ + struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *) + pmp->data; + struct _port_dctrs rsp; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_dctrs(ibdev, &rsp); + + p->port_xmit_data = rsp.port_xmit_data; + p->port_rcv_data = rsp.port_rcv_data; + p->port_xmit_packets = rsp.port_xmit_pkts; + p->port_rcv_packets = rsp.port_rcv_pkts; + p->port_unicast_xmit_packets = 0; + p->port_unicast_rcv_packets = 0; + p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts; + p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts; + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static void pma_get_opa_port_ectrs(struct ib_device *ibdev, + struct _port_ectrs *rsp, u32 port) +{ + u64 tmp, tmp2; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_switch_relay_errors = 0; + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); +} + +static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u32 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ectrs *rsp; + u32 port_num; + struct opa_port_error_counters64_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 num_ports; + u8 num_pslm; + u8 num_vls; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct _vls_ectrs *vlinfo; + unsigned long vl; + u64 port_mask, tmp; + unsigned long vl_select_mask; + int vfi; + + req = (struct opa_port_error_counters64_msg *)pmp->data; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + response_data_size = struct_size(req, port.vls, num_vls); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = &req->port; + + ibp = to_iport(ibdev, port_num); + ppd = ppd_from_ibp(ibp); + + memset(rsp, 0, sizeof(*rsp)); + rsp->port_number = port_num; + + pma_get_opa_port_ectrs(ibdev, rsp, port_num); + + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + vlinfo = &rsp->vls[0]; + vfi = 0; + vl_select_mask = be32_to_cpu(req->vl_select_mask); + for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) { + memset(vlinfo, 0, sizeof(*vlinfo)); + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); + vlinfo += 1; + vfi++; + } + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_ib_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u32 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct _port_ectrs rsp; + u64 temp_link_overrun_errors; + u64 temp_64; + u32 temp_32; + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_ectrs(ibdev, &rsp, port); + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + p->symbol_error_counter = 0; /* N/A for OPA */ + + temp_32 = be32_to_cpu(rsp.link_error_recovery); + if (temp_32 > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = (u8)temp_32; + + temp_32 = be32_to_cpu(rsp.link_downed); + if (temp_32 > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)temp_32; + + temp_64 = be64_to_cpu(rsp.port_rcv_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors); + p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_discards); + if (temp_64 > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_xmit_constraint_errors = 0xFF; + else + p->port_xmit_constraint_errors = (u8)temp_64; + + temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_rcv_constraint_errors = 0xFFUL; + else + p->port_rcv_constraint_errors = (u8)temp_64; + + /* LocalLink: 7:4, BufferOverrun: 3:0 */ + temp_64 = be64_to_cpu(rsp.local_link_integrity_errors); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + + temp_link_overrun_errors = temp_64 << 4; + + temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + temp_link_overrun_errors |= temp_64; + + p->link_overrun_errors = (u8)temp_link_overrun_errors; + + p->vl15_dropped = 0; /* N/A for OPA */ + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u32 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + u32 port_num; + u8 num_pslm; + u64 reg; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = &req->port; + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_error_info_msg); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + rsp->port_number = port; + + /* PortRcvErrorInfo */ + rsp->port_rcv_ei.status_and_code = + dd->err_info_rcvport.status_and_code; + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1, + &dd->err_info_rcvport.packet_flit1, sizeof(u64)); + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2, + &dd->err_info_rcvport.packet_flit2, sizeof(u64)); + + /* ExcessiverBufferOverrunInfo */ + reg = read_csr(dd, RCV_ERR_INFO); + if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) { + /* + * if the RcvExcessBufferOverrun bit is set, save SC of + * first pkt that encountered an excess buffer overrun + */ + u8 tmp = (u8)reg; + + tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK; + tmp <<= 2; + rsp->excessive_buffer_overrun_ei.status_and_sc = tmp; + /* set the status bit */ + rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80; + } + + rsp->port_xmit_constraint_ei.status = + dd->err_info_xmit_constraint.status; + rsp->port_xmit_constraint_ei.pkey = + cpu_to_be16(dd->err_info_xmit_constraint.pkey); + rsp->port_xmit_constraint_ei.slid = + cpu_to_be32(dd->err_info_xmit_constraint.slid); + + rsp->port_rcv_constraint_ei.status = + dd->err_info_rcv_constraint.status; + rsp->port_rcv_constraint_ei.pkey = + cpu_to_be16(dd->err_info_rcv_constraint.pkey); + rsp->port_rcv_constraint_ei.slid = + cpu_to_be32(dd->err_info_rcv_constraint.slid); + + /* UncorrectableErrorInfo */ + rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable; + + /* FMConfigErrorInfo */ + rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig; + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u32 port, u32 *resp_len) +{ + struct opa_clear_port_status *req = + (struct opa_clear_port_status *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u64 portn = be64_to_cpu(req->port_select_mask[3]); + u32 counter_select = be32_to_cpu(req->counter_select_mask); + unsigned long vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */ + unsigned long vl; + + if ((nports != 1) || (portn != 1 << port)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * only counters returned by pma_get_opa_portstatus() are + * handled, so when pma_get_opa_portstatus() gets a fix, + * the corresponding change should be made here as well. + */ + + if (counter_select & CS_PORT_XMIT_DATA) + write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_XMIT_PKTS) + write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_RCV_PKTS) + write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_WAIT) { + write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0); + ppd->port_vl_xmit_wait_last[C_VL_COUNT] = 0; + ppd->vl_xmit_flit_cnt[C_VL_COUNT] = 0; + } + /* ignore cs_sw_portCongestion for HFIs */ + + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0); + + /* ignore cs_port_xmit_time_cong for HFIs */ + /* ignore cs_port_xmit_wasted_bw for now */ + /* ignore cs_port_xmit_wait_data for now */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0); + + /* Only applicable for switch */ + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0); + */ + + if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0); + + /* ignore cs_port_rcv_switch_relay_errors for HFIs */ + if (counter_select & CS_PORT_XMIT_DISCARDS) + write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS) + write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LINK_ERROR_RECOVERY) { + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL, 0); + } + + if (counter_select & CS_PORT_RCV_ERRORS) + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) { + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + } + + if (counter_select & CS_FM_CONFIG_ERRORS) + write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LINK_DOWNED) + write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_UNCORRECTABLE_ERRORS) + write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0); + + for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) { + if (counter_select & CS_PORT_XMIT_DATA) + write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_WAIT) { + write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0); + ppd->port_vl_xmit_wait_last[idx_from_vl(vl)] = 0; + ppd->vl_xmit_flit_cnt[idx_from_vl(vl)] = 0; + } + + /* sw_port_vl_congestion is 0 for HFIs */ + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0); + + /* port_vl_xmit_time_cong is 0 for HFIs */ + /* port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0); + + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); + */ + if (counter_select & C_SW_XMIT_DSCD_VL) + write_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl), 0); + } + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u32 port, u32 *resp_len) +{ + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + u32 port_num; + u8 num_pslm; + u32 error_info_select; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = &req->port; + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + error_info_select = be32_to_cpu(req->error_info_select_mask); + + /* PortRcvErrorInfo */ + if (error_info_select & ES_PORT_RCV_ERROR_INFO) + /* turn off status bit */ + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + + /* ExcessiverBufferOverrunInfo */ + if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO) + /* + * status bit is essentially kept in the h/w - bit 5 of + * RCV_ERR_INFO + */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + + if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO) + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; + + if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO) + dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK; + + /* UncorrectableErrorInfo */ + if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO) + /* turn off status bit */ + dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK; + + /* FMConfigErrorInfo */ + if (error_info_select & ES_FM_CONFIG_ERROR_INFO) + /* turn off status bit */ + dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK; + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +struct opa_congestion_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ + u8 congestion_log_length; +} __packed; + +static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct opa_congestion_info_attr *p = + (struct opa_congestion_info_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + p->congestion_log_length = OPA_CONG_LOG_ELEMS; + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u32 port, u32 *resp_len, u32 max_len) +{ + int i; + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + struct cc_state *cc_state; + + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + entries = cc_state->cong_setting.entries; + p->port_control = cpu_to_be16(cc_state->cong_setting.port_control); + p->control_map = cpu_to_be32(cc_state->cong_setting.control_map); + for (i = 0; i < OPA_MAX_SLS; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = + entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + rcu_read_unlock(); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +/* + * Apply congestion control information stored in the ppd to the + * active structure. + */ +static void apply_cc_state(struct hfi1_pportdata *ppd) +{ + struct cc_state *old_cc_state, *new_cc_state; + + new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); + if (!new_cc_state) + return; + + /* + * Hold the lock for updating *and* to prevent ppd information + * from changing during the update. + */ + spin_lock(&ppd->cc_state_lock); + + old_cc_state = get_cc_state_protected(ppd); + if (!old_cc_state) { + /* never active, or shutting down */ + spin_unlock(&ppd->cc_state_lock); + kfree(new_cc_state); + return; + } + + *new_cc_state = *old_cc_state; + + if (ppd->total_cct_entry) + new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1; + else + new_cc_state->cct.ccti_limit = 0; + + memcpy(new_cc_state->cct.entries, ppd->ccti_entries, + ppd->total_cct_entry * sizeof(struct ib_cc_table_entry)); + + new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED; + new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map; + memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries, + OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry)); + + rcu_assign_pointer(ppd->cc_state, new_cc_state); + + spin_unlock(&ppd->cc_state_lock); + + kfree_rcu(old_cc_state, rcu); +} + +static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + int i; + + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. + */ + spin_lock(&ppd->cc_state_lock); + ppd->cc_sl_control_map = be32_to_cpu(p->control_map); + + entries = ppd->congestion_entries; + for (i = 0; i < OPA_MAX_SLS; i++) { + entries[i].ccti_increase = p->entries[i].ccti_increase; + entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer); + entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + entries[i].ccti_min = p->entries[i].ccti_min; + } + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); + + return __subn_get_opa_cong_setting(smp, am, data, ibdev, port, + resp_len, max_len); +} + +static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u32 port, u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data; + u64 ts; + int i; + + if (am || smp_length_check(sizeof(*cong_log), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + spin_lock_irq(&ppd->cc_log_lock); + + cong_log->log_type = OPA_CC_LOG_TYPE_HFI; + cong_log->congestion_flags = 0; + cong_log->threshold_event_counter = + cpu_to_be16(ppd->threshold_event_counter); + memcpy(cong_log->threshold_cong_event_map, + ppd->threshold_cong_event_map, + sizeof(cong_log->threshold_cong_event_map)); + /* keep timestamp in units of 1.024 usec */ + ts = ktime_get_ns() / 1024; + cong_log->current_time_stamp = cpu_to_be32(ts); + for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) { + struct opa_hfi1_cong_log_event_internal *cce = + &ppd->cc_events[ppd->cc_mad_idx++]; + if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_mad_idx = 0; + /* + * Entries which are older than twice the time + * required to wrap the counter are supposed to + * be zeroed (CA10-49 IBTA, release 1.2.1, V1). + */ + if ((ts - cce->timestamp) / 2 > U32_MAX) + continue; + memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3); + memcpy(cong_log->events[i].remote_qp_number_cn_entry, + &cce->rqpn, 3); + cong_log->events[i].sl_svc_type_cn_entry = + ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7); + cong_log->events[i].remote_lid_cn_entry = + cpu_to_be32(cce->rlid); + cong_log->events[i].timestamp_cn_entry = + cpu_to_be32(cce->timestamp); + } + + /* + * Reset threshold_cong_event_map, and threshold_event_counter + * to 0 when log is read. + */ + memset(ppd->threshold_cong_event_map, 0x0, + sizeof(ppd->threshold_cong_event_map)); + ppd->threshold_event_counter = 0; + + spin_unlock_irq(&ppd->cc_log_lock); + + if (resp_len) + *resp_len += sizeof(struct opa_hfi1_cong_log); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct ib_cc_table_attr *cc_table_attr = + (struct ib_cc_table_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + struct cc_state *cc_state; + u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || smp_length_check(size, max_len) || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + (IB_CCT_ENTRIES * n_blocks); + + cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit); + + entries = cc_state->cct.entries; + + /* return n_blocks, though the last block may not be full */ + for (j = 0, i = sentry; i < eentry; j++, i++) + cc_table_attr->ccti_entries[j].entry = + cpu_to_be16(entries[i].entry); + + rcu_read_unlock(); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + u16 ccti_limit; + u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || smp_length_check(size, max_len) || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) + + (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1; + + /* sanity check ccti_limit */ + ccti_limit = be16_to_cpu(p->ccti_limit); + if (ccti_limit + 1 > eentry) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. + */ + spin_lock(&ppd->cc_state_lock); + ppd->total_cct_entry = ccti_limit + 1; + entries = ppd->ccti_entries; + for (j = 0, i = sentry; i < eentry; j++, i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry); + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); + + return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len, + max_len); +} + +struct opa_led_info { + __be32 rsvd_led_mask; + __be32 rsvd; +}; + +#define OPA_LED_SHIFT 31 +#define OPA_LED_MASK BIT(OPA_LED_SHIFT) + +static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd = dd->pport; + struct opa_led_info *p = (struct opa_led_info *)data; + u32 nport = OPA_AM_NPORT(am); + u32 is_beaconing_active; + + if (nport != 1 || smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT); + + if (resp_len) + *resp_len += sizeof(struct opa_led_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_led_info *p = (struct opa_led_info *)data; + u32 nport = OPA_AM_NPORT(am); + int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK); + + if (nport != 1 || smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (on) + hfi1_start_led_override(dd->pport, 2000, 1500); + else + shutdown_led_override(dd->pport); + + return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_NODE_INFO: + ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_get_opa_psi(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_get_opa_bct(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_CABLE_INFO: + ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_CONGESTION_INFO: + ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_get_opa_cong_setting(smp, am, data, ibdev, + port, resp_len, max_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_LOG: + ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev, + port, resp_len, max_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_get_opa_led_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + fallthrough; + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u32 port, + u32 *resp_len, u32 max_len, int local_mad) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, + resp_len, max_len, local_mad); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_set_opa_psi(smp, am, data, ibdev, port, + resp_len, max_len, local_mad); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_set_opa_bct(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_set_opa_cong_setting(smp, am, data, ibdev, + port, resp_len, max_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_set_opa_led_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + fallthrough; + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static inline void set_aggr_error(struct opa_aggregate *ag) +{ + ag->err_reqlength |= cpu_to_be16(0x8000); +} + +static int subn_get_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u32 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* zero the payload for this segment */ + memset(next_smp + sizeof(*agg), 0, agg_data_len); + + (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL, (u32)agg_data_len); + + if (smp->status & IB_SMP_INVALID_FIELD) + break; + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + } + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_set_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u32 port, + u32 *resp_len, int local_mad) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL, (u32)agg_data_len, + local_mad); + + if (smp->status & IB_SMP_INVALID_FIELD) + break; + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + } + + return reply((struct ib_mad_hdr *)smp); +} + +/* + * OPAv1 specifies that, on the transition to link up, these counters + * are cleared: + * PortRcvErrors [*] + * LinkErrorRecovery + * LocalLinkIntegrityErrors + * ExcessiveBufferOverruns [*] + * + * [*] Error info associated with these counters is retained, but the + * error info status is reset to 0. + */ +void clear_linkup_counters(struct hfi1_devdata *dd) +{ + /* PortRcvErrors */ + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + /* LinkErrorRecovery */ + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0); + /* LocalLinkIntegrityErrors */ + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + /* ExcessiveBufferOverruns */ + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; +} + +static int is_full_mgmt_pkey_in_table(struct hfi1_ibport *ibp) +{ + unsigned int i; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) + if (ppd->pkeys[i] == FULL_MGMT_P_KEY) + return 1; + + return 0; +} + +/* + * is_local_mad() returns 1 if 'mad' is sent from, and destined to the + * local node, 0 otherwise. + */ +static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + const struct opa_smp *smp = (const struct opa_smp *)mad; + + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + return (smp->hop_cnt == 0 && + smp->route.dr.dr_slid == OPA_LID_PERMISSIVE && + smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE); + } + + return (in_wc->slid == ppd->lid); +} + +/* + * opa_local_smp_check() should only be called on MADs for which + * is_local_mad() returns true. It applies the SMP checks that are + * specific to SMPs which are sent from, and destined to this node. + * opa_local_smp_check() returns 0 if the SMP passes its checks, 1 + * otherwise. + * + * SMPs which arrive from other nodes are instead checked by + * opa_smp_check(). + */ +static int opa_local_smp_check(struct hfi1_ibport *ibp, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 pkey; + + if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) + return 1; + + pkey = ppd->pkeys[in_wc->pkey_index]; + /* + * We need to do the "node-local" checks specified in OPAv1, + * rev 0.90, section 9.10.26, which are: + * - pkey is 0x7fff, or 0xffff + * - Source QPN == 0 || Destination QPN == 0 + * - the MAD header's management class is either + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or + * IB_MGMT_CLASS_SUBN_LID_ROUTED + * - SLID != 0 + * + * However, we know (and so don't need to check again) that, + * for local SMPs, the MAD stack passes MADs with: + * - Source QPN of 0 + * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or + * our own port's lid + * + */ + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) + return 0; + ingress_pkey_table_fail(ppd, pkey, in_wc->slid); + return 1; +} + +/** + * hfi1_pkey_validation_pma - It validates PKEYs for incoming PMA MAD packets. + * @ibp: IB port data + * @in_mad: MAD packet with header and data + * @in_wc: Work completion data such as source LID, port number, etc. + * + * These are all the possible logic rules for validating a pkey: + * + * a) If pkey neither FULL_MGMT_P_KEY nor LIM_MGMT_P_KEY, + * and NOT self-originated packet: + * Drop MAD packet as it should always be part of the + * management partition unless it's a self-originated packet. + * + * b) If pkey_index -> FULL_MGMT_P_KEY, and LIM_MGMT_P_KEY in pkey table: + * The packet is coming from a management node and the receiving node + * is also a management node, so it is safe for the packet to go through. + * + * c) If pkey_index -> FULL_MGMT_P_KEY, and LIM_MGMT_P_KEY is NOT in pkey table: + * Drop the packet as LIM_MGMT_P_KEY should always be in the pkey table. + * It could be an FM misconfiguration. + * + * d) If pkey_index -> LIM_MGMT_P_KEY and FULL_MGMT_P_KEY is NOT in pkey table: + * It is safe for the packet to go through since a non-management node is + * talking to another non-management node. + * + * e) If pkey_index -> LIM_MGMT_P_KEY and FULL_MGMT_P_KEY in pkey table: + * Drop the packet because a non-management node is talking to a + * management node, and it could be an attack. + * + * For the implementation, these rules can be simplied to only checking + * for (a) and (e). There's no need to check for rule (b) as + * the packet doesn't need to be dropped. Rule (c) is not possible in + * the driver as LIM_MGMT_P_KEY is always in the pkey table. + * + * Return: + * 0 - pkey is okay, -EINVAL it's a bad pkey + */ +static int hfi1_pkey_validation_pma(struct hfi1_ibport *ibp, + const struct opa_mad *in_mad, + const struct ib_wc *in_wc) +{ + u16 pkey_value = hfi1_lookup_pkey_value(ibp, in_wc->pkey_index); + + /* Rule (a) from above */ + if (!is_local_mad(ibp, in_mad, in_wc) && + pkey_value != LIM_MGMT_P_KEY && + pkey_value != FULL_MGMT_P_KEY) + return -EINVAL; + + /* Rule (e) from above */ + if (pkey_value == LIM_MGMT_P_KEY && + is_full_mgmt_pkey_in_table(ibp)) + return -EINVAL; + + return 0; +} + +static int process_subn_opa(struct ib_device *ibdev, int mad_flags, + u32 port, const struct opa_mad *in_mad, + struct opa_mad *out_mad, + u32 *resp_len, int local_mad) +{ + struct opa_smp *smp = (struct opa_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *data; + u32 am, data_size; + __be16 attr_id; + int ret; + + *out_mad = *in_mad; + data = opa_get_smp_data(smp); + data_size = (u32)opa_get_smp_data_size(smp); + + am = be32_to_cpu(smp->attr_mod); + attr_id = smp->attr_id; + if (smp->class_version != OPA_SM_CLASS_VERSION) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + return ret; + } + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey, + smp->route.dr.dr_slid, smp->route.dr.return_path, + smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, smp->route.dr.dr_slid, + smp->route.dr.return_path, + smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + return ret; + } + + *resp_len = opa_get_smp_header_size(smp); + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (attr_id) { + default: + clear_opa_smp_data(smp); + ret = subn_get_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len, + data_size); + break; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_get_opa_aggregate(smp, ibdev, port, + resp_len); + break; + } + break; + case IB_MGMT_METHOD_SET: + switch (attr_id) { + default: + ret = subn_set_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len, + data_size, local_mad); + break; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_set_opa_aggregate(smp, ibdev, port, + resp_len, local_mad); + break; + } + break; + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + case IB_MGMT_METHOD_TRAP_REPRESS: + subn_handle_opa_trap_repress(ibp, smp); + /* Always successful */ + ret = IB_MAD_RESULT_SUCCESS; + break; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + + return ret; +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u32 port, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + return ret; + } + + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, + smp->mkey, (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, + (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + return ret; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + break; + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + break; + } + + return ret; +} + +static int process_perf(struct ib_device *ibdev, u32 port, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + struct ib_class_port_info *cpi = (struct ib_class_port_info *) + &pmp->data; + int ret = IB_MAD_RESULT_FAILURE; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)pmp); + return ret; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_COUNTERS: + ret = pma_get_ib_portcounters(pmp, ibdev, port); + break; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_ib_portcounters_ext(pmp, ibdev, port); + break; + case IB_PMA_CLASS_PORT_INFO: + cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + ret = reply((struct ib_mad_hdr *)pmp); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + if (pmp->mad_hdr.attr_id) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + + return ret; +} + +static int process_perf_opa(struct ib_device *ibdev, u32 port, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, u32 *resp_len) +{ + struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + + if (pmp->mad_hdr.class_version != OPA_SM_CLASS_VERSION) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + return reply((struct ib_mad_hdr *)pmp); + } + + *resp_len = sizeof(pmp->mad_hdr); + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len); + break; + case OPA_PM_ATTRIB_ID_PORT_STATUS: + ret = pma_get_opa_portstatus(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS: + ret = pma_get_opa_datacounters(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS: + ret = pma_get_opa_porterrors(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_get_opa_errorinfo(pmp, ibdev, port, + resp_len); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS: + ret = pma_set_opa_portstatus(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_set_opa_errorinfo(pmp, ibdev, port, + resp_len); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + + return ret; +} + +static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, + u32 port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + int pkey_idx; + int local_mad = 0; + u32 resp_len = in_wc->byte_len - sizeof(*in_grh); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n", + hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + *out_mad_pkey_index = (u16)pkey_idx; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + local_mad = is_local_mad(ibp, in_mad, in_wc); + if (local_mad) { + ret = opa_local_smp_check(ibp, in_wc); + if (ret) + return IB_MAD_RESULT_FAILURE; + } + ret = process_subn_opa(ibdev, mad_flags, port, in_mad, + out_mad, &resp_len, local_mad); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = hfi1_pkey_validation_pma(ibp, in_mad, in_wc); + if (ret) + return IB_MAD_RESULT_FAILURE; + + ret = process_perf_opa(ibdev, port, in_mad, out_mad, &resp_len); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + if (ret & IB_MAD_RESULT_REPLY) + *out_mad_size = round_up(resp_len, 8); + else if (ret & IB_MAD_RESULT_SUCCESS) + *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh); + + return ret; +} + +static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u32 port, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int ret; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + break; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + break; + default: + ret = IB_MAD_RESULT_SUCCESS; + break; + } + + return ret; +} + +/** + * hfi1_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * @out_mad_size: size of the outgoing MAD reply + * @out_mad_pkey_index: used to apss back the packet key index + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u32 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index) +{ + switch (in_mad->mad_hdr.base_version) { + case OPA_MGMT_BASE_VERSION: + return hfi1_process_opa_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (struct opa_mad *)in_mad, + (struct opa_mad *)out_mad, + out_mad_size, + out_mad_pkey_index); + case IB_MGMT_BASE_VERSION: + return hfi1_process_ib_mad(ibdev, mad_flags, port, in_wc, + in_grh, in_mad, out_mad); + default: + break; + } + + return IB_MAD_RESULT_FAILURE; +} diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h new file mode 100644 index 0000000000..1d45a008fa --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -0,0 +1,437 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + */ + +#ifndef _HFI1_MAD_H +#define _HFI1_MAD_H + +#include <rdma/ib_pma.h> +#include <rdma/opa_smi.h> +#include <rdma/opa_port_info.h> +#include "opa_compat.h" + +/* + * OPA Traps + */ +#define OPA_TRAP_GID_NOW_IN_SERVICE cpu_to_be16(64) +#define OPA_TRAP_GID_OUT_OF_SERVICE cpu_to_be16(65) +#define OPA_TRAP_ADD_MULTICAST_GROUP cpu_to_be16(66) +#define OPA_TRAL_DEL_MULTICAST_GROUP cpu_to_be16(67) +#define OPA_TRAP_UNPATH cpu_to_be16(68) +#define OPA_TRAP_REPATH cpu_to_be16(69) +#define OPA_TRAP_PORT_CHANGE_STATE cpu_to_be16(128) +#define OPA_TRAP_LINK_INTEGRITY cpu_to_be16(129) +#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN cpu_to_be16(130) +#define OPA_TRAP_FLOW_WATCHDOG cpu_to_be16(131) +#define OPA_TRAP_CHANGE_CAPABILITY cpu_to_be16(144) +#define OPA_TRAP_CHANGE_SYSGUID cpu_to_be16(145) +#define OPA_TRAP_BAD_M_KEY cpu_to_be16(256) +#define OPA_TRAP_BAD_P_KEY cpu_to_be16(257) +#define OPA_TRAP_BAD_Q_KEY cpu_to_be16(258) +#define OPA_TRAP_SWITCH_BAD_PKEY cpu_to_be16(259) +#define OPA_SMA_TRAP_DATA_LINK_WIDTH cpu_to_be16(2048) + +/* + * Generic trap/notice other local changes flags (trap 144). + */ +#define OPA_NOTICE_TRAP_LWDE_CHG 0x08 /* Link Width Downgrade Enable + * changed + */ +#define OPA_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define OPA_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define OPA_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +struct opa_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 toggle_count; + __be32 issuer_lid; + __be32 reserved1; + union ib_gid issuer_gid; + + union { + struct { + u8 details[64]; + } raw_data; + + struct { + union ib_gid gid; + } __packed ntc_64_65_66_67; + + struct { + __be32 lid; + } __packed ntc_128; + + struct { + __be32 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_130_131; + + struct { + __be32 lid; /* LID where change occurred */ + __be32 new_cap_mask; /* new capability mask */ + __be16 reserved2; + __be16 cap_mask3; + __be16 change_flags; /* low 4 bits only */ + } __packed ntc_144; + + struct { + __be64 new_sys_guid; + __be32 lid; /* lid where sys guid changed */ + } __packed ntc_145; + + struct { + __be32 lid; + __be32 dr_slid; + u8 method; + u8 dr_trunc_hop; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be32 lid1; + __be32 lid2; + __be32 key; + u8 sl; /* SL: high 5 bits */ + u8 reserved3[3]; + union ib_gid gid1; + union ib_gid gid2; + __be32 qp1; /* high 8 bits reserved */ + __be32 qp2; /* high 8 bits reserved */ + } __packed ntc_257_258; + + struct { + __be16 flags; /* low 8 bits reserved */ + __be16 pkey; + __be32 lid1; + __be32 lid2; + u8 sl; /* SL: high 5 bits */ + u8 reserved4[3]; + union ib_gid gid1; + union ib_gid gid2; + __be32 qp1; /* high 8 bits reserved */ + __be32 qp2; /* high 8 bits reserved */ + } __packed ntc_259; + + struct { + __be32 lid; + } __packed ntc_2048; + + }; + u8 class_data[]; +}; + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) +#define LINK_SPEED_25G 1 +#define LINK_SPEED_12_5G 2 +#define LINK_WIDTH_DEFAULT 4 +#define DECIMAL_FACTORING 1000 +/* + * The default link width is multiplied by 1000 + * to get accurate value after division. + */ +#define FACTOR_LINK_WIDTH (LINK_WIDTH_DEFAULT * DECIMAL_FACTORING) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define HFI1_XMIT_RATE_UNSUPPORTED 0x0 +#define HFI1_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_CC_SVCTYPE_RC 0x0 +#define IB_CC_SVCTYPE_UC 0x1 +#define IB_CC_SVCTYPE_RD 0x2 +#define IB_CC_SVCTYPE_UD 0x3 + +/* + * There should be an equivalent IB #define for the following, but + * I cannot find it. + */ +#define OPA_CC_LOG_TYPE_HFI 2 + +struct opa_hfi1_cong_log_event_internal { + u32 lqpn; + u32 rqpn; + u8 sl; + u8 svc_type; + u32 rlid; + u64 timestamp; /* wider than 32 bits to detect 32 bit rollover */ +}; + +struct opa_hfi1_cong_log_event { + u8 local_qp_cn_entry[3]; + u8 remote_qp_number_cn_entry[3]; + u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */ + u8 reserved; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +#define OPA_CONG_LOG_ELEMS 96 + +struct opa_hfi1_cong_log { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be32 current_time_stamp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS]; +} __packed; + +#define IB_CC_TABLE_CAP_DEFAULT 31 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct opa_congestion_setting_entry { + u8 ccti_increase; + u8 reserved; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_entry_shadow { + u8 ccti_increase; + u8 reserved; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_attr { + __be32 control_map; + __be16 port_control; + struct opa_congestion_setting_entry entries[OPA_MAX_SLS]; +} __packed; + +struct opa_congestion_setting_attr_shadow { + u32 control_map; + u16 port_control; + struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * struct cc_state combines the (active) per-port congestion control + * table, and the (active) per-SL congestion settings. cc_state data + * may need to be read in code paths that we want to be fast, so it + * is an RCU protected structure. + */ +struct cc_state { + struct rcu_head rcu; + struct cc_table_shadow cct; + struct opa_congestion_setting_attr_shadow cong_setting; +}; + +/* + * OPA BufferControl MAD + */ + +/* attribute modifier macros */ +#define OPA_AM_NPORT_SHIFT 24 +#define OPA_AM_NPORT_MASK 0xff +#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT) +#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \ + OPA_AM_NPORT_MASK) + +#define OPA_AM_NBLK_SHIFT 24 +#define OPA_AM_NBLK_MASK 0xff +#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT) +#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \ + OPA_AM_NBLK_MASK) + +#define OPA_AM_START_BLK_SHIFT 0 +#define OPA_AM_START_BLK_MASK 0xff +#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \ + OPA_AM_START_BLK_SHIFT) +#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \ + OPA_AM_START_BLK_MASK) + +#define OPA_AM_PORTNUM_SHIFT 0 +#define OPA_AM_PORTNUM_MASK 0xff +#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT) +#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \ + OPA_AM_PORTNUM_MASK) + +#define OPA_AM_ASYNC_SHIFT 12 +#define OPA_AM_ASYNC_MASK 0x1 +#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT) +#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \ + OPA_AM_ASYNC_MASK) + +#define OPA_AM_START_SM_CFG_SHIFT 9 +#define OPA_AM_START_SM_CFG_MASK 0x1 +#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \ + OPA_AM_START_SM_CFG_SHIFT) +#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \ + & OPA_AM_START_SM_CFG_MASK) + +#define OPA_AM_CI_ADDR_SHIFT 19 +#define OPA_AM_CI_ADDR_MASK 0xfff +#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT) +#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \ + OPA_AM_CI_ADDR_MASK) + +#define OPA_AM_CI_LEN_SHIFT 13 +#define OPA_AM_CI_LEN_MASK 0x3f +#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT) +#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \ + OPA_AM_CI_LEN_MASK) + +/* error info macros */ +#define OPA_EI_STATUS_SMASK 0x80 +#define OPA_EI_CODE_SMASK 0x0f + +struct vl_limit { + __be16 dedicated; + __be16 shared; +}; + +struct buffer_control { + __be16 reserved; + __be16 overall_shared_limit; + struct vl_limit vl[OPA_MAX_VLS]; +}; + +struct sc2vlnt { + u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */ +}; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +void hfi1_event_pkey_change(struct hfi1_devdata *dd, u32 port); +void hfi1_handle_trap_timer(struct timer_list *t); +u16 tx_link_width(u16 link_width); +u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, u16 link_width, + u16 link_speed, int vl); +/** + * get_link_speed - determine whether 12.5G or 25G speed + * @link_speed: the speed of active link + * @return: Return 2 if link speed identified as 12.5G + * or return 1 if link speed is 25G. + * + * The function indirectly calculate required link speed + * value for convert_xmit_counter function. If the link + * speed is 25G, the function return as 1 as it is required + * by xmit counter conversion formula :-( 25G / link_speed). + * This conversion will provide value 1 if current + * link speed is 25G or 2 if 12.5G.This is done to avoid + * 12.5 float number conversion. + */ +static inline u16 get_link_speed(u16 link_speed) +{ + return (link_speed == 1) ? + LINK_SPEED_12_5G : LINK_SPEED_25G; +} + +/** + * convert_xmit_counter - calculate flit times for given xmit counter + * value + * @xmit_wait_val: current xmit counter value + * @link_width: width of active link + * @link_speed: speed of active link + * @return: return xmit counter value in flit times. + */ +static inline u64 convert_xmit_counter(u64 xmit_wait_val, u16 link_width, + u16 link_speed) +{ + return (xmit_wait_val * 2 * (FACTOR_LINK_WIDTH / link_width) + * link_speed) / DECIMAL_FACTORING; +} +#endif /* _HFI1_MAD_H */ diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c new file mode 100644 index 0000000000..7a51f7d73b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2016 - 2017 Intel Corporation. + */ + +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/mmu_notifier.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched/mm.h> + +#include "mmu_rb.h" +#include "trace.h" + +static unsigned long mmu_node_start(struct mmu_rb_node *); +static unsigned long mmu_node_last(struct mmu_rb_node *); +static int mmu_notifier_range_start(struct mmu_notifier *, + const struct mmu_notifier_range *); +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, + unsigned long, unsigned long); +static void release_immediate(struct kref *refcount); +static void handle_remove(struct work_struct *work); + +static const struct mmu_notifier_ops mn_opts = { + .invalidate_range_start = mmu_notifier_range_start, +}; + +INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last, + mmu_node_start, mmu_node_last, static, __mmu_int_rb); + +static unsigned long mmu_node_start(struct mmu_rb_node *node) +{ + return node->addr & PAGE_MASK; +} + +static unsigned long mmu_node_last(struct mmu_rb_node *node) +{ + return PAGE_ALIGN(node->addr + node->len) - 1; +} + +int hfi1_mmu_rb_register(void *ops_arg, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler) +{ + struct mmu_rb_handler *h; + void *free_ptr; + int ret; + + free_ptr = kzalloc(sizeof(*h) + cache_line_size() - 1, GFP_KERNEL); + if (!free_ptr) + return -ENOMEM; + + h = PTR_ALIGN(free_ptr, cache_line_size()); + h->root = RB_ROOT_CACHED; + h->ops = ops; + h->ops_arg = ops_arg; + INIT_HLIST_NODE(&h->mn.hlist); + spin_lock_init(&h->lock); + h->mn.ops = &mn_opts; + INIT_WORK(&h->del_work, handle_remove); + INIT_LIST_HEAD(&h->del_list); + INIT_LIST_HEAD(&h->lru_list); + h->wq = wq; + h->free_ptr = free_ptr; + + ret = mmu_notifier_register(&h->mn, current->mm); + if (ret) { + kfree(free_ptr); + return ret; + } + + *handler = h; + return 0; +} + +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) +{ + struct mmu_rb_node *rbnode; + struct rb_node *node; + unsigned long flags; + struct list_head del_list; + + /* Prevent freeing of mm until we are completely finished. */ + mmgrab(handler->mn.mm); + + /* Unregister first so we don't get any more notifications. */ + mmu_notifier_unregister(&handler->mn, handler->mn.mm); + + /* + * Make sure the wq delete handler is finished running. It will not + * be triggered once the mmu notifiers are unregistered above. + */ + flush_work(&handler->del_work); + + INIT_LIST_HEAD(&del_list); + + spin_lock_irqsave(&handler->lock, flags); + while ((node = rb_first_cached(&handler->root))) { + rbnode = rb_entry(node, struct mmu_rb_node, node); + rb_erase_cached(node, &handler->root); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); + } + spin_unlock_irqrestore(&handler->lock, flags); + + while (!list_empty(&del_list)) { + rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&rbnode->list); + kref_put(&rbnode->refcount, release_immediate); + } + + /* Now the mm may be freed. */ + mmdrop(handler->mn.mm); + + kfree(handler->free_ptr); +} + +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode) +{ + struct mmu_rb_node *node; + unsigned long flags; + int ret = 0; + + trace_hfi1_mmu_rb_insert(mnode); + + if (current->mm != handler->mn.mm) + return -EPERM; + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, mnode->addr, mnode->len); + if (node) { + ret = -EEXIST; + goto unlock; + } + __mmu_int_rb_insert(mnode, &handler->root); + list_add_tail(&mnode->list, &handler->lru_list); + mnode->handler = handler; +unlock: + spin_unlock_irqrestore(&handler->lock, flags); + return ret; +} + +/* Caller must hold handler lock */ +struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len) +{ + struct mmu_rb_node *node; + + trace_hfi1_mmu_rb_search(addr, len); + node = __mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1); + if (node) + list_move_tail(&node->list, &handler->lru_list); + return node; +} + +/* Caller must hold handler lock */ +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, + unsigned long addr, + unsigned long len) +{ + struct mmu_rb_node *node = NULL; + + trace_hfi1_mmu_rb_search(addr, len); + if (!handler->ops->filter) { + node = __mmu_int_rb_iter_first(&handler->root, addr, + (addr + len) - 1); + } else { + for (node = __mmu_int_rb_iter_first(&handler->root, addr, + (addr + len) - 1); + node; + node = __mmu_int_rb_iter_next(node, addr, + (addr + len) - 1)) { + if (handler->ops->filter(node, addr, len)) + return node; + } + } + return node; +} + +/* + * Must NOT call while holding mnode->handler->lock. + * mnode->handler->ops->remove() may sleep and mnode->handler->lock is a + * spinlock. + */ +static void release_immediate(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + trace_hfi1_mmu_release_node(mnode); + mnode->handler->ops->remove(mnode->handler->ops_arg, mnode); +} + +/* Caller must hold mnode->handler->lock */ +static void release_nolock(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + list_move(&mnode->list, &mnode->handler->del_list); + queue_work(mnode->handler->wq, &mnode->handler->del_work); +} + +/* + * struct mmu_rb_node->refcount kref_put() callback. + * Adds mmu_rb_node to mmu_rb_node->handler->del_list and queues + * handler->del_work on handler->wq. + * Does not remove mmu_rb_node from handler->lru_list or handler->rb_root. + * Acquires mmu_rb_node->handler->lock; do not call while already holding + * handler->lock. + */ +void hfi1_mmu_rb_release(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + struct mmu_rb_handler *handler = mnode->handler; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + list_move(&mnode->list, &mnode->handler->del_list); + spin_unlock_irqrestore(&handler->lock, flags); + queue_work(handler->wq, &handler->del_work); +} + +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) +{ + struct mmu_rb_node *rbnode, *ptr; + struct list_head del_list; + unsigned long flags; + bool stop = false; + + if (current->mm != handler->mn.mm) + return; + + INIT_LIST_HEAD(&del_list); + + spin_lock_irqsave(&handler->lock, flags); + list_for_each_entry_safe(rbnode, ptr, &handler->lru_list, list) { + /* refcount == 1 implies mmu_rb_handler has only rbnode ref */ + if (kref_read(&rbnode->refcount) > 1) + continue; + + if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, + &stop)) { + __mmu_int_rb_remove(rbnode, &handler->root); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); + } + if (stop) + break; + } + spin_unlock_irqrestore(&handler->lock, flags); + + list_for_each_entry_safe(rbnode, ptr, &del_list, list) { + trace_hfi1_mmu_rb_evict(rbnode); + kref_put(&rbnode->refcount, release_immediate); + } +} + +static int mmu_notifier_range_start(struct mmu_notifier *mn, + const struct mmu_notifier_range *range) +{ + struct mmu_rb_handler *handler = + container_of(mn, struct mmu_rb_handler, mn); + struct rb_root_cached *root = &handler->root; + struct mmu_rb_node *node, *ptr = NULL; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1); + node; node = ptr) { + /* Guard against node removal. */ + ptr = __mmu_int_rb_iter_next(node, range->start, + range->end - 1); + trace_hfi1_mmu_mem_invalidate(node); + /* Remove from rb tree and lru_list. */ + __mmu_int_rb_remove(node, root); + list_del_init(&node->list); + kref_put(&node->refcount, release_nolock); + } + spin_unlock_irqrestore(&handler->lock, flags); + + return 0; +} + +/* + * Work queue function to remove all nodes that have been queued up to + * be removed. The key feature is that mm->mmap_lock is not being held + * and the remove callback can sleep while taking it, if needed. + */ +static void handle_remove(struct work_struct *work) +{ + struct mmu_rb_handler *handler = container_of(work, + struct mmu_rb_handler, + del_work); + struct list_head del_list; + unsigned long flags; + struct mmu_rb_node *node; + + /* remove anything that is queued to get removed */ + spin_lock_irqsave(&handler->lock, flags); + list_replace_init(&handler->del_list, &del_list); + spin_unlock_irqrestore(&handler->lock, flags); + + while (!list_empty(&del_list)) { + node = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&node->list); + trace_hfi1_mmu_release_node(node); + handler->ops->remove(handler->ops_arg, node); + } +} diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h new file mode 100644 index 0000000000..751dc3fe1e --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2016 Intel Corporation. + */ + +#ifndef _HFI1_MMU_RB_H +#define _HFI1_MMU_RB_H + +#include "hfi.h" + +struct mmu_rb_node { + unsigned long addr; + unsigned long len; + unsigned long __last; + struct rb_node node; + struct mmu_rb_handler *handler; + struct list_head list; + struct kref refcount; +}; + +/* filter and evict must not sleep. Only remove is allowed to sleep. */ +struct mmu_rb_ops { + bool (*filter)(struct mmu_rb_node *node, unsigned long addr, + unsigned long len); + void (*remove)(void *ops_arg, struct mmu_rb_node *mnode); + int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop); +}; + +struct mmu_rb_handler { + /* + * struct mmu_notifier is 56 bytes, and spinlock_t is 4 bytes, so + * they fit together in one cache line. mn is relatively rarely + * accessed, so co-locating the spinlock with it achieves much of + * the cacheline contention reduction of giving the spinlock its own + * cacheline without the overhead of doing so. + */ + struct mmu_notifier mn; + spinlock_t lock; /* protect the RB tree */ + + /* Begin on a new cachline boundary here */ + struct rb_root_cached root ____cacheline_aligned_in_smp; + void *ops_arg; + struct mmu_rb_ops *ops; + struct list_head lru_list; + struct work_struct del_work; + struct list_head del_list; + struct workqueue_struct *wq; + void *free_ptr; +}; + +int hfi1_mmu_rb_register(void *ops_arg, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler); +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +void hfi1_mmu_rb_release(struct kref *refcount); + +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); +struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler, + unsigned long addr, + unsigned long len); + +#endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c new file mode 100644 index 0000000000..77d2ece9a9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 - 2020 Intel Corporation. + */ + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" +#include "netdev.h" + +/** + * msix_initialize() - Calculate, request and configure MSIx IRQs + * @dd: valid hfi1 devdata + * + */ +int msix_initialize(struct hfi1_devdata *dd) +{ + u32 total; + int ret; + struct hfi1_msix_entry *entries; + + /* + * MSIx interrupt count: + * one for the general, "slow path" interrupt + * one per used SDMA engine + * one per kernel receive context + * one for each VNIC context + * ...any new IRQs should be added here. + */ + total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_netdev_contexts; + + if (total >= CCE_NUM_MSIX_VECTORS) + return -EINVAL; + + ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX); + if (ret < 0) { + dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret); + return ret; + } + + entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries), + GFP_KERNEL); + if (!entries) { + pci_free_irq_vectors(dd->pcidev); + return -ENOMEM; + } + + dd->msix_info.msix_entries = entries; + spin_lock_init(&dd->msix_info.msix_lock); + bitmap_zero(dd->msix_info.in_use_msix, total); + dd->msix_info.max_requested = total; + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + + return 0; +} + +/** + * msix_request_irq() - Allocate a free MSIx IRQ + * @dd: valid devdata + * @arg: context information for the IRQ + * @handler: IRQ handler + * @thread: IRQ thread handler (could be NULL) + * @type: affinty IRQ type + * @name: IRQ name + * + * Allocated an MSIx vector if available, and then create the appropriate + * meta data needed to keep track of the pci IRQ request. + * + * Return: + * < 0 Error + * >= 0 MSIx vector + * + */ +static int msix_request_irq(struct hfi1_devdata *dd, void *arg, + irq_handler_t handler, irq_handler_t thread, + enum irq_type type, const char *name) +{ + unsigned long nr; + int irq; + int ret; + struct hfi1_msix_entry *me; + + /* Allocate an MSIx vector */ + spin_lock(&dd->msix_info.msix_lock); + nr = find_first_zero_bit(dd->msix_info.in_use_msix, + dd->msix_info.max_requested); + if (nr < dd->msix_info.max_requested) + __set_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + + if (nr == dd->msix_info.max_requested) + return -ENOSPC; + + if (type < IRQ_SDMA || type >= IRQ_OTHER) + return -EINVAL; + + irq = pci_irq_vector(dd->pcidev, nr); + ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name); + if (ret) { + dd_dev_err(dd, + "%s: request for IRQ %d failed, MSIx %lx, err %d\n", + name, irq, nr, ret); + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + return ret; + } + + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me = &dd->msix_info.msix_entries[nr]; + me->irq = irq; + me->arg = arg; + me->type = type; + + /* This is a request, so a failure is not fatal */ + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, "%s: unable to pin IRQ %d\n", name, ret); + + return nr; +} + +static int msix_request_rcd_irq_common(struct hfi1_ctxtdata *rcd, + irq_handler_t handler, + irq_handler_t thread, + const char *name) +{ + int nr = msix_request_irq(rcd->dd, rcd, handler, thread, + rcd->is_vnic ? IRQ_NETDEVCTXT : IRQ_RCVCTXT, + name); + if (nr < 0) + return nr; + + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64; + rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64); + rcd->msix_intr = nr; + remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr); + + return 0; +} + +/** + * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs + * @rcd: valid rcd context + * + */ +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd) +{ + char name[MAX_NAME_SIZE]; + + snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d", + rcd->dd->unit, rcd->ctxt); + + return msix_request_rcd_irq_common(rcd, receive_context_interrupt, + receive_context_thread, name); +} + +/** + * msix_netdev_request_rcd_irq - Helper function for RCVAVAIL IRQs + * for netdev context + * @rcd: valid netdev contexti + */ +int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd) +{ + char name[MAX_NAME_SIZE]; + + snprintf(name, sizeof(name), DRIVER_NAME "_%d nd kctxt%d", + rcd->dd->unit, rcd->ctxt); + return msix_request_rcd_irq_common(rcd, receive_context_interrupt_napi, + NULL, name); +} + +/** + * msix_request_sdma_irq - Helper for getting SDMA IRQ resources + * @sde: valid sdma engine + * + */ +int msix_request_sdma_irq(struct sdma_engine *sde) +{ + int nr; + char name[MAX_NAME_SIZE]; + + snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d", + sde->dd->unit, sde->this_idx); + nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL, + IRQ_SDMA, name); + if (nr < 0) + return nr; + sde->msix_intr = nr; + remap_sdma_interrupts(sde->dd, sde->this_idx, nr); + + return 0; +} + +/** + * msix_request_general_irq - Helper for getting general IRQ + * resources + * @dd: valid device data + */ +int msix_request_general_irq(struct hfi1_devdata *dd) +{ + int nr; + char name[MAX_NAME_SIZE]; + + snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit); + nr = msix_request_irq(dd, dd, general_interrupt, NULL, IRQ_GENERAL, + name); + if (nr < 0) + return nr; + + /* general interrupt must be MSIx vector 0 */ + if (nr) { + msix_free_irq(dd, (u8)nr); + dd_dev_err(dd, "Invalid index %d for GENERAL IRQ\n", nr); + return -EINVAL; + } + + return 0; +} + +/** + * enable_sdma_srcs - Helper to enable SDMA IRQ srcs + * @dd: valid devdata structure + * @i: index of SDMA engine + */ +static void enable_sdma_srcs(struct hfi1_devdata *dd, int i) +{ + set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true); + set_intr_bits(dd, IS_SDMA_PROGRESS_START + i, + IS_SDMA_PROGRESS_START + i, true); + set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true); + set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i, + true); +} + +/** + * msix_request_irqs() - Allocate all MSIx IRQs + * @dd: valid devdata structure + * + * Helper function to request the used MSIx IRQs. + * + */ +int msix_request_irqs(struct hfi1_devdata *dd) +{ + int i; + int ret = msix_request_general_irq(dd); + + if (ret) + return ret; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + ret = msix_request_sdma_irq(sde); + if (ret) + return ret; + enable_sdma_srcs(sde->dd, i); + } + + for (i = 0; i < dd->n_krcv_queues; i++) { + struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i); + + if (rcd) + ret = msix_request_rcd_irq(rcd); + hfi1_rcd_put(rcd); + if (ret) + return ret; + } + + return 0; +} + +/** + * msix_free_irq() - Free the specified MSIx resources and IRQ + * @dd: valid devdata + * @msix_intr: MSIx vector to free. + * + */ +void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr) +{ + struct hfi1_msix_entry *me; + + if (msix_intr >= dd->msix_info.max_requested) + return; + + me = &dd->msix_info.msix_entries[msix_intr]; + + if (!me->arg) /* => no irq, no affinity */ + return; + + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, msix_intr, me->arg); + + me->arg = NULL; + + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(msix_intr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); +} + +/** + * msix_clean_up_interrupts - Free all MSIx IRQ resources + * @dd: valid device data data structure + * + * Free the MSIx and associated PCI resources, if they have been allocated. + */ +void msix_clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_msix_entry *me = dd->msix_info.msix_entries; + + /* remove irqs - must happen before disabling/turning off */ + for (i = 0; i < dd->msix_info.max_requested; i++, me++) + msix_free_irq(dd, i); + + /* clean structures */ + kfree(dd->msix_info.msix_entries); + dd->msix_info.msix_entries = NULL; + dd->msix_info.max_requested = 0; + + pci_free_irq_vectors(dd->pcidev); +} + +/** + * msix_netdev_synchronize_irq - netdev IRQ synchronize + * @dd: valid devdata + */ +void msix_netdev_synchronize_irq(struct hfi1_devdata *dd) +{ + int i; + int ctxt_count = hfi1_netdev_ctxt_count(dd); + + for (i = 0; i < ctxt_count; i++) { + struct hfi1_ctxtdata *rcd = hfi1_netdev_get_ctxt(dd, i); + struct hfi1_msix_entry *me; + + me = &dd->msix_info.msix_entries[rcd->msix_intr]; + + synchronize_irq(me->irq); + } +} diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h new file mode 100644 index 0000000000..9530ccb0a2 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 - 2020 Intel Corporation. + */ + +#ifndef _HFI1_MSIX_H +#define _HFI1_MSIX_H + +#include "hfi.h" + +/* MSIx interface */ +int msix_initialize(struct hfi1_devdata *dd); +int msix_request_irqs(struct hfi1_devdata *dd); +void msix_clean_up_interrupts(struct hfi1_devdata *dd); +int msix_request_general_irq(struct hfi1_devdata *dd); +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd); +int msix_request_sdma_irq(struct sdma_engine *sde); +void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr); + +/* Netdev interface */ +void msix_netdev_synchronize_irq(struct hfi1_devdata *dd); +int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd); + +#endif diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h new file mode 100644 index 0000000000..8aa074670a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +#ifndef HFI1_NETDEV_H +#define HFI1_NETDEV_H + +#include "hfi.h" + +#include <linux/netdevice.h> +#include <linux/xarray.h> + +/** + * struct hfi1_netdev_rxq - Receive Queue for HFI + * Both IPoIB and VNIC netdevices will be working on the rx abstraction. + * @napi: napi object + * @rx: ptr to netdev_rx + * @rcd: ptr to receive context data + */ +struct hfi1_netdev_rxq { + struct napi_struct napi; + struct hfi1_netdev_rx *rx; + struct hfi1_ctxtdata *rcd; +}; + +/* + * Number of netdev contexts used. Ensure it is less than or equal to + * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE). + */ +#define HFI1_MAX_NETDEV_CTXTS 8 + +/* Number of NETDEV RSM entries */ +#define NUM_NETDEV_MAP_ENTRIES HFI1_MAX_NETDEV_CTXTS + +/** + * struct hfi1_netdev_rx: data required to setup and run HFI netdev. + * @rx_napi: the dummy netdevice to support "polling" the receive contexts + * @dd: hfi1_devdata + * @rxq: pointer to dummy netdev receive queues. + * @num_rx_q: number of receive queues + * @rmt_index: first free index in RMT Array + * @msix_start: first free MSI-X interrupt vector. + * @dev_tbl: netdev table for unique identifier VNIC and IPoIb VLANs. + * @enabled: atomic counter of netdevs enabling receive queues. + * When 0 NAPI will be disabled. + * @netdevs: atomic counter of netdevs using dummy netdev. + * When 0 receive queues will be freed. + */ +struct hfi1_netdev_rx { + struct net_device rx_napi; + struct hfi1_devdata *dd; + struct hfi1_netdev_rxq *rxq; + int num_rx_q; + int rmt_start; + struct xarray dev_tbl; + /* count of enabled napi polls */ + atomic_t enabled; + /* count of netdevs on top */ + atomic_t netdevs; +}; + +static inline +int hfi1_netdev_ctxt_count(struct hfi1_devdata *dd) +{ + return dd->netdev_rx->num_rx_q; +} + +static inline +struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt) +{ + return dd->netdev_rx->rxq[ctxt].rcd; +} + +static inline +int hfi1_netdev_get_free_rmt_idx(struct hfi1_devdata *dd) +{ + return dd->netdev_rx->rmt_start; +} + +static inline +void hfi1_netdev_set_free_rmt_idx(struct hfi1_devdata *dd, int rmt_idx) +{ + dd->netdev_rx->rmt_start = rmt_idx; +} + +u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts, + struct cpumask *cpu_mask); + +void hfi1_netdev_enable_queues(struct hfi1_devdata *dd); +void hfi1_netdev_disable_queues(struct hfi1_devdata *dd); +int hfi1_netdev_rx_init(struct hfi1_devdata *dd); +int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd); +int hfi1_alloc_rx(struct hfi1_devdata *dd); +void hfi1_free_rx(struct hfi1_devdata *dd); +int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data); +void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id); +void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id); +void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id); + +/* chip.c */ +int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget); + +#endif /* HFI1_NETDEV_H */ diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c new file mode 100644 index 0000000000..720d4c85c9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for netdev RX functionality + */ + +#include "sdma.h" +#include "verbs.h" +#include "netdev.h" +#include "hfi.h" + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <rdma/ib_verbs.h> + +static int hfi1_netdev_setup_ctxt(struct hfi1_netdev_rx *rx, + struct hfi1_ctxtdata *uctxt) +{ + unsigned int rcvctrl_ops; + struct hfi1_devdata *dd = rx->dd; + int ret; + + uctxt->rhf_rcv_function_map = netdev_rhf_rcv_functions; + uctxt->do_interrupt = &handle_receive_interrupt_napi_sp; + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + + clear_rcvhdrtail(uctxt); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_DIS; + rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_DIS; + + if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); +done: + return ret; +} + +static int hfi1_netdev_allocate_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata **ctxt) +{ + struct hfi1_ctxtdata *uctxt; + int ret; + + if (dd->flags & HFI1_FROZEN) + return -EIO; + + ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt); + if (ret < 0) { + dd_dev_err(dd, "Unable to create ctxtdata, failing open\n"); + return -ENOMEM; + } + + uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + /* Netdev contexts are always NO_RDMA_RTAIL */ + uctxt->fast_handler = handle_receive_interrupt_napi_fp; + uctxt->slow_handler = handle_receive_interrupt_napi_sp; + hfi1_set_seq_cnt(uctxt, 1); + uctxt->is_vnic = true; + + hfi1_stats.sps_ctxts++; + + dd_dev_info(dd, "created netdev context %d\n", uctxt->ctxt); + *ctxt = uctxt; + + return 0; +} + +static void hfi1_netdev_deallocate_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *uctxt) +{ + flush_wc(); + + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + + if (uctxt->msix_intr != CCE_NUM_MSIX_VECTORS) + msix_free_irq(dd, uctxt->msix_intr); + + uctxt->msix_intr = CCE_NUM_MSIX_VECTORS; + uctxt->event_flags = 0; + + hfi1_clear_tids(uctxt); + hfi1_clear_ctxt_pkey(dd, uctxt); + + hfi1_stats.sps_ctxts--; + + hfi1_free_ctxt(uctxt); +} + +static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_rx *rx, + struct hfi1_ctxtdata **ctxt) +{ + int rc; + struct hfi1_devdata *dd = rx->dd; + + rc = hfi1_netdev_allocate_ctxt(dd, ctxt); + if (rc) { + dd_dev_err(dd, "netdev ctxt alloc failed %d\n", rc); + return rc; + } + + rc = hfi1_netdev_setup_ctxt(rx, *ctxt); + if (rc) { + dd_dev_err(dd, "netdev ctxt setup failed %d\n", rc); + hfi1_netdev_deallocate_ctxt(dd, *ctxt); + *ctxt = NULL; + } + + return rc; +} + +/** + * hfi1_num_netdev_contexts - Count of netdev recv contexts to use. + * @dd: device on which to allocate netdev contexts + * @available_contexts: count of available receive contexts + * @cpu_mask: mask of possible cpus to include for contexts + * + * Return: count of physical cores on a node or the remaining available recv + * contexts for netdev recv context usage up to the maximum of + * HFI1_MAX_NETDEV_CTXTS. + * A value of 0 can be returned when acceleration is explicitly turned off, + * a memory allocation error occurs or when there are no available contexts. + * + */ +u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts, + struct cpumask *cpu_mask) +{ + cpumask_var_t node_cpu_mask; + unsigned int available_cpus; + + if (!HFI1_CAP_IS_KSET(AIP)) + return 0; + + /* Always give user contexts priority over netdev contexts */ + if (available_contexts == 0) { + dd_dev_info(dd, "No receive contexts available for netdevs.\n"); + return 0; + } + + if (!zalloc_cpumask_var(&node_cpu_mask, GFP_KERNEL)) { + dd_dev_err(dd, "Unable to allocate cpu_mask for netdevs.\n"); + return 0; + } + + cpumask_and(node_cpu_mask, cpu_mask, cpumask_of_node(dd->node)); + + available_cpus = cpumask_weight(node_cpu_mask); + + free_cpumask_var(node_cpu_mask); + + return min3(available_cpus, available_contexts, + (u32)HFI1_MAX_NETDEV_CTXTS); +} + +static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) +{ + int i; + int rc; + struct hfi1_devdata *dd = rx->dd; + struct net_device *dev = &rx->rx_napi; + + rx->num_rx_q = dd->num_netdev_contexts; + rx->rxq = kcalloc_node(rx->num_rx_q, sizeof(*rx->rxq), + GFP_KERNEL, dd->node); + + if (!rx->rxq) { + dd_dev_err(dd, "Unable to allocate netdev queue data\n"); + return (-ENOMEM); + } + + for (i = 0; i < rx->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &rx->rxq[i]; + + rc = hfi1_netdev_allot_ctxt(rx, &rxq->rcd); + if (rc) + goto bail_context_irq_failure; + + hfi1_rcd_get(rxq->rcd); + rxq->rx = rx; + rxq->rcd->napi = &rxq->napi; + dd_dev_info(dd, "Setting rcv queue %d napi to context %d\n", + i, rxq->rcd->ctxt); + /* + * Disable BUSY_POLL on this NAPI as this is not supported + * right now. + */ + set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); + netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi); + rc = msix_netdev_request_rcd_irq(rxq->rcd); + if (rc) + goto bail_context_irq_failure; + } + + return 0; + +bail_context_irq_failure: + dd_dev_err(dd, "Unable to allot receive context\n"); + for (; i >= 0; i--) { + struct hfi1_netdev_rxq *rxq = &rx->rxq[i]; + + if (rxq->rcd) { + hfi1_netdev_deallocate_ctxt(dd, rxq->rcd); + hfi1_rcd_put(rxq->rcd); + rxq->rcd = NULL; + } + } + kfree(rx->rxq); + rx->rxq = NULL; + + return rc; +} + +static void hfi1_netdev_rxq_deinit(struct hfi1_netdev_rx *rx) +{ + int i; + struct hfi1_devdata *dd = rx->dd; + + for (i = 0; i < rx->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &rx->rxq[i]; + + netif_napi_del(&rxq->napi); + hfi1_netdev_deallocate_ctxt(dd, rxq->rcd); + hfi1_rcd_put(rxq->rcd); + rxq->rcd = NULL; + } + + kfree(rx->rxq); + rx->rxq = NULL; + rx->num_rx_q = 0; +} + +static void enable_queues(struct hfi1_netdev_rx *rx) +{ + int i; + + for (i = 0; i < rx->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &rx->rxq[i]; + + dd_dev_info(rx->dd, "enabling queue %d on context %d\n", i, + rxq->rcd->ctxt); + napi_enable(&rxq->napi); + hfi1_rcvctrl(rx->dd, + HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB, + rxq->rcd); + } +} + +static void disable_queues(struct hfi1_netdev_rx *rx) +{ + int i; + + msix_netdev_synchronize_irq(rx->dd); + + for (i = 0; i < rx->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &rx->rxq[i]; + + dd_dev_info(rx->dd, "disabling queue %d on context %d\n", i, + rxq->rcd->ctxt); + + /* wait for napi if it was scheduled */ + hfi1_rcvctrl(rx->dd, + HFI1_RCVCTRL_CTXT_DIS | HFI1_RCVCTRL_INTRAVAIL_DIS, + rxq->rcd); + napi_synchronize(&rxq->napi); + napi_disable(&rxq->napi); + } +} + +/** + * hfi1_netdev_rx_init - Incrememnts netdevs counter. When called first time, + * it allocates receive queue data and calls netif_napi_add + * for each queue. + * + * @dd: hfi1 dev data + */ +int hfi1_netdev_rx_init(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_rx *rx = dd->netdev_rx; + int res; + + if (atomic_fetch_inc(&rx->netdevs)) + return 0; + + mutex_lock(&hfi1_mutex); + res = hfi1_netdev_rxq_init(rx); + mutex_unlock(&hfi1_mutex); + return res; +} + +/** + * hfi1_netdev_rx_destroy - Decrements netdevs counter, when it reaches 0 + * napi is deleted and receive queses memory is freed. + * + * @dd: hfi1 dev data + */ +int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_rx *rx = dd->netdev_rx; + + /* destroy the RX queues only if it is the last netdev going away */ + if (atomic_fetch_add_unless(&rx->netdevs, -1, 0) == 1) { + mutex_lock(&hfi1_mutex); + hfi1_netdev_rxq_deinit(rx); + mutex_unlock(&hfi1_mutex); + } + + return 0; +} + +/** + * hfi1_alloc_rx - Allocates the rx support structure + * @dd: hfi1 dev data + * + * Allocate the rx structure to support gathering the receive + * resources and the dummy netdev. + * + * Updates dd struct pointer upon success. + * + * Return: 0 (success) -error on failure + * + */ +int hfi1_alloc_rx(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_rx *rx; + + dd_dev_info(dd, "allocating rx size %ld\n", sizeof(*rx)); + rx = kzalloc_node(sizeof(*rx), GFP_KERNEL, dd->node); + + if (!rx) + return -ENOMEM; + rx->dd = dd; + init_dummy_netdev(&rx->rx_napi); + + xa_init(&rx->dev_tbl); + atomic_set(&rx->enabled, 0); + atomic_set(&rx->netdevs, 0); + dd->netdev_rx = rx; + + return 0; +} + +void hfi1_free_rx(struct hfi1_devdata *dd) +{ + if (dd->netdev_rx) { + dd_dev_info(dd, "hfi1 rx freed\n"); + kfree(dd->netdev_rx); + dd->netdev_rx = NULL; + } +} + +/** + * hfi1_netdev_enable_queues - This is napi enable function. + * It enables napi objects associated with queues. + * When at least one device has called it it increments atomic counter. + * Disable function decrements counter and when it is 0, + * calls napi_disable for every queue. + * + * @dd: hfi1 dev data + */ +void hfi1_netdev_enable_queues(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_rx *rx; + + if (!dd->netdev_rx) + return; + + rx = dd->netdev_rx; + if (atomic_fetch_inc(&rx->enabled)) + return; + + mutex_lock(&hfi1_mutex); + enable_queues(rx); + mutex_unlock(&hfi1_mutex); +} + +void hfi1_netdev_disable_queues(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_rx *rx; + + if (!dd->netdev_rx) + return; + + rx = dd->netdev_rx; + if (atomic_dec_if_positive(&rx->enabled)) + return; + + mutex_lock(&hfi1_mutex); + disable_queues(rx); + mutex_unlock(&hfi1_mutex); +} + +/** + * hfi1_netdev_add_data - Registers data with unique identifier + * to be requested later this is needed for VNIC and IPoIB VLANs + * implementations. + * This call is protected by mutex idr_lock. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + * @data: data to be associated with index + */ +int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data) +{ + struct hfi1_netdev_rx *rx = dd->netdev_rx; + + return xa_insert(&rx->dev_tbl, id, data, GFP_NOWAIT); +} + +/** + * hfi1_netdev_remove_data - Removes data with previously given id. + * Returns the reference to removed entry. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id) +{ + struct hfi1_netdev_rx *rx = dd->netdev_rx; + + return xa_erase(&rx->dev_tbl, id); +} + +/** + * hfi1_netdev_get_data - Gets data with given id + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id) +{ + struct hfi1_netdev_rx *rx = dd->netdev_rx; + + return xa_load(&rx->dev_tbl, id); +} + +/** + * hfi1_netdev_get_first_data - Gets first entry with greater or equal id. + * + * @dd: hfi1 dev data + * @start_id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id) +{ + struct hfi1_netdev_rx *rx = dd->netdev_rx; + unsigned long index = *start_id; + void *ret; + + ret = xa_find(&rx->dev_tbl, &index, UINT_MAX, XA_PRESENT); + *start_id = (int)index; + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h new file mode 100644 index 0000000000..31570b0cfd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opa_compat.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#ifndef _LINUX_H +#define _LINUX_H +/* + * This header file is for OPA-specific definitions which are + * required by the HFI driver, and which aren't yet in the Linux + * IB core. We'll collect these all here, then merge them into + * the kernel when that's convenient. + */ + +/* OPA SMA attribute IDs */ +#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b) +#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f) +#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090) +#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091) + +/* OPA PMA attribute IDs */ +#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040) +#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041) +#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042) +#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043) +#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044) + +/* OPA status codes */ +#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100) + +static inline u8 port_states_to_logical_state(struct opa_port_states *ps) +{ + return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE; +} + +static inline u8 port_states_to_phys_state(struct opa_port_states *ps) +{ + return ((ps->portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf; +} + +/* + * OPA port physical states + * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState + * values are the same in OmniPath Architecture. OPA leverages some of the same + * concepts as InfiniBand, but has a few other states as well. + * + * When writing, only values 0-3 are valid, other values are ignored. + * When reading, 0 is reserved. + * + * Returned by the ibphys_portstate() routine. + */ +enum opa_port_phys_state { + /* Values 0-7 have the same meaning in OPA as in InfiniBand. */ + + IB_PORTPHYSSTATE_NOP = 0, + /* 1 is reserved */ + IB_PORTPHYSSTATE_POLLING = 2, + IB_PORTPHYSSTATE_DISABLED = 3, + IB_PORTPHYSSTATE_TRAINING = 4, + IB_PORTPHYSSTATE_LINKUP = 5, + IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6, + IB_PORTPHYSSTATE_PHY_TEST = 7, + /* 8 is reserved */ + + /* + * Offline: Port is quiet (transmitters disabled) due to lack of + * physical media, unsupported media, or transition between link up + * and next link up attempt + */ + OPA_PORTPHYSSTATE_OFFLINE = 9, + + /* 10 is reserved */ + + /* + * Phy_Test: Specific test patterns are transmitted, and receiver BER + * can be monitored. This facilitates signal integrity testing for the + * physical layer of the port. + */ + OPA_PORTPHYSSTATE_TEST = 11, + + OPA_PORTPHYSSTATE_MAX = 11, + /* values 12-15 are reserved/ignored */ +}; + +#endif /* _LINUX_H */ diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c new file mode 100644 index 0000000000..370a5a8eaa --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "opfn.h" + +#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT) + +#define OPFN_CODE(code) BIT((code) - 1) +#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code) + +struct hfi1_opfn_type { + bool (*request)(struct rvt_qp *qp, u64 *data); + bool (*response)(struct rvt_qp *qp, u64 *data); + bool (*reply)(struct rvt_qp *qp, u64 data); + void (*error)(struct rvt_qp *qp); +}; + +static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = { + [STL_VERBS_EXTD_TID_RDMA] = { + .request = tid_rdma_conn_req, + .response = tid_rdma_conn_resp, + .reply = tid_rdma_conn_reply, + .error = tid_rdma_conn_error, + }, +}; + +static struct workqueue_struct *opfn_wq; + +static void opfn_schedule_conn_request(struct rvt_qp *qp); + +static bool hfi1_opfn_extended(u32 bth1) +{ + return !!(bth1 & IB_BTHE_E); +} + +static void opfn_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_atomic_wr wr; + u16 mask, capcode; + struct hfi1_opfn_type *extd; + u64 data; + unsigned long flags; + int ret = 0; + + trace_hfi1_opfn_state_conn_request(qp); + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Exit if the extended bit is not set, or if nothing is requested, or + * if we have completed all requests, or if a previous request is in + * progress + */ + if (!priv->opfn.extended || !priv->opfn.requested || + priv->opfn.requested == priv->opfn.completed || priv->opfn.curr) + goto done; + + mask = priv->opfn.requested & ~priv->opfn.completed; + capcode = ilog2(mask & ~(mask - 1)) + 1; + if (capcode >= STL_VERBS_EXTD_MAX) { + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + extd = &hfi1_opfn_handlers[capcode]; + if (!extd || !extd->request || !extd->request(qp, &data)) { + /* + * Either there is no handler for this capability or the request + * packet could not be generated. Either way, mark it as done so + * we don't keep attempting to complete it. + */ + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + trace_hfi1_opfn_data_conn_request(qp, capcode, data); + data = (data & ~0xf) | capcode; + + memset(&wr, 0, sizeof(wr)); + wr.wr.opcode = IB_WR_OPFN; + wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR; + wr.compare_add = data; + + priv->opfn.curr = capcode; /* A new request is now in progress */ + /* Drop opfn.lock before calling ib_post_send() */ + spin_unlock_irqrestore(&priv->opfn.lock, flags); + + ret = ib_post_send(&qp->ibqp, &wr.wr, NULL); + if (ret) + goto err; + trace_hfi1_opfn_state_conn_request(qp); + return; +err: + trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ", + (u64)ret); + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * In case of an unexpected error return from ib_post_send + * clear opfn.curr and reschedule to try again + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + opfn_schedule_conn_request(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_send_conn_request(struct work_struct *work) +{ + struct hfi1_opfn_data *od; + struct hfi1_qp_priv *qpriv; + + od = container_of(work, struct hfi1_opfn_data, opfn_work); + qpriv = container_of(od, struct hfi1_qp_priv, opfn); + + opfn_conn_request(qpriv->owner); +} + +/* + * When QP s_lock is held in the caller, the OPFN request must be scheduled + * to a different workqueue to avoid double locking QP s_lock in call to + * ib_post_send in opfn_conn_request + */ +static void opfn_schedule_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + trace_hfi1_opfn_state_sched_conn_request(qp); + queue_work(opfn_wq, &priv->opfn.opfn_work); +} + +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth) +{ + struct hfi1_qp_priv *priv = qp->priv; + u64 data = be64_to_cpu(ateth->compare_data); + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + trace_hfi1_opfn_state_conn_response(qp); + capcode = data & 0xf; + trace_hfi1_opfn_data_conn_response(qp, capcode, data); + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->response) { + e->atomic_data = capcode; + return; + } + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (priv->opfn.completed & OPFN_CODE(capcode)) { + /* + * We are receiving a request for a feature that has already + * been negotiated. This may mean that the other side has reset + */ + priv->opfn.completed &= ~OPFN_CODE(capcode); + if (extd->error) + extd->error(qp); + } + + if (extd->response(qp, &data)) + priv->opfn.completed |= OPFN_CODE(capcode); + e->atomic_data = (data & ~0xf) | capcode; + trace_hfi1_opfn_state_conn_response(qp); + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + trace_hfi1_opfn_state_conn_reply(qp); + capcode = data & 0xf; + trace_hfi1_opfn_data_conn_reply(qp, capcode, data); + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Either there is no previous request or the reply is not for the + * current request + */ + if (!priv->opfn.curr || capcode != priv->opfn.curr) + goto done; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->reply) + goto clear; + + if (extd->reply(qp, data)) + priv->opfn.completed |= OPFN_CODE(capcode); +clear: + /* + * Clear opfn.curr to indicate that the previous request is no longer in + * progress + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + trace_hfi1_opfn_state_conn_reply(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd = NULL; + unsigned long flags; + u16 capcode; + + trace_hfi1_opfn_state_conn_error(qp); + trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state); + /* + * The QP has gone into the Error state. We have to invalidate all + * negotiated feature, including the one in progress (if any). The RC + * QP handling will clean the WQE for the connection request. + */ + spin_lock_irqsave(&priv->opfn.lock, flags); + while (priv->opfn.completed) { + capcode = priv->opfn.completed & ~(priv->opfn.completed - 1); + extd = &hfi1_opfn_handlers[ilog2(capcode) + 1]; + if (extd->error) + extd->error(qp); + priv->opfn.completed &= ~OPFN_CODE(capcode); + } + priv->opfn.extended = 0; + priv->opfn.requested = 0; + priv->opfn.curr = STL_VERBS_EXTD_NONE; + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + + if (attr_mask & IB_QP_RETRY_CNT) + priv->s_retry = attr->retry_cnt; + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct tid_rdma_params *local = &priv->tid_rdma.local; + + if (attr_mask & IB_QP_TIMEOUT) + priv->tid_retry_timeout_jiffies = qp->timeout_jiffies; + if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) || + qp->pmtu == enum_to_mtu(OPA_MTU_8192)) { + tid_rdma_opfn_init(qp, local); + /* + * We only want to set the OPFN requested bit when the + * QP transitions to RTS. + */ + if (attr_mask & IB_QP_STATE && + attr->qp_state == IB_QPS_RTS) { + priv->opfn.requested |= OPFN_MASK(TID_RDMA); + /* + * If the QP is transitioning to RTS and the + * opfn.completed for TID RDMA has already been + * set, the QP is being moved *back* into RTS. + * We can now renegotiate the TID RDMA + * parameters. + */ + if (priv->opfn.completed & + OPFN_MASK(TID_RDMA)) { + priv->opfn.completed &= + ~OPFN_MASK(TID_RDMA); + /* + * Since the opfn.completed bit was + * already set, it is safe to assume + * that the opfn.extended is also set. + */ + opfn_schedule_conn_request(qp); + } + } + } else { + memset(local, 0, sizeof(*local)); + } + } + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->opfn.extended && hfi1_opfn_extended(bth1) && + HFI1_CAP_IS_KSET(OPFN)) { + priv->opfn.extended = 1; + if (qp->state == IB_QPS_RTS) + opfn_conn_request(qp); + } +} + +int opfn_init(void) +{ + opfn_wq = alloc_workqueue("hfi_opfn", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | + WQ_MEM_RECLAIM, + HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES); + if (!opfn_wq) + return -ENOMEM; + + return 0; +} + +void opfn_exit(void) +{ + if (opfn_wq) { + destroy_workqueue(opfn_wq); + opfn_wq = NULL; + } +} diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h new file mode 100644 index 0000000000..62f93c1dc0 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opfn.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#ifndef _HFI1_OPFN_H +#define _HFI1_OPFN_H + +/** + * DOC: Omni Path Feature Negotion (OPFN) + * + * OPFN is a discovery protocol for Intel Omni-Path fabric that + * allows two RC QPs to negotiate a common feature that both QPs + * can support. Currently, the only OPA feature that OPFN + * supports is TID RDMA. + * + * Architecture + * + * OPFN involves the communication between two QPs on the HFI + * level on an Omni-Path fabric, and ULPs have no knowledge of + * OPFN at all. + * + * Implementation + * + * OPFN extends the existing IB RC protocol with the following + * changes: + * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport + * Header (BTH1) to indicate that the RC QP supports OPFN; + * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and + * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN + * request; The 64-bit data carried with the request/response + * contains the parameters for negotiation and will be + * defined in tid_rdma.c file; + * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN. + * + * The OPFN communication will be triggered when an RC QP + * receives a request with Bit 24 of BTH1 set. The responder QP + * will then post send an OPFN request with its local + * parameters, which will be sent to the requester QP once all + * existing requests on the responder QP side have been sent. + * Once the requester QP receives the OPFN request, it will + * keep a copy of the responder QP's parameters, and return a + * response packet with its own local parameters. The responder + * QP receives the response packet and keeps a copy of the requester + * QP's parameters. After this exchange, each side has the parameters + * for both sides and therefore can select the right parameters + * for future transactions + */ + +#include <linux/workqueue.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdmavt_qp.h> + +/* STL Verbs Extended */ +#define IB_BTHE_E_SHIFT 24 +#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX + +enum hfi1_opfn_codes { + STL_VERBS_EXTD_NONE = 0, + STL_VERBS_EXTD_TID_RDMA, + STL_VERBS_EXTD_MAX +}; + +struct hfi1_opfn_data { + u8 extended; + u16 requested; + u16 completed; + enum hfi1_opfn_codes curr; + /* serialize opfn function calls */ + spinlock_t lock; + struct work_struct opfn_work; +}; + +/* WR opcode for OPFN */ +#define IB_WR_OPFN IB_WR_RESERVED3 + +void opfn_send_conn_request(struct work_struct *work); +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth); +void opfn_conn_reply(struct rvt_qp *qp, u64 data); +void opfn_conn_error(struct rvt_qp *qp); +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask); +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1); +int opfn_init(void); +void opfn_exit(void); + +#endif /* _HFI1_OPFN_H */ diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c new file mode 100644 index 0000000000..c132a9c073 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -0,0 +1,1396 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2019 Intel Corporation. + */ + +#include <linux/bitfield.h> +#include <linux/pci.h> +#include <linux/io.h> +#include <linux/delay.h> +#include <linux/vmalloc.h> +#include <linux/module.h> + +#include "hfi.h" +#include "chip_registers.h" +#include "aspm.h" + +/* + * This file contains PCIe utility routines. + */ + +/* + * Do all the common PCIe setup and initialization. + */ +int hfi1_pcie_init(struct hfi1_devdata *dd) +{ + int ret; + struct pci_dev *pdev = dd->pcidev; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + dd_dev_err(dd, "pci enable failed: error %d\n", -ret); + return ret; + } + + ret = pci_request_regions(pdev, DRIVER_NAME); + if (ret) { + dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (ret) { + dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret); + goto bail; + } + } + + pci_set_master(pdev); + return 0; + +bail: + hfi1_pcie_cleanup(pdev); + return ret; +} + +/* + * Clean what was done in hfi1_pcie_init() + */ +void hfi1_pcie_cleanup(struct pci_dev *pdev) +{ + pci_disable_device(pdev); + /* + * Release regions should be called after the disable. OK to + * call if request regions has not been called or failed. + */ + pci_release_regions(pdev); +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) +{ + unsigned long len; + resource_size_t addr; + int ret = 0; + u32 rcv_array_count; + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + + /* + * The TXE PIO buffers are at the tail end of the chip space. + * Cut them off and map them separately. + */ + + /* sanity check vs expectations */ + if (len != TXE_PIO_SEND + TXE_PIO_SIZE) { + dd_dev_err(dd, "chip PIO range does not match\n"); + return -EINVAL; + } + + dd->kregbase1 = ioremap(addr, RCV_ARRAY); + if (!dd->kregbase1) { + dd_dev_err(dd, "UC mapping of kregbase1 failed\n"); + return -ENOMEM; + } + dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY); + + /* verify that reads actually work, save revision for reset check */ + dd->revision = readq(dd->kregbase1 + CCE_REVISION); + if (dd->revision == ~(u64)0) { + dd_dev_err(dd, "Cannot read chip CSRs\n"); + goto nomem; + } + + rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); + dd_dev_info(dd, "RcvArray count: %u\n", rcv_array_count); + dd->base2_start = RCV_ARRAY + rcv_array_count * 8; + + dd->kregbase2 = ioremap( + addr + dd->base2_start, + TXE_PIO_SEND - dd->base2_start); + if (!dd->kregbase2) { + dd_dev_err(dd, "UC mapping of kregbase2 failed\n"); + goto nomem; + } + dd_dev_info(dd, "UC base2: %p for %x\n", dd->kregbase2, + TXE_PIO_SEND - dd->base2_start); + + dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE); + if (!dd->piobase) { + dd_dev_err(dd, "WC mapping of send buffers failed\n"); + goto nomem; + } + dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE); + + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Map the chip's RcvArray as write-combining to allow us + * to write an entire cacheline worth of entries in one shot. + */ + dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY, + rcv_array_count * 8); + if (!dd->rcvarray_wc) { + dd_dev_err(dd, "WC mapping of receive array failed\n"); + goto nomem; + } + dd_dev_info(dd, "WC RcvArray: %p for %x\n", + dd->rcvarray_wc, rcv_array_count * 8); + + dd->flags |= HFI1_PRESENT; /* chip.c CSR routines now work */ + return 0; +nomem: + ret = -ENOMEM; + hfi1_pcie_ddcleanup(dd); + return ret; +} + +/* + * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * Void because all of the core pcie cleanup functions are void. + */ +void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd) +{ + dd->flags &= ~HFI1_PRESENT; + if (dd->kregbase1) + iounmap(dd->kregbase1); + dd->kregbase1 = NULL; + if (dd->kregbase2) + iounmap(dd->kregbase2); + dd->kregbase2 = NULL; + if (dd->rcvarray_wc) + iounmap(dd->rcvarray_wc); + dd->rcvarray_wc = NULL; + if (dd->piobase) + iounmap(dd->piobase); + dd->piobase = NULL; +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_speed(u16 linkstat) +{ + u32 speed; + + switch (linkstat & PCI_EXP_LNKSTA_CLS) { + default: /* not defined, assume Gen1 */ + case PCI_EXP_LNKSTA_CLS_2_5GB: + speed = 2500; /* Gen 1, 2.5GHz */ + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + speed = 5000; /* Gen 2, 5GHz */ + break; + case PCI_EXP_LNKSTA_CLS_8_0GB: + speed = 8000; /* Gen 3, 8GHz */ + break; + } + return speed; +} + +/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */ +static void update_lbus_info(struct hfi1_devdata *dd) +{ + u16 linkstat; + int ret; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return; + } + + dd->lbus_width = FIELD_GET(PCI_EXP_LNKSTA_NLW, linkstat); + dd->lbus_speed = extract_speed(linkstat); + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width); +} + +/* + * Read in the current PCIe link width and speed. Find if the link is + * Gen3 capable. + */ +int pcie_speeds(struct hfi1_devdata *dd) +{ + u32 linkcap; + struct pci_dev *parent = dd->pcidev->bus->self; + int ret; + + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_err(dd, "Can't find PCI Express capability!\n"); + return -EINVAL; + } + + /* find if our max speed is Gen3 and parent supports Gen3 speeds */ + dd->link_gen3_capable = 1; + + ret = pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return pcibios_err_to_errno(ret); + } + + if ((linkcap & PCI_EXP_LNKCAP_SLS) != PCI_EXP_LNKCAP_SLS_8_0GB) { + dd_dev_info(dd, + "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", + linkcap & PCI_EXP_LNKCAP_SLS); + dd->link_gen3_capable = 0; + } + + /* + * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed + */ + if (parent && + (dd->pcidev->bus->max_bus_speed == PCIE_SPEED_2_5GT || + dd->pcidev->bus->max_bus_speed == PCIE_SPEED_5_0GT)) { + dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n"); + dd->link_gen3_capable = 0; + } + + /* obtain the link width and current speed */ + update_lbus_info(dd); + + dd_dev_info(dd, "%s\n", dd->lbus_info); + + return 0; +} + +/* + * Restore command and BARs after a reset has wiped them out + * + * Returns 0 on success, otherwise a negative error value + */ +int restore_pci_variables(struct hfi1_devdata *dd) +{ + int ret; + + ret = pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->pcibar0); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->pcibar1); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, + dd->pcie_devctl); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, + dd->pcie_lnkctl); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, + dd->pcie_devctl2); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); + if (ret) + goto error; + + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) { + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, + dd->pci_tph2); + if (ret) + goto error; + } + return 0; + +error: + dd_dev_err(dd, "Unable to write to PCI config\n"); + return pcibios_err_to_errno(ret); +} + +/* + * Save BARs and command to rewrite after device reset + * + * Returns 0 on success, otherwise a negative error value + */ +int save_pci_variables(struct hfi1_devdata *dd) +{ + int ret; + + ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + &dd->pcibar0); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + &dd->pcibar1); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom); + if (ret) + goto error; + + ret = pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, + &dd->pcie_devctl); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, + &dd->pcie_lnkctl); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, + &dd->pcie_devctl2); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); + if (ret) + goto error; + + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) { + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, + &dd->pci_tph2); + if (ret) + goto error; + } + return 0; + +error: + dd_dev_err(dd, "Unable to read from PCI config\n"); + return pcibios_err_to_errno(ret); +} + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. + */ +static int hfi1_pcie_caps; +module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +/** + * tune_pcie_caps() - Code to adjust PCIe capabilities. + * @dd: Valid device data structure + * + */ +void tune_pcie_caps(struct hfi1_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs, ectl; + int ret; + + /* + * Turn on extended tags in DevCtl in case the BIOS has turned it off + * to improve WFR SDMA bandwidth + */ + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if ((!ret) && !(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + dd_dev_info(dd, "Enabling PCIe extended tags\n"); + ectl |= PCI_EXP_DEVCTL_EXT_TAG; + ret = pcie_capability_write_word(dd->pcidev, + PCI_EXP_DEVCTL, ectl); + if (ret) + dd_dev_info(dd, "Unable to write to PCI config\n"); + } + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + /* + * The driver cannot perform the tuning if it does not have + * access to the upstream component. + */ + if (!parent) { + dd_dev_info(dd, "Parent not found\n"); + return; + } + if (!pci_is_root_bus(parent->bus)) { + dd_dev_info(dd, "Parent not root\n"); + return; + } + if (!pci_is_pcie(parent)) { + dd_dev_info(dd, "Parent is not PCI Express capable\n"); + return; + } + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_info(dd, "PCI device is not PCI Express capable\n"); + return; + } + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (hfi1_pcie_caps & 7)) + rc_mpss = hfi1_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. + * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7)) + max_mrrs = (hfi1_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} + +/* End of PCIe capability tuning */ + +/* + * From here through hfi1_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + dd_dev_info(dd, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + dd_dev_info(dd, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + if (dd) { + dd_dev_info(dd, "State Permanent Failure, disabling\n"); + /* no more register accesses! */ + dd->flags &= ~HFI1_PRESENT; + hfi1_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + dd_dev_info(dd, + "HFI1 mmio_enabled function called, read wordscntr %llx, returning %d\n", + words, ret); + } + return ret; +} + +static pci_ers_result_t +pci_slot_reset(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +pci_resume(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 resume function called\n"); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + hfi1_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers hfi1_pci_err_handler = { + .error_detected = pci_error_detected, + .mmio_enabled = pci_mmio_enabled, + .slot_reset = pci_slot_reset, + .resume = pci_resume, +}; + +/*============================================================================*/ +/* PCIe Gen3 support */ + +/* + * This code is separated out because it is expected to be removed in the + * final shipping product. If not, then it will be revisited and items + * will be moved to more standard locations. + */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */ +#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */ +#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */ +#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */ +#define DL_ERR_NONE 0x0 /* no error */ +#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */ + /* or response data */ +#define DL_ERR_DISABLED 0x2 /* hfi disabled */ +#define DL_ERR_SECURITY 0x3 /* security check failed */ +#define DL_ERR_SBUS 0x4 /* SBus status error */ +#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer*/ + +/* gasket block secondary bus reset delay */ +#define SBR_DELAY_US 200000 /* 200ms */ + +static uint pcie_target = 3; +module_param(pcie_target, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)"); + +static uint pcie_force; +module_param(pcie_force, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed"); + +static uint pcie_retry = 5; +module_param(pcie_retry, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed"); + +#define UNSET_PSET 255 +#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */ +#define DEFAULT_MCP_PSET 6 /* MCP HFI */ +static uint pcie_pset = UNSET_PSET; +module_param(pcie_pset, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10"); + +static uint pcie_ctle = 3; /* discrete on, integrated on */ +module_param(pcie_ctle, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off"); + +/* equalization columns */ +#define PREC 0 +#define ATTN 1 +#define POST 2 + +/* discrete silicon preliminary equalization values */ +static const u8 discrete_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x00, 0x12 }, /* p0 */ + { 0x00, 0x00, 0x0c }, /* p1 */ + { 0x00, 0x00, 0x0f }, /* p2 */ + { 0x00, 0x00, 0x09 }, /* p3 */ + { 0x00, 0x00, 0x00 }, /* p4 */ + { 0x06, 0x00, 0x00 }, /* p5 */ + { 0x09, 0x00, 0x00 }, /* p6 */ + { 0x06, 0x00, 0x0f }, /* p7 */ + { 0x09, 0x00, 0x09 }, /* p8 */ + { 0x0c, 0x00, 0x00 }, /* p9 */ + { 0x00, 0x00, 0x18 }, /* p10 */ +}; + +/* integrated silicon preliminary equalization values */ +static const u8 integrated_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x1e, 0x07 }, /* p0 */ + { 0x00, 0x1e, 0x05 }, /* p1 */ + { 0x00, 0x1e, 0x06 }, /* p2 */ + { 0x00, 0x1e, 0x04 }, /* p3 */ + { 0x00, 0x1e, 0x00 }, /* p4 */ + { 0x03, 0x1e, 0x00 }, /* p5 */ + { 0x04, 0x1e, 0x00 }, /* p6 */ + { 0x03, 0x1e, 0x06 }, /* p7 */ + { 0x03, 0x1e, 0x04 }, /* p8 */ + { 0x05, 0x1e, 0x00 }, /* p9 */ + { 0x00, 0x1e, 0x0a }, /* p10 */ +}; + +static const u8 discrete_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x48, 0x0b, 0x04, 0x04 }, /* p0 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p1 */ + { 0x50, 0x09, 0x06, 0x06 }, /* p2 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p3 */ + { 0x80, 0x05, 0x0f, 0x0a }, /* p4 */ + { 0x70, 0x05, 0x0f, 0x0a }, /* p5 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x48, 0x09, 0x06, 0x06 }, /* p8 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p9 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p10 */ +}; + +static const u8 integrated_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p0 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p1 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p2 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p3 */ + { 0x58, 0x0a, 0x05, 0x05 }, /* p4 */ + { 0x48, 0x0a, 0x05, 0x05 }, /* p5 */ + { 0x40, 0x0a, 0x05, 0x05 }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p8 */ + { 0x38, 0x09, 0x06, 0x06 }, /* p9 */ + { 0x38, 0x0e, 0x01, 0x01 }, /* p10 */ +}; + +/* helper to format the value to write to hardware */ +#define eq_value(pre, curr, post) \ + ((((u32)(pre)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \ + | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \ + | (((u32)(post)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT)) + +/* + * Load the given EQ preset table into the PCIe hardware. + */ +static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, + u8 div) +{ + struct pci_dev *pdev = dd->pcidev; + u32 hit_error = 0; + u32 violation; + u32 i; + u8 c_minus1, c0, c_plus1; + int ret; + + for (i = 0; i < 11; i++) { + /* set index */ + pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i); + /* write the value */ + c_minus1 = eq[i][PREC] / div; + c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div); + c_plus1 = eq[i][POST] / div; + pci_write_config_dword(pdev, PCIE_CFG_REG_PL102, + eq_value(c_minus1, c0, c_plus1)); + /* check if these coefficients violate EQ rules */ + ret = pci_read_config_dword(dd->pcidev, + PCIE_CFG_REG_PL105, &violation); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + hit_error = 1; + break; + } + + if (violation + & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){ + if (hit_error == 0) { + dd_dev_err(dd, + "Gen3 EQ Table Coefficient rule violations\n"); + dd_dev_err(dd, " prec attn post\n"); + } + dd_dev_err(dd, " p%02d: %02x %02x %02x\n", + i, (u32)eq[i][0], (u32)eq[i][1], + (u32)eq[i][2]); + dd_dev_err(dd, " %02x %02x %02x\n", + (u32)c_minus1, (u32)c0, (u32)c_plus1); + hit_error = 1; + } + } + if (hit_error) + return -EINVAL; + return 0; +} + +/* + * Steps to be done after the PCIe firmware is downloaded and + * before the SBR for the Pcie Gen3. + * The SBus resource is already being held. + */ +static void pcie_post_steps(struct hfi1_devdata *dd) +{ + int i; + + set_sbus_fast_mode(dd); + /* + * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1. + * This avoids a spurious framing error that can otherwise be + * generated by the MAC layer. + * + * Use individual addresses since no broadcast is set up. + */ + for (i = 0; i < NUM_PCIE_SERDES; i++) { + sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i], + 0x03, WRITE_SBUS_RECEIVER, 0x00022132); + } + + clear_sbus_fast_mode(dd); +} + +/* + * Trigger a secondary bus reset (SBR) on ourselves using our parent. + * + * Based on pci_parent_bus_reset() which is not exported by the + * kernel core. + */ +static int trigger_sbr(struct hfi1_devdata *dd) +{ + struct pci_dev *dev = dd->pcidev; + struct pci_dev *pdev; + + /* need a parent */ + if (!dev->bus->self) { + dd_dev_err(dd, "%s: no parent device\n", __func__); + return -ENOTTY; + } + + /* should not be anyone else on the bus */ + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev != dev) { + dd_dev_err(dd, + "%s: another device is on the same bus\n", + __func__); + return -ENOTTY; + } + + /* + * This is an end around to do an SBR during probe time. A new API needs + * to be implemented to have cleaner interface but this fixes the + * current brokenness + */ + return pci_bridge_secondary_bus_reset(dev->bus->self); +} + +/* + * Write the given gasket interrupt register. + */ +static void write_gasket_interrupt(struct hfi1_devdata *dd, int index, + u16 code, u16 data) +{ + write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), + (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) | + ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT))); +} + +/* + * Tell the gasket logic how to react to the reset. + */ +static void arm_gasket_logic(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = (((u64)1 << dd->hfi1_id) << + ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) | + ((u64)pcie_serdes_broadcast[dd->hfi1_id] << + ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT | + ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK | + ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) << + ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT); + write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg); + /* read back to push the write */ + read_csr(dd, ASIC_PCIE_SD_HOST_CMD); +} + +/* + * CCE_PCIE_CTRL long name helpers + * We redefine these shorter macros to use in the code while leaving + * chip_registers.h to be autogenerated from the hardware spec. + */ +#define LANE_BUNDLE_MASK CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK +#define LANE_BUNDLE_SHIFT CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT +#define LANE_DELAY_MASK CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK +#define LANE_DELAY_SHIFT CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT +#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT +#define MARGIN_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_SHIFT +#define MARGIN_G1_G2_OVERWRITE_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK +#define MARGIN_G1_G2_OVERWRITE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT +#define MARGIN_GEN1_GEN2_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK +#define MARGIN_GEN1_GEN2_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT + + /* + * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C). + */ +static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname) +{ + u64 pcie_ctrl; + u64 xmt_margin; + u64 xmt_margin_oe; + u64 lane_delay; + u64 lane_bundle; + + pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL); + + /* + * For Discrete, use full-swing. + * - PCIe TX defaults to full-swing. + * Leave this register as default. + * For Integrated, use half-swing + * - Copy xmt_margin and xmt_margin_oe + * from Gen1/Gen2 to Gen3. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */ + /* extract initial fields */ + xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT) + & MARGIN_GEN1_GEN2_MASK; + xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT) + & MARGIN_G1_G2_OVERWRITE_MASK; + lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK; + lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT) + & LANE_BUNDLE_MASK; + + /* + * For A0, EFUSE values are not set. Override with the + * correct values. + */ + if (is_ax(dd)) { + /* + * xmt_margin and OverwiteEnabel should be the + * same for Gen1/Gen2 and Gen3 + */ + xmt_margin = 0x5; + xmt_margin_oe = 0x1; + lane_delay = 0xF; /* Delay 240ns. */ + lane_bundle = 0x0; /* Set to 1 lane. */ + } + + /* overwrite existing values */ + pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT) + | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT) + | (xmt_margin << MARGIN_SHIFT) + | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT) + | (lane_delay << LANE_DELAY_SHIFT) + | (lane_bundle << LANE_BUNDLE_SHIFT); + + write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl); + } + + dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n", + fname, pcie_ctrl); +} + +/* + * Do all the steps needed to transition the PCIe link to Gen3 speed. + */ +int do_pcie_gen3_transition(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u64 fw_ctrl; + u64 reg, therm; + u32 reg32, fs, lf; + u32 status, err; + int ret; + int do_retry, retry_count = 0; + int intnum = 0; + uint default_pset; + uint pset = pcie_pset; + u16 target_vector, target_speed; + u16 lnkctl2, vendor; + u8 div; + const u8 (*eq)[3]; + const u8 (*ctle_tunings)[4]; + uint static_ctle_mode; + int return_error = 0; + u32 target_width; + + /* PCIe Gen3 is for the ASIC only */ + if (dd->icode != ICODE_RTL_SILICON) + return 0; + + if (pcie_target == 1) { /* target Gen1 */ + target_vector = PCI_EXP_LNKCTL2_TLS_2_5GT; + target_speed = 2500; + } else if (pcie_target == 2) { /* target Gen2 */ + target_vector = PCI_EXP_LNKCTL2_TLS_5_0GT; + target_speed = 5000; + } else if (pcie_target == 3) { /* target Gen3 */ + target_vector = PCI_EXP_LNKCTL2_TLS_8_0GT; + target_speed = 8000; + } else { + /* off or invalid target - skip */ + dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__); + return 0; + } + + /* if already at target speed, done (unless forced) */ + if (dd->lbus_speed == target_speed) { + dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__, + pcie_target, + pcie_force ? "re-doing anyway" : "skipping"); + if (!pcie_force) + return 0; + } + + /* + * The driver cannot do the transition if it has no access to the + * upstream component + */ + if (!parent) { + dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n", + __func__); + return 0; + } + + /* Previous Gen1/Gen2 bus width */ + target_width = dd->lbus_width; + + /* + * Do the Gen3 transition. Steps are those of the PCIe Gen3 + * recipe. + */ + + /* step 1: pcie link working in gen1/gen2 */ + + /* step 2: if either side is not capable of Gen3, done */ + if (pcie_target == 3 && !dd->link_gen3_capable) { + dd_dev_err(dd, "The PCIe link is not Gen3 capable\n"); + ret = -ENOSYS; + goto done_no_mutex; + } + + /* hold the SBus resource across the firmware download and SBR */ + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "%s: unable to acquire SBus resource\n", + __func__); + return ret; + } + + /* make sure thermal polling is not causing interrupts */ + therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN); + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + dd_dev_info(dd, "%s: Disabled therm polling\n", + __func__); + } + +retry: + /* the SBus download will reset the spico for thermal */ + + /* step 3: download SBus Master firmware */ + /* step 4: download PCIe Gen3 SerDes firmware */ + dd_dev_info(dd, "%s: downloading firmware\n", __func__); + ret = load_pcie_firmware(dd); + if (ret) { + /* do not proceed if the firmware cannot be downloaded */ + return_error = 1; + goto done; + } + + /* step 5: set up device parameter settings */ + dd_dev_info(dd, "%s: setting PCIe registers\n", __func__); + + /* + * PcieCfgSpcie1 - Link Control 3 + * Leave at reset value. No need to set PerfEq - link equalization + * will be performed automatically after the SBR when the target + * speed is 8GT/s. + */ + + /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */ + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff); + + /* step 5a: Set Synopsys Port Logic registers */ + + /* + * PcieCfgRegPl2 - Port Force Link + * + * Set the low power field to 0x10 to avoid unnecessary power + * management messages. All other fields are zero. + */ + reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32); + + /* + * PcieCfgRegPl100 - Gen3 Control + * + * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl + * turn on PcieCfgRegPl100.EqEieosCnt + * Everything else zero. + */ + reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32); + + /* + * PcieCfgRegPl101 - Gen3 EQ FS and LF + * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping + * PcieCfgRegPl103 - Gen3 EQ Preset Index + * PcieCfgRegPl105 - Gen3 EQ Status + * + * Give initial EQ settings. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */ + /* 1000mV, FS=24, LF = 8 */ + fs = 24; + lf = 8; + div = 3; + eq = discrete_preliminary_eq; + default_pset = DEFAULT_DISCRETE_PSET; + ctle_tunings = discrete_ctle_tunings; + /* bit 0 - discrete on/off */ + static_ctle_mode = pcie_ctle & 0x1; + } else { + /* 400mV, FS=29, LF = 9 */ + fs = 29; + lf = 9; + div = 1; + eq = integrated_preliminary_eq; + default_pset = DEFAULT_MCP_PSET; + ctle_tunings = integrated_ctle_tunings; + /* bit 1 - integrated on/off */ + static_ctle_mode = (pcie_ctle >> 1) & 0x1; + } + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, + (fs << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) | + (lf << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT)); + ret = load_eq_table(dd, eq, fs, div); + if (ret) + goto done; + + /* + * PcieCfgRegPl106 - Gen3 EQ Control + * + * Set Gen3EqPsetReqVec, leave other fields 0. + */ + if (pset == UNSET_PSET) + pset = default_pset; + if (pset > 10) { /* valid range is 0-10, inclusive */ + dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n", + __func__, pset, default_pset); + pset = default_pset; + } + dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pset); + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106, + ((1 << pset) << + PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) | + PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK | + PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK); + + /* + * step 5b: Do post firmware download steps via SBus + */ + dd_dev_info(dd, "%s: doing pcie post steps\n", __func__); + pcie_post_steps(dd); + + /* + * step 5c: Program gasket interrupts + */ + /* set the Rx Bit Rate to REFCLK ratio */ + write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050); + /* disable pCal for PCIe Gen3 RX equalization */ + /* select adaptive or static CTLE */ + write_gasket_interrupt(dd, intnum++, 0x0026, + 0x5b01 | (static_ctle_mode << 3)); + /* + * Enable iCal for PCIe Gen3 RX equalization, and set which + * evaluation of RX_EQ_EVAL will launch the iCal procedure. + */ + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202); + + if (static_ctle_mode) { + /* apply static CTLE tunings */ + u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw; + + pcie_dc = ctle_tunings[pset][0]; + pcie_lf = ctle_tunings[pset][1]; + pcie_hf = ctle_tunings[pset][2]; + pcie_bw = ctle_tunings[pset][3]; + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw); + } + + /* terminate list */ + write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000); + + /* + * step 5d: program XMT margin + */ + write_xmt_margin(dd, __func__); + + /* + * step 5e: disable active state power management (ASPM). It + * will be enabled if required later + */ + dd_dev_info(dd, "%s: clearing ASPM\n", __func__); + aspm_hw_disable_l1(dd); + + /* + * step 5f: clear DirectSpeedChange + * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the + * change in the speed target from starting before we are ready. + * This field defaults to 0 and we are not changing it, so nothing + * needs to be done. + */ + + /* step 5g: Set target link speed */ + /* + * Set target link speed to be target on both device and parent. + * On setting the parent: Some system BIOSs "helpfully" set the + * parent target speed to Gen2 to match the ASIC's initial speed. + * We can set the target Gen3 because we have already checked + * that it is Gen3 capable earlier. + */ + dd_dev_info(dd, "%s: setting parent target link speed\n", __func__); + ret = pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + /* only write to parent if target is not as high as ours */ + if ((lnkctl2 & PCI_EXP_LNKCTL2_TLS) < target_vector) { + lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + ret = pcie_capability_write_word(parent, + PCI_EXP_LNKCTL2, lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return_error = 1; + goto done; + } + } else { + dd_dev_info(dd, "%s: ..target speed is OK\n", __func__); + } + + dd_dev_info(dd, "%s: setting target link speed\n", __func__); + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return_error = 1; + goto done; + } + + /* step 5h: arm gasket logic */ + /* hold DC in reset across the SBR */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */ + /* save firmware control across the SBR */ + fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL); + + dd_dev_info(dd, "%s: arming gasket logic\n", __func__); + arm_gasket_logic(dd); + + /* + * step 6: quiesce PCIe link + * The chip has already been reset, so there will be no traffic + * from the chip. Linux has no easy way to enforce that it will + * not try to access the device, so we just need to hope it doesn't + * do it while we are doing the reset. + */ + + /* + * step 7: initiate the secondary bus reset (SBR) + * step 8: hardware brings the links back up + * step 9: wait for link speed transition to be complete + */ + dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__); + ret = trigger_sbr(dd); + if (ret) + goto done; + + /* step 10: decide what to do next */ + + /* check if we can read PCI space */ + ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor); + if (ret) { + dd_dev_info(dd, + "%s: read of VendorID failed after SBR, err %d\n", + __func__, ret); + return_error = 1; + goto done; + } + if (vendor == 0xffff) { + dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__); + return_error = 1; + ret = -EIO; + goto done; + } + + /* restore PCI space registers we know were reset */ + dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return_error = 1; + goto done; + } + + /* restore firmware control */ + write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl); + + /* + * Check the gasket block status. + * + * This is the first CSR read after the SBR. If the read returns + * all 1s (fails), the link did not make it back. + * + * Once we're sure we can read and write, clear the DC reset after + * the SBR. Then check for any per-lane errors. Then look over + * the status. + */ + reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS); + dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg); + if (reg == ~0ull) { /* PCIe read failed/timeout */ + dd_dev_err(dd, "SBR failed - unable to read from device\n"); + return_error = 1; + ret = -ENOSYS; + goto done; + } + + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + + /* Set the LED off */ + setextled(dd, 0); + + /* check for any per-lane errors */ + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, ®32); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + + dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32); + + /* extract status, look for our HFI */ + status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK; + if ((status & (1 << dd->hfi1_id)) == 0) { + dd_dev_err(dd, + "%s: gasket status 0x%x, expecting 0x%x\n", + __func__, status, 1 << dd->hfi1_id); + ret = -EIO; + goto done; + } + + /* extract error */ + err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK; + if (err) { + dd_dev_err(dd, "%s: gasket error %d\n", __func__, err); + ret = -EIO; + goto done; + } + + /* update our link information cache */ + update_lbus_info(dd); + dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, + dd->lbus_info); + + if (dd->lbus_speed != target_speed || + dd->lbus_width < target_width) { /* not target */ + /* maybe retry */ + do_retry = retry_count < pcie_retry; + dd_dev_err(dd, "PCIe link speed or width did not match target%s\n", + do_retry ? ", retrying" : ""); + retry_count++; + if (do_retry) { + msleep(100); /* allow time to settle */ + goto retry; + } + ret = -EIO; + } + +done: + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + msleep(100); + dd_dev_info(dd, "%s: Re-enable therm polling\n", + __func__); + } + release_chip_resource(dd, CR_SBUS); +done_no_mutex: + /* return no error if it is OK to be at current speed */ + if (ret && !return_error) { + dd_dev_err(dd, "Proceeding at current speed PCIe speed\n"); + ret = 0; + } + + dd_dev_info(dd, "%s: done\n", __func__); + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/pin_system.c b/drivers/infiniband/hw/hfi1/pin_system.c new file mode 100644 index 0000000000..384f722093 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pin_system.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2023 - Cornelis Networks, Inc. + */ + +#include <linux/types.h> + +#include "hfi.h" +#include "common.h" +#include "device.h" +#include "pinning.h" +#include "mmu_rb.h" +#include "user_sdma.h" +#include "trace.h" + +struct sdma_mmu_node { + struct mmu_rb_node rb; + struct hfi1_user_sdma_pkt_q *pq; + struct page **pages; + unsigned int npages; +}; + +static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, + unsigned long len); +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, + bool *stop); +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); + +static struct mmu_rb_ops sdma_rb_ops = { + .filter = sdma_rb_filter, + .evict = sdma_rb_evict, + .remove = sdma_rb_remove, +}; + +int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq) +{ + struct hfi1_devdata *dd = pq->dd; + int ret; + + ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq, + &pq->handler); + if (ret) + dd_dev_err(dd, + "[%u:%u] Failed to register system memory DMA support with MMU: %d\n", + pq->ctxt, pq->subctxt, ret); + return ret; +} + +void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq) +{ + if (pq->handler) + hfi1_mmu_rb_unregister(pq->handler); +} + +static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) +{ + struct evict_data evict_data; + + evict_data.cleared = 0; + evict_data.target = npages; + hfi1_mmu_rb_evict(pq->handler, &evict_data); + return evict_data.cleared; +} + +static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, + unsigned int start, unsigned int npages) +{ + hfi1_release_user_pages(mm, pages + start, npages, false); + kfree(pages); +} + +static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node) +{ + return node->rb.handler->mn.mm; +} + +static void free_system_node(struct sdma_mmu_node *node) +{ + if (node->npages) { + unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0, + node->npages); + atomic_sub(node->npages, &node->pq->n_locked); + } + kfree(node); +} + +/* + * kref_get()'s an additional kref on the returned rb_node to prevent rb_node + * from being released until after rb_node is assigned to an SDMA descriptor + * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the + * virtual address range for rb_node is invalidated between now and then. + */ +static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, + unsigned long start, + unsigned long end) +{ + struct mmu_rb_node *rb_node; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start)); + if (!rb_node) { + spin_unlock_irqrestore(&handler->lock, flags); + return NULL; + } + + /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ + kref_get(&rb_node->refcount); + spin_unlock_irqrestore(&handler->lock, flags); + + return container_of(rb_node, struct sdma_mmu_node, rb); +} + +static int pin_system_pages(struct user_sdma_request *req, + uintptr_t start_address, size_t length, + struct sdma_mmu_node *node, int npages) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + int pinned, cleared; + struct page **pages; + + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + +retry: + if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked), + npages)) { + SDMA_DBG(req, "Evicting: nlocked %u npages %u", + atomic_read(&pq->n_locked), npages); + cleared = sdma_cache_evict(pq, npages); + if (cleared >= npages) + goto retry; + } + + SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u", + start_address, node->npages, npages); + pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0, + pages); + + if (pinned < 0) { + kfree(pages); + SDMA_DBG(req, "pinned %d", pinned); + return pinned; + } + if (pinned != npages) { + unpin_vector_pages(current->mm, pages, node->npages, pinned); + SDMA_DBG(req, "npages %u pinned %d", npages, pinned); + return -EFAULT; + } + node->rb.addr = start_address; + node->rb.len = length; + node->pages = pages; + node->npages = npages; + atomic_add(pinned, &pq->n_locked); + SDMA_DBG(req, "done. pinned %d", pinned); + return 0; +} + +/* + * kref refcount on *node_p will be 2 on successful addition: one kref from + * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being + * released until after *node_p is assigned to an SDMA descriptor (struct + * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual + * address range for *node_p is invalidated between now and then. + */ +static int add_system_pinning(struct user_sdma_request *req, + struct sdma_mmu_node **node_p, + unsigned long start, unsigned long len) + +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct sdma_mmu_node *node; + int ret; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + /* First kref "moves" to mmu_rb_handler */ + kref_init(&node->rb.refcount); + + /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ + kref_get(&node->rb.refcount); + + node->pq = pq; + ret = pin_system_pages(req, start, len, node, PFN_DOWN(len)); + if (ret == 0) { + ret = hfi1_mmu_rb_insert(pq->handler, &node->rb); + if (ret) + free_system_node(node); + else + *node_p = node; + + return ret; + } + + kfree(node); + return ret; +} + +static int get_system_cache_entry(struct user_sdma_request *req, + struct sdma_mmu_node **node_p, + size_t req_start, size_t req_len) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + u64 start = ALIGN_DOWN(req_start, PAGE_SIZE); + u64 end = PFN_ALIGN(req_start + req_len); + int ret; + + if ((end - start) == 0) { + SDMA_DBG(req, + "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx", + req_start, req_len, start, end); + return -EINVAL; + } + + SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len); + + while (1) { + struct sdma_mmu_node *node = + find_system_node(pq->handler, start, end); + u64 prepend_len = 0; + + SDMA_DBG(req, "node %p start %llx end %llu", node, start, end); + if (!node) { + ret = add_system_pinning(req, node_p, start, + end - start); + if (ret == -EEXIST) { + /* + * Another execution context has inserted a + * conficting entry first. + */ + continue; + } + return ret; + } + + if (node->rb.addr <= start) { + /* + * This entry covers at least part of the region. If it doesn't extend + * to the end, then this will be called again for the next segment. + */ + *node_p = node; + return 0; + } + + SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d", + node->rb.addr, kref_read(&node->rb.refcount)); + prepend_len = node->rb.addr - start; + + /* + * This node will not be returned, instead a new node + * will be. So release the reference. + */ + kref_put(&node->rb.refcount, hfi1_mmu_rb_release); + + /* Prepend a node to cover the beginning of the allocation */ + ret = add_system_pinning(req, node_p, start, prepend_len); + if (ret == -EEXIST) { + /* Another execution context has inserted a conficting entry first. */ + continue; + } + return ret; + } +} + +static void sdma_mmu_rb_node_get(void *ctx) +{ + struct mmu_rb_node *node = ctx; + + kref_get(&node->refcount); +} + +static void sdma_mmu_rb_node_put(void *ctx) +{ + struct sdma_mmu_node *node = ctx; + + kref_put(&node->rb.refcount, hfi1_mmu_rb_release); +} + +static int add_mapping_to_sdma_packet(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + struct sdma_mmu_node *cache_entry, + size_t start, + size_t from_this_cache_entry) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + unsigned int page_offset; + unsigned int from_this_page; + size_t page_index; + void *ctx; + int ret; + + /* + * Because the cache may be more fragmented than the memory that is being accessed, + * it's not strictly necessary to have a descriptor per cache entry. + */ + + while (from_this_cache_entry) { + page_index = PFN_DOWN(start - cache_entry->rb.addr); + + if (page_index >= cache_entry->npages) { + SDMA_DBG(req, + "Request for page_index %zu >= cache_entry->npages %u", + page_index, cache_entry->npages); + return -EINVAL; + } + + page_offset = start - ALIGN_DOWN(start, PAGE_SIZE); + from_this_page = PAGE_SIZE - page_offset; + + if (from_this_page < from_this_cache_entry) { + ctx = NULL; + } else { + /* + * In the case they are equal the next line has no practical effect, + * but it's better to do a register to register copy than a conditional + * branch. + */ + from_this_page = from_this_cache_entry; + ctx = cache_entry; + } + + ret = sdma_txadd_page(pq->dd, &tx->txreq, + cache_entry->pages[page_index], + page_offset, from_this_page, + ctx, + sdma_mmu_rb_node_get, + sdma_mmu_rb_node_put); + if (ret) { + /* + * When there's a failure, the entire request is freed by + * user_sdma_send_pkts(). + */ + SDMA_DBG(req, + "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u", + ret, page_index, page_offset, from_this_page); + return ret; + } + start += from_this_page; + from_this_cache_entry -= from_this_page; + } + return 0; +} + +static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + struct user_sdma_iovec *iovec, + size_t from_this_iovec) +{ + while (from_this_iovec > 0) { + struct sdma_mmu_node *cache_entry; + size_t from_this_cache_entry; + size_t start; + int ret; + + start = (uintptr_t)iovec->iov.iov_base + iovec->offset; + ret = get_system_cache_entry(req, &cache_entry, start, + from_this_iovec); + if (ret) { + SDMA_DBG(req, "pin system segment failed %d", ret); + return ret; + } + + from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr); + if (from_this_cache_entry > from_this_iovec) + from_this_cache_entry = from_this_iovec; + + ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start, + from_this_cache_entry); + + /* + * Done adding cache_entry to zero or more sdma_desc. Can + * kref_put() the "safety" kref taken under + * get_system_cache_entry(). + */ + kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release); + + if (ret) { + SDMA_DBG(req, "add system segment failed %d", ret); + return ret; + } + + iovec->offset += from_this_cache_entry; + from_this_iovec -= from_this_cache_entry; + } + + return 0; +} + +/* + * Add up to pkt_data_remaining bytes to the txreq, starting at the current + * offset in the given iovec entry and continuing until all data has been added + * to the iovec or the iovec entry type changes. + * + * On success, prior to returning, adjust pkt_data_remaining, req->iov_idx, and + * the offset value in req->iov[req->iov_idx] to reflect the data that has been + * consumed. + */ +int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + struct user_sdma_iovec *iovec, + u32 *pkt_data_remaining) +{ + size_t remaining_to_add = *pkt_data_remaining; + /* + * Walk through iovec entries, ensure the associated pages + * are pinned and mapped, add data to the packet until no more + * data remains to be added or the iovec entry type changes. + */ + while (remaining_to_add > 0) { + struct user_sdma_iovec *cur_iovec; + size_t from_this_iovec; + int ret; + + cur_iovec = iovec; + from_this_iovec = iovec->iov.iov_len - iovec->offset; + + if (from_this_iovec > remaining_to_add) { + from_this_iovec = remaining_to_add; + } else { + /* The current iovec entry will be consumed by this pass. */ + req->iov_idx++; + iovec++; + } + + ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec, + from_this_iovec); + if (ret) + return ret; + + remaining_to_add -= from_this_iovec; + } + *pkt_data_remaining = remaining_to_add; + + return 0; +} + +static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, + unsigned long len) +{ + return (bool)(node->addr == addr); +} + +/* + * Return 1 to remove the node from the rb tree and call the remove op. + * + * Called with the rb tree lock held. + */ +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + struct evict_data *evict_data = evict_arg; + + /* this node will be evicted, add its pages to our count */ + evict_data->cleared += node->npages; + + /* have enough pages been cleared? */ + if (evict_data->cleared >= evict_data->target) + *stop = true; + + return 1; /* remove this node */ +} + +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + free_system_node(node); +} diff --git a/drivers/infiniband/hw/hfi1/pinning.h b/drivers/infiniband/hw/hfi1/pinning.h new file mode 100644 index 0000000000..a814a3aa96 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pinning.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2023 Cornelis Networks, Inc. + */ +#ifndef _HFI1_PINNING_H +#define _HFI1_PINNING_H + +struct hfi1_user_sdma_pkt_q; +struct user_sdma_request; +struct user_sdma_txreq; +struct user_sdma_iovec; + +int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq); +void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq); +int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + struct user_sdma_iovec *iovec, + u32 *pkt_data_remaining); + +#endif /* _HFI1_PINNING_H */ diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c new file mode 100644 index 0000000000..dfea53e0fd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -0,0 +1,2141 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015-2018 Intel Corporation. + */ + +#include <linux/delay.h> +#include "hfi.h" +#include "qp.h" +#include "trace.h" + +#define SC(name) SEND_CTXT_##name +/* + * Send Context functions + */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause); + +/* + * Set the CM reset bit and wait for it to clear. Use the provided + * sendctrl register. This routine has no locking. + */ +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) +{ + write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK); + while (1) { + udelay(1); + sendctrl = read_csr(dd, SEND_CTRL); + if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0) + break; + } +} + +/* global control of PIO send */ +void pio_send_control(struct hfi1_devdata *dd, int op) +{ + u64 reg, mask; + unsigned long flags; + int write = 1; /* write sendctrl back */ + int flush = 0; /* re-read sendctrl to make sure it is flushed */ + int i; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + reg = read_csr(dd, SEND_CTRL); + switch (op) { + case PSC_GLOBAL_ENABLE: + reg |= SEND_CTRL_SEND_ENABLE_SMASK; + fallthrough; + case PSC_DATA_VL_ENABLE: + mask = 0; + for (i = 0; i < ARRAY_SIZE(dd->vld); i++) + if (!dd->vld[i].mtu) + mask |= BIT_ULL(i); + /* Disallow sending on VLs not enabled */ + mask = (mask & SEND_CTRL_UNSUPPORTED_VL_MASK) << + SEND_CTRL_UNSUPPORTED_VL_SHIFT; + reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask; + break; + case PSC_GLOBAL_DISABLE: + reg &= ~SEND_CTRL_SEND_ENABLE_SMASK; + break; + case PSC_GLOBAL_VLARB_ENABLE: + reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK; + break; + case PSC_GLOBAL_VLARB_DISABLE: + reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK; + break; + case PSC_CM_RESET: + __cm_reset(dd, reg); + write = 0; /* CSR already written (and flushed) */ + break; + case PSC_DATA_VL_DISABLE: + reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK; + flush = 1; + break; + default: + dd_dev_err(dd, "%s: invalid control %d\n", __func__, op); + break; + } + + if (write) { + write_csr(dd, SEND_CTRL, reg); + if (flush) + (void)read_csr(dd, SEND_CTRL); /* flush write */ + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* number of send context memory pools */ +#define NUM_SC_POOLS 2 + +/* Send Context Size (SCS) wildcards */ +#define SCS_POOL_0 -1 +#define SCS_POOL_1 -2 + +/* Send Context Count (SCC) wildcards */ +#define SCC_PER_VL -1 +#define SCC_PER_CPU -2 +#define SCC_PER_KRCVQ -3 + +/* Send Context Size (SCS) constants */ +#define SCS_ACK_CREDITS 32 +#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */ + +#define PIO_THRESHOLD_CEILING 4096 + +#define PIO_WAIT_BATCH_SIZE 5 + +/* default send context sizes */ +static struct sc_config_sizes sc_config_sizes[SC_MAX] = { + [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_VL }, /* one per NUMA */ + [SC_ACK] = { .size = SCS_ACK_CREDITS, + .count = SCC_PER_KRCVQ }, + [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_CPU }, /* one per CPU */ + [SC_VL15] = { .size = SCS_VL15_CREDITS, + .count = 1 }, + +}; + +/* send context memory pool configuration */ +struct mem_pool_config { + int centipercent; /* % of memory, in 100ths of 1% */ + int absolute_blocks; /* absolute block count */ +}; + +/* default memory pool configuration: 100% in pool 0 */ +static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = { + /* centi%, abs blocks */ + { 10000, -1 }, /* pool 0 */ + { 0, -1 }, /* pool 1 */ +}; + +/* memory pool information, used when calculating final sizes */ +struct mem_pool_info { + int centipercent; /* + * 100th of 1% of memory to use, -1 if blocks + * already set + */ + int count; /* count of contexts in the pool */ + int blocks; /* block size of the pool */ + int size; /* context size, in blocks */ +}; + +/* + * Convert a pool wildcard to a valid pool index. The wildcards + * start at -1 and increase negatively. Map them as: + * -1 => 0 + * -2 => 1 + * etc. + * + * Return -1 on non-wildcard input, otherwise convert to a pool number. + */ +static int wildcard_to_pool(int wc) +{ + if (wc >= 0) + return -1; /* non-wildcard */ + return -wc - 1; +} + +static const char *sc_type_names[SC_MAX] = { + "kernel", + "ack", + "user", + "vl15" +}; + +static const char *sc_type_name(int index) +{ + if (index < 0 || index >= SC_MAX) + return "unknown"; + return sc_type_names[index]; +} + +/* + * Read the send context memory pool configuration and send context + * size configuration. Replace any wildcards and come up with final + * counts and sizes for the send context types. + */ +int init_sc_pools_and_sizes(struct hfi1_devdata *dd) +{ + struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } }; + int total_blocks = (chip_pio_mem_size(dd) / PIO_BLOCK_SIZE) - 1; + int total_contexts = 0; + int fixed_blocks; + int pool_blocks; + int used_blocks; + int cp_total; /* centipercent total */ + int ab_total; /* absolute block total */ + int extra; + int i; + + /* + * When SDMA is enabled, kernel context pio packet size is capped by + * "piothreshold". Reduce pio buffer allocation for kernel context by + * setting it to a fixed size. The allocation allows 3-deep buffering + * of the largest pio packets plus up to 128 bytes header, sufficient + * to maintain verbs performance. + * + * When SDMA is disabled, keep the default pooling allocation. + */ + if (HFI1_CAP_IS_KSET(SDMA)) { + u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ? + piothreshold : PIO_THRESHOLD_CEILING; + sc_config_sizes[SC_KERNEL].size = + 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE; + } + + /* + * Step 0: + * - copy the centipercents/absolute sizes from the pool config + * - sanity check these values + * - add up centipercents, then later check for full value + * - add up absolute blocks, then later check for over-commit + */ + cp_total = 0; + ab_total = 0; + for (i = 0; i < NUM_SC_POOLS; i++) { + int cp = sc_mem_pool_config[i].centipercent; + int ab = sc_mem_pool_config[i].absolute_blocks; + + /* + * A negative value is "unused" or "invalid". Both *can* + * be valid, but centipercent wins, so check that first + */ + if (cp >= 0) { /* centipercent valid */ + cp_total += cp; + } else if (ab >= 0) { /* absolute blocks valid */ + ab_total += ab; + } else { /* neither valid */ + dd_dev_err( + dd, + "Send context memory pool %d: both the block count and centipercent are invalid\n", + i); + return -EINVAL; + } + + mem_pool_info[i].centipercent = cp; + mem_pool_info[i].blocks = ab; + } + + /* do not use both % and absolute blocks for different pools */ + if (cp_total != 0 && ab_total != 0) { + dd_dev_err( + dd, + "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n"); + return -EINVAL; + } + + /* if any percentages are present, they must add up to 100% x 100 */ + if (cp_total != 0 && cp_total != 10000) { + dd_dev_err( + dd, + "Send context memory pool centipercent is %d, expecting 10000\n", + cp_total); + return -EINVAL; + } + + /* the absolute pool total cannot be more than the mem total */ + if (ab_total > total_blocks) { + dd_dev_err( + dd, + "Send context memory pool absolute block count %d is larger than the memory size %d\n", + ab_total, total_blocks); + return -EINVAL; + } + + /* + * Step 2: + * - copy from the context size config + * - replace context type wildcard counts with real values + * - add up non-memory pool block sizes + * - add up memory pool user counts + */ + fixed_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + int count = sc_config_sizes[i].count; + int size = sc_config_sizes[i].size; + int pool; + + /* + * Sanity check count: Either a positive value or + * one of the expected wildcards is valid. The positive + * value is checked later when we compare against total + * memory available. + */ + if (i == SC_ACK) { + count = dd->n_krcv_queues; + } else if (i == SC_KERNEL) { + count = INIT_SC_PER_VL * num_vls; + } else if (count == SCC_PER_CPU) { + count = dd->num_rcv_contexts - dd->n_krcv_queues; + } else if (count < 0) { + dd_dev_err( + dd, + "%s send context invalid count wildcard %d\n", + sc_type_name(i), count); + return -EINVAL; + } + if (total_contexts + count > chip_send_contexts(dd)) + count = chip_send_contexts(dd) - total_contexts; + + total_contexts += count; + + /* + * Sanity check pool: The conversion will return a pool + * number or -1 if a fixed (non-negative) value. The fixed + * value is checked later when we compare against + * total memory available. + */ + pool = wildcard_to_pool(size); + if (pool == -1) { /* non-wildcard */ + fixed_blocks += size * count; + } else if (pool < NUM_SC_POOLS) { /* valid wildcard */ + mem_pool_info[pool].count += count; + } else { /* invalid wildcard */ + dd_dev_err( + dd, + "%s send context invalid pool wildcard %d\n", + sc_type_name(i), size); + return -EINVAL; + } + + dd->sc_sizes[i].count = count; + dd->sc_sizes[i].size = size; + } + if (fixed_blocks > total_blocks) { + dd_dev_err( + dd, + "Send context fixed block count, %u, larger than total block count %u\n", + fixed_blocks, total_blocks); + return -EINVAL; + } + + /* step 3: calculate the blocks in the pools, and pool context sizes */ + pool_blocks = total_blocks - fixed_blocks; + if (ab_total > pool_blocks) { + dd_dev_err( + dd, + "Send context fixed pool sizes, %u, larger than pool block count %u\n", + ab_total, pool_blocks); + return -EINVAL; + } + /* subtract off the fixed pool blocks */ + pool_blocks -= ab_total; + + for (i = 0; i < NUM_SC_POOLS; i++) { + struct mem_pool_info *pi = &mem_pool_info[i]; + + /* % beats absolute blocks */ + if (pi->centipercent >= 0) + pi->blocks = (pool_blocks * pi->centipercent) / 10000; + + if (pi->blocks == 0 && pi->count != 0) { + dd_dev_err( + dd, + "Send context memory pool %d has %u contexts, but no blocks\n", + i, pi->count); + return -EINVAL; + } + if (pi->count == 0) { + /* warn about wasted blocks */ + if (pi->blocks != 0) + dd_dev_err( + dd, + "Send context memory pool %d has %u blocks, but zero contexts\n", + i, pi->blocks); + pi->size = 0; + } else { + pi->size = pi->blocks / pi->count; + } + } + + /* step 4: fill in the context type sizes from the pool sizes */ + used_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + if (dd->sc_sizes[i].size < 0) { + unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size); + + WARN_ON_ONCE(pool >= NUM_SC_POOLS); + dd->sc_sizes[i].size = mem_pool_info[pool].size; + } + /* make sure we are not larger than what is allowed by the HW */ +#define PIO_MAX_BLOCKS 1024 + if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS) + dd->sc_sizes[i].size = PIO_MAX_BLOCKS; + + /* calculate our total usage */ + used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count; + } + extra = total_blocks - used_blocks; + if (extra != 0) + dd_dev_info(dd, "unused send context blocks: %d\n", extra); + + return total_contexts; +} + +int init_send_contexts(struct hfi1_devdata *dd) +{ + u16 base; + int ret, i, j, context; + + ret = init_credit_return(dd); + if (ret) + return ret; + + dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8), + GFP_KERNEL); + dd->send_contexts = kcalloc(dd->num_send_contexts, + sizeof(struct send_context_info), + GFP_KERNEL); + if (!dd->send_contexts || !dd->hw_to_sw) { + kfree(dd->hw_to_sw); + kfree(dd->send_contexts); + free_credit_return(dd); + return -ENOMEM; + } + + /* hardware context map starts with invalid send context indices */ + for (i = 0; i < TXE_NUM_CONTEXTS; i++) + dd->hw_to_sw[i] = INVALID_SCI; + + /* + * All send contexts have their credit sizes. Allocate credits + * for each context one after another from the global space. + */ + context = 0; + base = 1; + for (i = 0; i < SC_MAX; i++) { + struct sc_config_sizes *scs = &dd->sc_sizes[i]; + + for (j = 0; j < scs->count; j++) { + struct send_context_info *sci = + &dd->send_contexts[context]; + sci->type = i; + sci->base = base; + sci->credits = scs->size; + + context++; + base += scs->size; + } + } + + return 0; +} + +/* + * Allocate a software index and hardware context of the given type. + * + * Must be called with dd->sc_lock held. + */ +static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index, + u32 *hw_context) +{ + struct send_context_info *sci; + u32 index; + u32 context; + + for (index = 0, sci = &dd->send_contexts[0]; + index < dd->num_send_contexts; index++, sci++) { + if (sci->type == type && sci->allocated == 0) { + sci->allocated = 1; + /* use a 1:1 mapping, but make them non-equal */ + context = chip_send_contexts(dd) - index - 1; + dd->hw_to_sw[context] = index; + *sw_index = index; + *hw_context = context; + return 0; /* success */ + } + } + dd_dev_err(dd, "Unable to locate a free type %d send context\n", type); + return -ENOSPC; +} + +/* + * Free the send context given by its software index. + * + * Must be called with dd->sc_lock held. + */ +static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context) +{ + struct send_context_info *sci; + + sci = &dd->send_contexts[sw_index]; + if (!sci->allocated) { + dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n", + __func__, sw_index, hw_context); + } + sci->allocated = 0; + dd->hw_to_sw[hw_context] = INVALID_SCI; +} + +/* return the base context of a context in a group */ +static inline u32 group_context(u32 context, u32 group) +{ + return (context >> group) << group; +} + +/* return the size of a group */ +static inline u32 group_size(u32 group) +{ + return 1 << group; +} + +/* + * Obtain the credit return addresses, kernel virtual and bus, for the + * given sc. + * + * To understand this routine: + * o va and dma are arrays of struct credit_return. One for each physical + * send context, per NUMA. + * o Each send context always looks in its relative location in a struct + * credit_return for its credit return. + * o Each send context in a group must have its return address CSR programmed + * with the same value. Use the address of the first send context in the + * group. + */ +static void cr_group_addresses(struct send_context *sc, dma_addr_t *dma) +{ + u32 gc = group_context(sc->hw_context, sc->group); + u32 index = sc->hw_context & 0x7; + + sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index]; + *dma = (unsigned long) + &((struct credit_return *)sc->dd->cr_base[sc->node].dma)[gc]; +} + +/* + * Work queue function triggered in error interrupt routine for + * kernel contexts. + */ +static void sc_halted(struct work_struct *work) +{ + struct send_context *sc; + + sc = container_of(work, struct send_context, halt_work); + sc_restart(sc); +} + +/* + * Calculate PIO block threshold for this send context using the given MTU. + * Trigger a return when one MTU plus optional header of credits remain. + * + * Parameter mtu is in bytes. + * Parameter hdrqentsize is in DWORDs. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) +{ + u32 release_credits; + u32 threshold; + + /* add in the header size, then divide by the PIO block size */ + mtu += hdrqentsize << 2; + release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE); + + /* check against this context's credits */ + if (sc->credits <= release_credits) + threshold = 1; + else + threshold = sc->credits - release_credits; + + return threshold; +} + +/* + * Calculate credit threshold in terms of percent of the allocated credits. + * Trigger when unreturned credits equal or exceed the percentage of the whole. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +{ + return (sc->credits * percent) / 100; +} + +/* + * Set the credit return threshold. + */ +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold) +{ + unsigned long flags; + u32 old_threshold; + int force_return = 0; + + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + + old_threshold = (sc->credit_ctrl >> + SC(CREDIT_CTRL_THRESHOLD_SHIFT)) + & SC(CREDIT_CTRL_THRESHOLD_MASK); + + if (new_threshold != old_threshold) { + sc->credit_ctrl = + (sc->credit_ctrl + & ~SC(CREDIT_CTRL_THRESHOLD_SMASK)) + | ((new_threshold + & SC(CREDIT_CTRL_THRESHOLD_MASK)) + << SC(CREDIT_CTRL_THRESHOLD_SHIFT)); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + + /* force a credit return on change to avoid a possible stall */ + force_return = 1; + } + + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); + + if (force_return) + sc_return_credits(sc); +} + +/* + * set_pio_integrity + * + * Set the CHECK_ENABLE register for the send context 'sc'. + */ +void set_pio_integrity(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u32 hw_context = sc->hw_context; + int type = sc->type; + + write_kctxt_csr(dd, hw_context, + SC(CHECK_ENABLE), + hfi1_pkt_default_send_ctxt_mask(dd, type)); +} + +static u32 get_buffers_allocated(struct send_context *sc) +{ + int cpu; + u32 ret = 0; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(sc->buffers_allocated, cpu); + return ret; +} + +static void reset_buffers_allocated(struct send_context *sc) +{ + int cpu; + + for_each_possible_cpu(cpu) + (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0; +} + +/* + * Allocate a NUMA relative send context structure of the given type along + * with a HW context. + */ +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa) +{ + struct send_context_info *sci; + struct send_context *sc = NULL; + dma_addr_t dma; + unsigned long flags; + u64 reg; + u32 thresh; + u32 sw_index; + u32 hw_context; + int ret; + u8 opval, opmask; + + /* do not allocate while frozen */ + if (dd->flags & HFI1_FROZEN) + return NULL; + + sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa); + if (!sc) + return NULL; + + sc->buffers_allocated = alloc_percpu(u32); + if (!sc->buffers_allocated) { + kfree(sc); + dd_dev_err(dd, + "Cannot allocate buffers_allocated per cpu counters\n" + ); + return NULL; + } + + spin_lock_irqsave(&dd->sc_lock, flags); + ret = sc_hw_alloc(dd, type, &sw_index, &hw_context); + if (ret) { + spin_unlock_irqrestore(&dd->sc_lock, flags); + free_percpu(sc->buffers_allocated); + kfree(sc); + return NULL; + } + + sci = &dd->send_contexts[sw_index]; + sci->sc = sc; + + sc->dd = dd; + sc->node = numa; + sc->type = type; + spin_lock_init(&sc->alloc_lock); + spin_lock_init(&sc->release_lock); + spin_lock_init(&sc->credit_ctrl_lock); + seqlock_init(&sc->waitlock); + INIT_LIST_HEAD(&sc->piowait); + INIT_WORK(&sc->halt_work, sc_halted); + init_waitqueue_head(&sc->halt_wait); + + /* grouping is always single context for now */ + sc->group = 0; + + sc->sw_index = sw_index; + sc->hw_context = hw_context; + cr_group_addresses(sc, &dma); + sc->credits = sci->credits; + sc->size = sc->credits * PIO_BLOCK_SIZE; + +/* PIO Send Memory Address details */ +#define PIO_ADDR_CONTEXT_MASK 0xfful +#define PIO_ADDR_CONTEXT_SHIFT 16 + sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK) + << PIO_ADDR_CONTEXT_SHIFT); + + /* set base and credits */ + reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK)) + << SC(CTRL_CTXT_DEPTH_SHIFT)) + | ((sci->base & SC(CTRL_CTXT_BASE_MASK)) + << SC(CTRL_CTXT_BASE_SHIFT)); + write_kctxt_csr(dd, hw_context, SC(CTRL), reg); + + set_pio_integrity(sc); + + /* unmask all errors */ + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1); + + /* set the default partition key */ + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), + (SC(CHECK_PARTITION_KEY_VALUE_MASK) & + DEFAULT_PKEY) << + SC(CHECK_PARTITION_KEY_VALUE_SHIFT)); + + /* per context type checks */ + if (type == SC_USER) { + opval = USER_OPCODE_CHECK_VAL; + opmask = USER_OPCODE_CHECK_MASK; + } else { + opval = OPCODE_CHECK_VAL_DISABLED; + opmask = OPCODE_CHECK_MASK_DISABLED; + } + + /* set the send context check opcode mask and value */ + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), + ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) | + ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT))); + + /* set up credit return */ + reg = dma & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg); + + /* + * Calculate the initial credit return threshold. + * + * For Ack contexts, set a threshold for half the credits. + * For User contexts use the given percentage. This has been + * sanitized on driver start-up. + * For Kernel contexts, use the default MTU plus a header + * or half the credits, whichever is smaller. This should + * work for both the 3-deep buffering allocation and the + * pooling allocation. + */ + if (type == SC_ACK) { + thresh = sc_percent_to_threshold(sc, 50); + } else if (type == SC_USER) { + thresh = sc_percent_to_threshold(sc, + user_credit_return_threshold); + } else { /* kernel */ + thresh = min(sc_percent_to_threshold(sc, 50), + sc_mtu_to_threshold(sc, hfi1_max_mtu, + hdrqentsize)); + } + reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); + /* add in early return */ + if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN)) + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */ + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + + /* set up write-through credit_ctrl */ + sc->credit_ctrl = reg; + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg); + + /* User send contexts should not allow sending on VL15 */ + if (type == SC_USER) { + reg = 1ULL << 15; + write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg); + } + + spin_unlock_irqrestore(&dd->sc_lock, flags); + + /* + * Allocate shadow ring to track outstanding PIO buffers _after_ + * unlocking. We don't know the size until the lock is held and + * we can't allocate while the lock is held. No one is using + * the context yet, so allocate it now. + * + * User contexts do not get a shadow ring. + */ + if (type != SC_USER) { + /* + * Size the shadow ring 1 larger than the number of credits + * so head == tail can mean empty. + */ + sc->sr_size = sci->credits + 1; + sc->sr = kcalloc_node(sc->sr_size, + sizeof(union pio_shadow_ring), + GFP_KERNEL, numa); + if (!sc->sr) { + sc_free(sc); + return NULL; + } + } + + hfi1_cdbg(PIO, + "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u", + sw_index, + hw_context, + sc_type_name(type), + sc->group, + sc->credits, + sc->credit_ctrl, + thresh); + + return sc; +} + +/* free a per-NUMA send context structure */ +void sc_free(struct send_context *sc) +{ + struct hfi1_devdata *dd; + unsigned long flags; + u32 sw_index; + u32 hw_context; + + if (!sc) + return; + + sc->flags |= SCF_IN_FREE; /* ensure no restarts */ + dd = sc->dd; + if (!list_empty(&sc->piowait)) + dd_dev_err(dd, "piowait list not empty!\n"); + sw_index = sc->sw_index; + hw_context = sc->hw_context; + sc_disable(sc); /* make sure the HW is disabled */ + flush_work(&sc->halt_work); + + spin_lock_irqsave(&dd->sc_lock, flags); + dd->send_contexts[sw_index].sc = NULL; + + /* clear/disable all registers set in sc_alloc */ + write_kctxt_csr(dd, hw_context, SC(CTRL), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0); + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0); + + /* release the index and context for re-use */ + sc_hw_free(dd, sw_index, hw_context); + spin_unlock_irqrestore(&dd->sc_lock, flags); + + kfree(sc->sr); + free_percpu(sc->buffers_allocated); + kfree(sc); +} + +/* disable the context */ +void sc_disable(struct send_context *sc) +{ + u64 reg; + struct pio_buf *pbuf; + LIST_HEAD(wake_list); + + if (!sc) + return; + + /* do all steps, even if already disabled */ + spin_lock_irq(&sc->alloc_lock); + reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL)); + reg &= ~SC(CTRL_CTXT_ENABLE_SMASK); + sc->flags &= ~SCF_ENABLED; + sc_wait_for_packet_egress(sc, 1); + write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg); + + /* + * Flush any waiters. Once the context is disabled, + * credit return interrupts are stopped (although there + * could be one in-process when the context is disabled). + * Wait one microsecond for any lingering interrupts, then + * proceed with the flush. + */ + udelay(1); + spin_lock(&sc->release_lock); + if (sc->sr) { /* this context has a shadow ring */ + while (sc->sr_tail != sc->sr_head) { + pbuf = &sc->sr[sc->sr_tail].pbuf; + if (pbuf->cb) + (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE); + sc->sr_tail++; + if (sc->sr_tail >= sc->sr_size) + sc->sr_tail = 0; + } + } + spin_unlock(&sc->release_lock); + + write_seqlock(&sc->waitlock); + list_splice_init(&sc->piowait, &wake_list); + write_sequnlock(&sc->waitlock); + while (!list_empty(&wake_list)) { + struct iowait *wait; + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + + wait = list_first_entry(&wake_list, struct iowait, list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); + } + + spin_unlock_irq(&sc->alloc_lock); +} + +/* return SendEgressCtxtStatus.PacketOccupancy */ +static u64 packet_occupancy(u64 reg) +{ + return (reg & + SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK) + >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT; +} + +/* is egress halted on the context? */ +static bool egress_halted(u64 reg) +{ + return !!(reg & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK); +} + +/* is the send context halted? */ +static bool is_sc_halted(struct hfi1_devdata *dd, u32 hw_context) +{ + return !!(read_kctxt_csr(dd, hw_context, SC(STATUS)) & + SC(STATUS_CTXT_HALTED_SMASK)); +} + +/** + * sc_wait_for_packet_egress - wait for packet + * @sc: valid send context + * @pause: wait for credit return + * + * Wait for packet egress, optionally pause for credit return + * + * Egress halt and Context halt are not necessarily the same thing, so + * check for both. + * + * NOTE: The context halt bit may not be set immediately. Because of this, + * it is necessary to check the SW SFC_HALTED bit (set in the IRQ) and the HW + * context bit to determine if the context is halted. + */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg = 0; + u64 reg_prev; + u32 loop = 0; + + while (1) { + reg_prev = reg; + reg = read_csr(dd, sc->hw_context * 8 + + SEND_EGRESS_CTXT_STATUS); + /* done if any halt bits, SW or HW are set */ + if (sc->flags & SCF_HALTED || + is_sc_halted(dd, sc->hw_context) || egress_halted(reg)) + break; + reg = packet_occupancy(reg); + if (reg == 0) + break; + /* counter is reset if occupancy count changes */ + if (reg != reg_prev) + loop = 0; + if (loop > 50000) { + /* timed out - bounce the link */ + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sc->sw_index, + sc->hw_context, (u32)reg); + queue_work(dd->pport->link_wq, + &dd->pport->link_bounce_work); + break; + } + loop++; + udelay(1); + } + + if (pause) + /* Add additional delay to ensure chip returns all credits */ + pause_for_credit_return(dd); +} + +void sc_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + struct send_context *sc = dd->send_contexts[i].sc; + + if (!sc) + continue; + sc_wait_for_packet_egress(sc, 0); + } +} + +/* + * Restart a context after it has been halted due to error. + * + * If the first step fails - wait for the halt to be asserted, return early. + * Otherwise complain about timeouts but keep going. + * + * It is expected that allocations (enabled flag bit) have been shut off + * already (only applies to kernel contexts). + */ +int sc_restart(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u32 loop; + int count; + + /* bounce off if not halted, or being free'd */ + if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE)) + return -EINVAL; + + dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index, + sc->hw_context); + + /* + * Step 1: Wait for the context to actually halt. + * + * The error interrupt is asynchronous to actually setting halt + * on the context. + */ + loop = 0; + while (1) { + reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS)); + if (reg & SC(STATUS_CTXT_HALTED_SMASK)) + break; + if (loop > 100) { + dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n", + __func__, sc->sw_index, sc->hw_context); + return -ETIME; + } + loop++; + udelay(1); + } + + /* + * Step 2: Ensure no users are still trying to write to PIO. + * + * For kernel contexts, we have already turned off buffer allocation. + * Now wait for the buffer count to go to zero. + * + * For user contexts, the user handling code has cut off write access + * to the context's PIO pages before calling this routine and will + * restore write access after this routine returns. + */ + if (sc->type != SC_USER) { + /* kernel context */ + loop = 0; + while (1) { + count = get_buffers_allocated(sc); + if (count == 0) + break; + if (loop > 100) { + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n", + __func__, sc->sw_index, + sc->hw_context, count); + } + loop++; + udelay(1); + } + } + + /* + * Step 3: Wait for all packets to egress. + * This is done while disabling the send context + * + * Step 4: Disable the context + * + * This is a superset of the halt. After the disable, the + * errors can be cleared. + */ + sc_disable(sc); + + /* + * Step 5: Enable the context + * + * This enable will clear the halted flag and per-send context + * error flags. + */ + return sc_enable(sc); +} + +/* + * PIO freeze processing. To be called after the TXE block is fully frozen. + * Go through all frozen send contexts and disable them. The contexts are + * already stopped by the freeze. + */ +void pio_freeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + /* + * Don't disable unallocated, unfrozen, or user send contexts. + * User send contexts will be disabled when the process + * calls into the driver to reset its context. + */ + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + /* only need to disable, the context is already stopped */ + sc_disable(sc); + } +} + +/* + * Unfreeze PIO for kernel send contexts. The precondition for calling this + * is that all PIO send contexts have been disabled and the SPC freeze has + * been cleared. Now perform the last step and re-enable each kernel context. + * User (PSM) processing will occur when PSM calls into the kernel to + * acknowledge the freeze. + */ +void pio_kernel_unfreeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + if (sc->flags & SCF_LINK_DOWN) + continue; + + sc_enable(sc); /* will clear the sc frozen flag */ + } +} + +/** + * pio_kernel_linkup() - Re-enable send contexts after linkup event + * @dd: valid devive data + * + * When the link goes down, the freeze path is taken. However, a link down + * event is different from a freeze because if the send context is re-enabled + * whowever is sending data will start sending data again, which will hang + * any QP that is sending data. + * + * The freeze path now looks at the type of event that occurs and takes this + * path for link down event. + */ +void pio_kernel_linkup(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_LINK_DOWN) || sc->type == SC_USER) + continue; + + sc_enable(sc); /* will clear the sc link down flag */ + } +} + +/* + * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear. + * Returns: + * -ETIMEDOUT - if we wait too long + * -EIO - if there was an error + */ +static int pio_init_wait_progress(struct hfi1_devdata *dd) +{ + u64 reg; + int max, count = 0; + + /* max is the longest possible HW init time / delay */ + max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5; + while (1) { + reg = read_csr(dd, SEND_PIO_INIT_CTXT); + if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK)) + break; + if (count >= max) + return -ETIMEDOUT; + udelay(5); + count++; + } + + return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0; +} + +/* + * Reset all of the send contexts to their power-on state. Used + * only during manual init - no lock against sc_enable needed. + */ +void pio_reset_all(struct hfi1_devdata *dd) +{ + int ret; + + /* make sure the init engine is not busy */ + ret = pio_init_wait_progress(dd); + /* ignore any timeout */ + if (ret == -EIO) { + /* clear the error */ + write_csr(dd, SEND_PIO_ERR_CLEAR, + SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK); + } + + /* reset init all */ + write_csr(dd, SEND_PIO_INIT_CTXT, + SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK); + udelay(2); + ret = pio_init_wait_progress(dd); + if (ret < 0) { + dd_dev_err(dd, + "PIO send context init %s while initializing all PIO blocks\n", + ret == -ETIMEDOUT ? "is stuck" : "had an error"); + } +} + +/* enable the context */ +int sc_enable(struct send_context *sc) +{ + u64 sc_ctrl, reg, pio; + struct hfi1_devdata *dd; + unsigned long flags; + int ret = 0; + + if (!sc) + return -EINVAL; + dd = sc->dd; + + /* + * Obtain the allocator lock to guard against any allocation + * attempts (which should not happen prior to context being + * enabled). On the release/disable side we don't need to + * worry about locking since the releaser will not do anything + * if the context accounting values have not changed. + */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK))) + goto unlock; /* already enabled */ + + /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */ + + *sc->hw_free = 0; + sc->free = 0; + sc->alloc_free = 0; + sc->fill = 0; + sc->fill_wrap = 0; + sc->sr_head = 0; + sc->sr_tail = 0; + sc->flags = 0; + /* the alloc lock insures no fast path allocation */ + reset_buffers_allocated(sc); + + /* + * Clear all per-context errors. Some of these will be set when + * we are re-enabling after a context halt. Now that the context + * is disabled, the halt will not clear until after the PIO init + * engine runs below. + */ + reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS)); + if (reg) + write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg); + + /* + * The HW PIO initialization engine can handle only one init + * request at a time. Serialize access to each device's engine. + */ + spin_lock(&dd->sc_init_lock); + /* + * Since access to this code block is serialized and + * each access waits for the initialization to complete + * before releasing the lock, the PIO initialization engine + * should not be in use, so we don't have to wait for the + * InProgress bit to go down. + */ + pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) << + SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) | + SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK; + write_csr(dd, SEND_PIO_INIT_CTXT, pio); + /* + * Wait until the engine is done. Give the chip the required time + * so, hopefully, we read the register just once. + */ + udelay(2); + ret = pio_init_wait_progress(dd); + spin_unlock(&dd->sc_init_lock); + if (ret) { + dd_dev_err(dd, + "sctxt%u(%u): Context not enabled due to init failure %d\n", + sc->sw_index, sc->hw_context, ret); + goto unlock; + } + + /* + * All is well. Enable the context. + */ + sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK); + write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl); + /* + * Read SendCtxtCtrl to force the write out and prevent a timing + * hazard where a PIO write may reach the context before the enable. + */ + read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + sc->flags |= SCF_ENABLED; + +unlock: + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + return ret; +} + +/* force a credit return on the context */ +void sc_return_credits(struct send_context *sc) +{ + if (!sc) + return; + + /* a 0->1 transition schedules a credit return */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), + SC(CREDIT_FORCE_FORCE_RETURN_SMASK)); + /* + * Ensure that the write is flushed and the credit return is + * scheduled. We care more about the 0 -> 1 transition. + */ + read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE)); + /* set back to 0 for next time */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0); +} + +/* allow all in-flight packets to drain on the context */ +void sc_flush(struct send_context *sc) +{ + if (!sc) + return; + + sc_wait_for_packet_egress(sc, 1); +} + +/* drop all packets on the context, no waiting until they are sent */ +void sc_drop(struct send_context *sc) +{ + if (!sc) + return; + + dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", + __func__, sc->sw_index, sc->hw_context); +} + +/* + * Start the software reaction to a context halt or SPC freeze: + * - mark the context as halted or frozen + * - stop buffer allocations + * + * Called from the error interrupt. Other work is deferred until + * out of the interrupt. + */ +void sc_stop(struct send_context *sc, int flag) +{ + unsigned long flags; + + /* stop buffer allocations */ + spin_lock_irqsave(&sc->alloc_lock, flags); + /* mark the context */ + sc->flags |= flag; + sc->flags &= ~SCF_ENABLED; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + wake_up(&sc->halt_wait); +} + +#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32)) +#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS) + +/* + * The send context buffer "allocator". + * + * @sc: the PIO send context we are allocating from + * @len: length of whole packet - including PBC - in dwords + * @cb: optional callback to call when the buffer is finished sending + * @arg: argument for cb + * + * Return a pointer to a PIO buffer, NULL if not enough room, -ECOMM + * when link is down. + */ +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg) +{ + struct pio_buf *pbuf = NULL; + unsigned long flags; + unsigned long avail; + unsigned long blocks = dwords_to_blocks(dw_len); + u32 fill_wrap; + int trycount = 0; + u32 head, next; + + spin_lock_irqsave(&sc->alloc_lock, flags); + if (!(sc->flags & SCF_ENABLED)) { + spin_unlock_irqrestore(&sc->alloc_lock, flags); + return ERR_PTR(-ECOMM); + } + +retry: + avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* not enough room */ + if (unlikely(trycount)) { /* already tried to get more room */ + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + /* copy from receiver cache line and recalculate */ + sc->alloc_free = READ_ONCE(sc->free); + avail = + (unsigned long)sc->credits - + (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* still no room, actively update */ + sc_release_update(sc); + sc->alloc_free = READ_ONCE(sc->free); + trycount++; + goto retry; + } + } + + /* there is enough room */ + + preempt_disable(); + this_cpu_inc(*sc->buffers_allocated); + + /* read this once */ + head = sc->sr_head; + + /* "allocate" the buffer */ + sc->fill += blocks; + fill_wrap = sc->fill_wrap; + sc->fill_wrap += blocks; + if (sc->fill_wrap >= sc->credits) + sc->fill_wrap = sc->fill_wrap - sc->credits; + + /* + * Fill the parts that the releaser looks at before moving the head. + * The only necessary piece is the sent_at field. The credits + * we have just allocated cannot have been returned yet, so the + * cb and arg will not be looked at for a "while". Put them + * on this side of the memory barrier anyway. + */ + pbuf = &sc->sr[head].pbuf; + pbuf->sent_at = sc->fill; + pbuf->cb = cb; + pbuf->arg = arg; + pbuf->sc = sc; /* could be filled in at sc->sr init time */ + /* make sure this is in memory before updating the head */ + + /* calculate next head index, do not store */ + next = head + 1; + if (next >= sc->sr_size) + next = 0; + /* + * update the head - must be last! - the releaser can look at fields + * in pbuf once we move the head + */ + smp_wmb(); + sc->sr_head = next; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* finish filling in the buffer outside the lock */ + pbuf->start = sc->base_addr + fill_wrap * PIO_BLOCK_SIZE; + pbuf->end = sc->base_addr + sc->size; + pbuf->qw_written = 0; + pbuf->carry_bytes = 0; + pbuf->carry.val64 = 0; +done: + return pbuf; +} + +/* + * There are at least two entities that can turn on credit return + * interrupts and they can overlap. Avoid problems by implementing + * a count scheme that is enforced by a lock. The lock is needed because + * the count and CSR write must be paired. + */ + +/* + * Start credit return interrupts. This is managed by a count. If already + * on, just increment the count. + */ +void sc_add_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + if (sc->credit_intr_count == 0) { + sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + sc->credit_intr_count++; + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * Stop credit return interrupts. This is managed by a count. Decrement the + * count, if the last user, then turn the credit interrupts off. + */ +void sc_del_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + WARN_ON(sc->credit_intr_count == 0); + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + sc->credit_intr_count--; + if (sc->credit_intr_count == 0) { + sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * The caller must be careful when calling this. All needint calls + * must be paired with !needint. + */ +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint) +{ + if (needint) + sc_add_credit_return_intr(sc); + else + sc_del_credit_return_intr(sc); + trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl); + if (needint) + sc_return_credits(sc); +} + +/** + * sc_piobufavail - callback when a PIO buffer is available + * @sc: the send context + * + * This is called from the interrupt handler when a PIO buffer is + * available after hfi1_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +static void sc_piobufavail(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + struct list_head *list; + struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE]; + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + unsigned long flags; + uint i, n = 0, top_idx = 0; + + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && + dd->send_contexts[sc->sw_index].type != SC_VL15) + return; + list = &sc->piowait; + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. + */ + write_seqlock_irqsave(&sc->waitlock, flags); + while (!list_empty(list)) { + struct iowait *wait; + + if (n == ARRAY_SIZE(qps)) + break; + wait = list_first_entry(list, struct iowait, list); + iowait_get_priority(wait); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + if (n) { + priv = qps[top_idx]->priv; + top_idx = iowait_priority_update_top(wait, + &priv->s_iowait, + n, top_idx); + } + + /* refcount held until actual wake up */ + qps[n++] = qp; + } + /* + * If there had been waiters and there are more + * insure that we redo the force to avoid a potential hang. + */ + if (n) { + hfi1_sc_wantpiobuf_intr(sc, 0); + if (!list_empty(list)) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock_irqrestore(&sc->waitlock, flags); + + /* Wake up the top-priority one first */ + if (n) + hfi1_qp_wakeup(qps[top_idx], + RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); + for (i = 0; i < n; i++) + if (i != top_idx) + hfi1_qp_wakeup(qps[i], + RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); +} + +/* translate a send credit update to a bit code of reasons */ +static inline int fill_code(u64 hw_free) +{ + int code = 0; + + if (hw_free & CR_STATUS_SMASK) + code |= PRC_STATUS_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK) + code |= PRC_PBC; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK) + code |= PRC_THRESHOLD; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK) + code |= PRC_FILL_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK) + code |= PRC_SC_DISABLE; + return code; +} + +/* use the jiffies compare to get the wrap right */ +#define sent_before(a, b) time_before(a, b) /* a < b */ + +/* + * The send context buffer "releaser". + */ +void sc_release_update(struct send_context *sc) +{ + struct pio_buf *pbuf; + u64 hw_free; + u32 head, tail; + unsigned long old_free; + unsigned long free; + unsigned long extra; + unsigned long flags; + int code; + + if (!sc) + return; + + spin_lock_irqsave(&sc->release_lock, flags); + /* update free */ + hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */ + old_free = sc->free; + extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT) + - (old_free & CR_COUNTER_MASK)) + & CR_COUNTER_MASK; + free = old_free + extra; + trace_hfi1_piofree(sc, extra); + + /* call sent buffer callbacks */ + code = -1; /* code not yet set */ + head = READ_ONCE(sc->sr_head); /* snapshot the head */ + tail = sc->sr_tail; + while (head != tail) { + pbuf = &sc->sr[tail].pbuf; + + if (sent_before(free, pbuf->sent_at)) { + /* not sent yet */ + break; + } + if (pbuf->cb) { + if (code < 0) /* fill in code on first user */ + code = fill_code(hw_free); + (*pbuf->cb)(pbuf->arg, code); + } + + tail++; + if (tail >= sc->sr_size) + tail = 0; + } + sc->sr_tail = tail; + /* make sure tail is updated before free */ + smp_wmb(); + sc->free = free; + spin_unlock_irqrestore(&sc->release_lock, flags); + sc_piobufavail(sc); +} + +/* + * Send context group releaser. Argument is the send context that caused + * the interrupt. Called from the send context interrupt handler. + * + * Call release on all contexts in the group. + * + * This routine takes the sc_lock without an irqsave because it is only + * called from an interrupt handler. Adjust if that changes. + */ +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context) +{ + struct send_context *sc; + u32 sw_index; + u32 gc, gc_end; + + spin_lock(&dd->sc_lock); + sw_index = dd->hw_to_sw[hw_context]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + goto done; + } + sc = dd->send_contexts[sw_index].sc; + if (unlikely(!sc)) + goto done; + + gc = group_context(hw_context, sc->group); + gc_end = gc + group_size(sc->group); + for (; gc < gc_end; gc++) { + sw_index = dd->hw_to_sw[gc]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, + "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + continue; + } + sc_release_update(dd->send_contexts[sw_index].sc); + } +done: + spin_unlock(&dd->sc_lock); +} + +/* + * pio_select_send_context_vl() - select send context + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * This function returns a send context based on the selector and a vl. + * The mapping fields are protected by RCU + */ +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl) +{ + struct pio_vl_map *m; + struct pio_map_elem *e; + struct send_context *rval; + + /* + * NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return VL0's send context below + */ + if (unlikely(vl >= num_vls)) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->pio_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return dd->vld[0].sc; + } + e = m->map[vl & m->mask]; + rval = e->ksc[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? dd->vld[0].sc : rval; + return rval; +} + +/* + * pio_select_send_context_sc() - select send context + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * This function returns an send context based on the selector and an sc + */ +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return pio_select_send_context_vl(dd, selector, vl); +} + +/* + * Free the indicated map struct + */ +static void pio_map_free(struct pio_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void pio_map_rcu_callback(struct rcu_head *list) +{ + struct pio_vl_map *m = container_of(list, struct pio_vl_map, list); + + pio_map_free(m); +} + +/* + * Set credit return threshold for the kernel send context + */ +static void set_threshold(struct hfi1_devdata *dd, int scontext, int i) +{ + u32 thres; + + thres = min(sc_percent_to_threshold(dd->kernel_send_context[scontext], + 50), + sc_mtu_to_threshold(dd->kernel_send_context[scontext], + dd->vld[i].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->kernel_send_context[scontext], thres); +} + +/* + * pio_map_init - called when #vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_scontexts: per vl send context mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_scontexts is used to specify a non-uniform vl/send context + * loading. NULL implies auto computing the loading and giving each + * VL an uniform distribution of send contexts per VL. + * + * The auto algorithm computers the sc_per_vl and the number of extra + * send contexts. Any extra send contexts are added from the last VL + * on down + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_send_contexts are non-power of 2, the + * array sizes in the struct pio_vl_map and the struct pio_map_elem are + * rounded up to the next highest power of 2 and the first entry is + * reused in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is not + * chaged. + * + */ +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) +{ + int i, j; + int extra, sc_per_vl; + int scontext = 1; + int num_kernel_send_contexts = 0; + u8 lvl_scontexts[OPA_MAX_VLS]; + struct pio_vl_map *oldmap, *newmap; + + if (!vl_scontexts) { + for (i = 0; i < dd->num_send_contexts; i++) + if (dd->send_contexts[i].type == SC_KERNEL) + num_kernel_send_contexts++; + /* truncate divide */ + sc_per_vl = num_kernel_send_contexts / num_vls; + /* extras */ + extra = num_kernel_send_contexts % num_vls; + vl_scontexts = lvl_scontexts; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0); + } + /* build new map */ + newmap = kzalloc(struct_size(newmap, map, roundup_pow_of_two(num_vls)), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_scontext = scontext; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_scontexts[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc(struct_size(newmap->map[i], + ksc, sz), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* + * assign send contexts and + * adjust credit return threshold + */ + for (j = 0; j < sz; j++) { + if (dd->kernel_send_context[scontext]) { + newmap->map[i]->ksc[j] = + dd->kernel_send_context[scontext]; + set_threshold(dd, scontext, i); + } + if (++scontext >= first_scontext + + vl_scontexts[i]) + /* wrap back to first send context */ + scontext = first_scontext; + } + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + scontext = first_scontext + vl_scontexts[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->pio_map_lock); + oldmap = rcu_dereference_protected(dd->pio_map, + lockdep_is_held(&dd->pio_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->pio_map, newmap); + + spin_unlock_irq(&dd->pio_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, pio_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + pio_map_free(newmap); + return -ENOMEM; +} + +void free_pio_map(struct hfi1_devdata *dd) +{ + /* Free PIO map if allocated */ + if (rcu_access_pointer(dd->pio_map)) { + spin_lock_irq(&dd->pio_map_lock); + pio_map_free(rcu_access_pointer(dd->pio_map)); + RCU_INIT_POINTER(dd->pio_map, NULL); + spin_unlock_irq(&dd->pio_map_lock); + synchronize_rcu(); + } + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; +} + +int init_pervl_scs(struct hfi1_devdata *dd) +{ + int i; + u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */ + u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */ + u32 ctxt; + struct hfi1_pportdata *ppd = dd->pport; + + dd->vld[15].sc = sc_alloc(dd, SC_VL15, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[15].sc) + return -ENOMEM; + + hfi1_init_ctxt(dd->vld[15].sc); + dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); + + dd->kernel_send_context = kcalloc_node(dd->num_send_contexts, + sizeof(struct send_context *), + GFP_KERNEL, dd->node); + if (!dd->kernel_send_context) + goto freesc15; + + dd->kernel_send_context[0] = dd->vld[15].sc; + + for (i = 0; i < num_vls; i++) { + /* + * Since this function does not deal with a specific + * receive context but we need the RcvHdrQ entry size, + * use the size from rcd[0]. It is guaranteed to be + * valid at this point and will remain the same for all + * receive contexts. + */ + dd->vld[i].sc = sc_alloc(dd, SC_KERNEL, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[i].sc) + goto nomem; + dd->kernel_send_context[i + 1] = dd->vld[i].sc; + hfi1_init_ctxt(dd->vld[i].sc); + /* non VL15 start with the max MTU */ + dd->vld[i].mtu = hfi1_max_mtu; + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + dd->kernel_send_context[i + 1] = + sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->kernel_send_context[i + 1]) + goto nomem; + hfi1_init_ctxt(dd->kernel_send_context[i + 1]); + } + + sc_enable(dd->vld[15].sc); + ctxt = dd->vld[15].sc->hw_context; + mask = all_vl_mask & ~(1LL << 15); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + dd_dev_info(dd, + "Using send context %u(%u) for VL15\n", + dd->vld[15].sc->sw_index, ctxt); + + for (i = 0; i < num_vls; i++) { + sc_enable(dd->vld[i].sc); + ctxt = dd->vld[i].sc->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + sc_enable(dd->kernel_send_context[i + 1]); + ctxt = dd->kernel_send_context[i + 1]->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + + if (pio_map_init(dd, ppd->port - 1, num_vls, NULL)) + goto nomem; + return 0; + +nomem: + for (i = 0; i < num_vls; i++) { + sc_free(dd->vld[i].sc); + dd->vld[i].sc = NULL; + } + + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) + sc_free(dd->kernel_send_context[i + 1]); + + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; + +freesc15: + sc_free(dd->vld[15].sc); + return -ENOMEM; +} + +int init_credit_return(struct hfi1_devdata *dd) +{ + int ret; + int i; + + dd->cr_base = kcalloc( + node_affinity.num_possible_nodes, + sizeof(struct credit_return_base), + GFP_KERNEL); + if (!dd->cr_base) { + ret = -ENOMEM; + goto done; + } + for_each_node_with_cpus(i) { + int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return); + + set_dev_node(&dd->pcidev->dev, i); + dd->cr_base[i].va = dma_alloc_coherent(&dd->pcidev->dev, + bytes, + &dd->cr_base[i].dma, + GFP_KERNEL); + if (!dd->cr_base[i].va) { + set_dev_node(&dd->pcidev->dev, dd->node); + dd_dev_err(dd, + "Unable to allocate credit return DMA range for NUMA %d\n", + i); + ret = -ENOMEM; + goto done; + } + } + set_dev_node(&dd->pcidev->dev, dd->node); + + ret = 0; +done: + return ret; +} + +void free_credit_return(struct hfi1_devdata *dd) +{ + int i; + + if (!dd->cr_base) + return; + for (i = 0; i < node_affinity.num_possible_nodes; i++) { + if (dd->cr_base[i].va) { + dma_free_coherent(&dd->pcidev->dev, + TXE_NUM_CONTEXTS * + sizeof(struct credit_return), + dd->cr_base[i].va, + dd->cr_base[i].dma); + } + } + kfree(dd->cr_base); + dd->cr_base = NULL; +} + +void seqfile_dump_sci(struct seq_file *s, u32 i, + struct send_context_info *sci) +{ + struct send_context *sc = sci->sc; + u64 reg; + + seq_printf(s, "SCI %u: type %u base %u credits %u\n", + i, sci->type, sci->base, sci->credits); + seq_printf(s, " flags 0x%x sw_inx %u hw_ctxt %u grp %u\n", + sc->flags, sc->sw_index, sc->hw_context, sc->group); + seq_printf(s, " sr_size %u credits %u sr_head %u sr_tail %u\n", + sc->sr_size, sc->credits, sc->sr_head, sc->sr_tail); + seq_printf(s, " fill %lu free %lu fill_wrap %u alloc_free %lu\n", + sc->fill, sc->free, sc->fill_wrap, sc->alloc_free); + seq_printf(s, " credit_intr_count %u credit_ctrl 0x%llx\n", + sc->credit_intr_count, sc->credit_ctrl); + reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_STATUS)); + seq_printf(s, " *hw_free %llu CurrentFree %llu LastReturned %llu\n", + (le64_to_cpu(*sc->hw_free) & CR_COUNTER_SMASK) >> + CR_COUNTER_SHIFT, + (reg >> SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT)) & + SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK), + reg & SC(CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK)); +} diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h new file mode 100644 index 0000000000..ea714008f2 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -0,0 +1,293 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015-2017 Intel Corporation. + */ + +#ifndef _PIO_H +#define _PIO_H +/* send context types */ +#define SC_KERNEL 0 +#define SC_VL15 1 +#define SC_ACK 2 +#define SC_USER 3 /* must be the last one: it may take all left */ +#define SC_MAX 4 /* count of send context types */ + +/* invalid send context index */ +#define INVALID_SCI 0xff + +/* PIO buffer release callback function */ +typedef void (*pio_release_cb)(void *arg, int code); + +/* PIO release codes - in bits, as there could more than one that apply */ +#define PRC_OK 0 /* no known error */ +#define PRC_STATUS_ERR 0x01 /* credit return due to status error */ +#define PRC_PBC 0x02 /* credit return due to PBC */ +#define PRC_THRESHOLD 0x04 /* credit return due to threshold */ +#define PRC_FILL_ERR 0x08 /* credit return due fill error */ +#define PRC_FORCE 0x10 /* credit return due credit force */ +#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */ + +/* byte helper */ +union mix { + u64 val64; + u32 val32[2]; + u8 val8[8]; +}; + +/* an allocated PIO buffer */ +struct pio_buf { + struct send_context *sc;/* back pointer to owning send context */ + pio_release_cb cb; /* called when the buffer is released */ + void *arg; /* argument for cb */ + void __iomem *start; /* buffer start address */ + void __iomem *end; /* context end address */ + unsigned long sent_at; /* buffer is sent when <= free */ + union mix carry; /* pending unwritten bytes */ + u16 qw_written; /* QW written so far */ + u8 carry_bytes; /* number of valid bytes in carry */ +}; + +/* cache line aligned pio buffer array */ +union pio_shadow_ring { + struct pio_buf pbuf; +} ____cacheline_aligned; + +/* per-NUMA send context */ +struct send_context { + /* read-only after init */ + struct hfi1_devdata *dd; /* device */ + union pio_shadow_ring *sr; /* shadow ring */ + void __iomem *base_addr; /* start of PIO memory */ + u32 __percpu *buffers_allocated;/* count of buffers allocated */ + u32 size; /* context size, in bytes */ + + int node; /* context home node */ + u32 sr_size; /* size of the shadow ring */ + u16 flags; /* flags */ + u8 type; /* context type */ + u8 sw_index; /* software index number */ + u8 hw_context; /* hardware context number */ + u8 group; /* credit return group */ + + /* allocator fields */ + spinlock_t alloc_lock ____cacheline_aligned_in_smp; + u32 sr_head; /* shadow ring head */ + unsigned long fill; /* official alloc count */ + unsigned long alloc_free; /* copy of free (less cache thrash) */ + u32 fill_wrap; /* tracks fill within ring */ + u32 credits; /* number of blocks in context */ + /* adding a new field here would make it part of this cacheline */ + + /* releaser fields */ + spinlock_t release_lock ____cacheline_aligned_in_smp; + u32 sr_tail; /* shadow ring tail */ + unsigned long free; /* official free count */ + volatile __le64 *hw_free; /* HW free counter */ + /* list for PIO waiters */ + struct list_head piowait ____cacheline_aligned_in_smp; + seqlock_t waitlock; + + spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp; + u32 credit_intr_count; /* count of credit intr users */ + u64 credit_ctrl; /* cache for credit control */ + wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */ + struct work_struct halt_work; /* halted context work queue entry */ +}; + +/* send context flags */ +#define SCF_ENABLED 0x01 +#define SCF_IN_FREE 0x02 +#define SCF_HALTED 0x04 +#define SCF_FROZEN 0x08 +#define SCF_LINK_DOWN 0x10 + +struct send_context_info { + struct send_context *sc; /* allocated working context */ + u16 allocated; /* has this been allocated? */ + u16 type; /* context type */ + u16 base; /* base in PIO array */ + u16 credits; /* size in PIO array */ +}; + +/* DMA credit return, index is always (context & 0x7) */ +struct credit_return { + volatile __le64 cr[8]; +}; + +/* NUMA indexed credit return array */ +struct credit_return_base { + struct credit_return *va; + dma_addr_t dma; +}; + +/* send context configuration sizes (one per type) */ +struct sc_config_sizes { + short int size; + short int count; +}; + +/* + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform send contexts per vl, the + * number of send contexts for a vl is either the vl_scontexts[vl] or + * a computation based on num_kernel_send_contexts/num_vls: + * + * For example: + * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_kernel_send_contexts/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the send contexts are assigned + * in a round robin fashion wrapping back to the first send context + * for a particular vl. + * + * dd->pio_map + * | pio_map_elem[0] + * | +--------------------+ + * v | mask | + * pio_vl_map |--------------------| + * +--------------------------+ | ksc[0] -> sc 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| ksc[1] -> sc 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | ksc[n-1] -> sc n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +--------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |--------------------| + * | * | \-- | ksc[0] -> sc 1+n | + * | * | \---- |--------------------| + * | * | \->| ksc[1] -> sc 2+n | + * |--------------------------| |--------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |--------------------| + * \- | ksc[m-1] -> sc m+n | + * \ +--------------------+ + * \- + * \ + * \- +----------------------+ + * \- | mask | + * \ |----------------------| + * \- | ksc[0] -> sc 1+m+n | + * \- |----------------------| + * >| ksc[1] -> sc 2+m+n | + * |----------------------| + * | * | + * |----------------------| + * | ksc[o-1] -> sc o+m+n | + * +----------------------+ + * + */ + +/* Initial number of send contexts per VL */ +#define INIT_SC_PER_VL 2 + +/* + * struct pio_map_elem - mapping for a vl + * @mask - selector mask + * @ksc - array of kernel send contexts for this vl + * + * The mask is used to "mod" the selector to + * produce index into the trailing array of + * kscs + */ +struct pio_map_elem { + u32 mask; + struct send_context *ksc[]; +}; + +/* + * struct pio_vl_map - mapping for a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - numbers of vls rounded to next power of 2 + * @map - array of pio_map_elem entries + * + * This is the parent mapping structure. The trailing members of the + * struct point to pio_map_elem entries, which in turn point to an + * array of kscs for that vl. + */ +struct pio_vl_map { + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct pio_map_elem *map[]; +}; + +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, + u8 *vl_scontexts); +void free_pio_map(struct hfi1_devdata *dd); +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl); +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5); + +/* send context functions */ +int init_credit_return(struct hfi1_devdata *dd); +void free_credit_return(struct hfi1_devdata *dd); +int init_sc_pools_and_sizes(struct hfi1_devdata *dd); +int init_send_contexts(struct hfi1_devdata *dd); +int init_pervl_scs(struct hfi1_devdata *dd); +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa); +void sc_free(struct send_context *sc); +int sc_enable(struct send_context *sc); +void sc_disable(struct send_context *sc); +int sc_restart(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_flush(struct send_context *sc); +void sc_drop(struct send_context *sc); +void sc_stop(struct send_context *sc, int bit); +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg); +void sc_release_update(struct send_context *sc); +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); +void sc_add_credit_return_intr(struct send_context *sc); +void sc_del_credit_return_intr(struct send_context *sc); +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent); +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); +void sc_wait(struct hfi1_devdata *dd); +void set_pio_integrity(struct send_context *sc); + +/* support functions */ +void pio_reset_all(struct hfi1_devdata *dd); +void pio_freeze(struct hfi1_devdata *dd); +void pio_kernel_unfreeze(struct hfi1_devdata *dd); +void pio_kernel_linkup(struct hfi1_devdata *dd); + +/* global PIO send control operations */ +#define PSC_GLOBAL_ENABLE 0 +#define PSC_GLOBAL_DISABLE 1 +#define PSC_GLOBAL_VLARB_ENABLE 2 +#define PSC_GLOBAL_VLARB_DISABLE 3 +#define PSC_CM_RESET 4 +#define PSC_DATA_VL_ENABLE 5 +#define PSC_DATA_VL_DISABLE 6 + +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl); +void pio_send_control(struct hfi1_devdata *dd, int op); + +/* PIO copy routines */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count); +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes); +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes); +void seg_pio_copy_end(struct pio_buf *pbuf); + +void seqfile_dump_sci(struct seq_file *s, u32 i, + struct send_context_info *sci); + +#endif /* _PIO_H */ diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c new file mode 100644 index 0000000000..7690f996d5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio_copy.c @@ -0,0 +1,715 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include "hfi.h" + +/* additive distance between non-SOP and SOP space */ +#define SOP_DISTANCE (TXE_PIO_SIZE / 2) +#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1) +/* number of QUADWORDs in a block */ +#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64)) + +/** + * pio_copy - copy data block to MMIO space + * @dd: hfi1 dev data + * @pbuf: a number of blocks allocated within a PIO send context + * @pbc: PBC to send + * @from: source, must be 8 byte aligned + * @count: number of DWORD (32-bit) quantities to copy from source + * + * Copy data from source to PIO Send Buffer memory, 8 bytes at a time. + * Must always write full BLOCK_SIZE bytes blocks. The first block must + * be written to the corresponding SOP=1 address. + * + * Known: + * o pbuf->start always starts on a block boundary + * o pbuf can wrap only at a block boundary + */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + /* write the PBC */ + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((count >> 1) * sizeof(u64)); + + if (dend < send) { + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* write dangling u32, if any */ + if (count & 1) { + union mix val; + + val.val64 = 0; + val.val32[0] = *(u32 *)from; + writeq(val.val64, dest); + dest += sizeof(u64); + } + /* + * fill in rest of block, no need to check pbuf->end + * as we only wrap on a block boundary + */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + this_cpu_dec(*pbuf->sc->buffers_allocated); + preempt_enable(); +} + +/* + * Handle carry bytes using shifts and masks. + * + * NOTE: the value the unused portion of carry is expected to always be zero. + */ + +/* + * "zero" shift - bit shift used to zero out upper bytes. Input is + * the count of LSB bytes to preserve. + */ +#define zshift(x) (8 * (8 - (x))) + +/* + * "merge" shift - bit shift used to merge with carry bytes. Input is + * the LSB byte count to move beyond. + */ +#define mshift(x) (8 * (x)) + +/* + * Jump copy - no-loop copy for < 8 bytes. + */ +static inline void jcopy(u8 *dest, const u8 *src, u32 n) +{ + switch (n) { + case 7: + *dest++ = *src++; + fallthrough; + case 6: + *dest++ = *src++; + fallthrough; + case 5: + *dest++ = *src++; + fallthrough; + case 4: + *dest++ = *src++; + fallthrough; + case 3: + *dest++ = *src++; + fallthrough; + case 2: + *dest++ = *src++; + fallthrough; + case 1: + *dest++ = *src++; + } +} + +/* + * Read nbytes from "from" and place them in the low bytes + * of pbuf->carry. Other bytes are left as-is. Any previous + * value in pbuf->carry is lost. + * + * NOTES: + * o do not read from from if nbytes is zero + * o from may _not_ be u64 aligned. + */ +static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, + unsigned int nbytes) +{ + pbuf->carry.val64 = 0; + jcopy(&pbuf->carry.val8[0], from, nbytes); + pbuf->carry_bytes = nbytes; +} + +/* + * Read nbytes bytes from "from" and put them at the end of pbuf->carry. + * It is expected that the extra read does not overfill carry. + * + * NOTES: + * o from may _not_ be u64 aligned + * o nbytes may span a QW boundary + */ +static inline void read_extra_bytes(struct pio_buf *pbuf, + const void *from, unsigned int nbytes) +{ + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes); + pbuf->carry_bytes += nbytes; +} + +/* + * Write a quad word using parts of pbuf->carry and the next 8 bytes of src. + * Put the unused part of the next 8 bytes of src into the LSB bytes of + * pbuf->carry with the upper bytes zeroed.. + * + * NOTES: + * o result must keep unused bytes zeroed + * o src must be u64 aligned + */ +static inline void merge_write8( + struct pio_buf *pbuf, + void __iomem *dest, + const void *src) +{ + u64 new, temp; + + new = *(u64 *)src; + temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes)); + writeq(temp, dest); + pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes); +} + +/* + * Write a quad word using all bytes of carry. + */ +static inline void carry8_write8(union mix carry, void __iomem *dest) +{ + writeq(carry.val64, dest); +} + +/* + * Write a quad word using all the valid bytes of carry. If carry + * has zero valid bytes, nothing is written. + * Returns 0 on nothing written, non-zero on quad word written. + */ +static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest) +{ + if (pbuf->carry_bytes) { + /* unused bytes are always kept zeroed, so just write */ + writeq(pbuf->carry.val64, dest); + return 1; + } + + return 0; +} + +/* + * Segmented PIO Copy - start + * + * Start a PIO copy. + * + * @pbuf: destination buffer + * @pbc: the PBC for the PIO buffer + * @from: data source, QWORD aligned + * @nbytes: bytes to copy + */ +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((nbytes >> 3) * sizeof(u64)); + + if (dend < send) { + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* ...but it doesn't matter as we're done writing */ + + /* save dangling bytes, if any */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3); +} + +/* + * Mid copy helper, "mixed case" - source is 64-bit aligned but carry + * bytes are non-zero. + * + * Whole u64s must be written to the chip, so bytes must be manually merged. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned. + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + unsigned long qw_to_write = nbytes >> 3; + unsigned long bytes_left = nbytes & 0x7; + + /* calculate 8-byte data end */ + dend = dest + (qw_to_write * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* + * calculate the end of data or end of block, whichever + * comes first + */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = min(send, dend); + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + pbuf->qw_written += qw_to_write; + + /* handle carry and left-over bytes */ + if (pbuf->carry_bytes + bytes_left >= 8) { + unsigned long nread; + + /* there is enough to fill another qw - fill carry */ + nread = 8 - pbuf->carry_bytes; + read_extra_bytes(pbuf, from, nread); + + /* + * One more write - but need to make sure dest is correct. + * Check for wrap and the possibility the write + * should be in SOP space. + * + * The two checks immediately below cannot both be true, hence + * the else. If we have wrapped, we cannot still be within the + * first block. Conversely, if we are still in the first block, + * we cannot have wrapped. We do the wrap check first as that + * is more likely. + */ + /* adjust if we have wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->sc->size; + /* jump to the SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + /* flush out full carry */ + carry8_write8(pbuf->carry, dest); + pbuf->qw_written++; + + /* now adjust and read the rest of the bytes into carry */ + bytes_left -= nread; + from += nread; /* from is now not aligned */ + read_low_bytes(pbuf, from, bytes_left); + } else { + /* not enough to fill another qw, append the rest to carry */ + read_extra_bytes(pbuf, from, bytes_left); + } +} + +/* + * Mid copy helper, "straight case" - source pointer is 64-bit aligned + * with no carry bytes. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_straight(struct pio_buf *pbuf, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + + /* calculate 8-byte data end */ + dend = dest + ((nbytes >> 3) * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* + * calculate the end of data or end of block, whichever + * comes first + */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = min(send, dend); + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* we know carry_bytes was zero on entry to this routine */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written += nbytes >> 3; +} + +/* + * Segmented PIO Copy - middle + * + * Must handle any aligned tail and any aligned source with any byte count. + * + * @pbuf: a number of blocks allocated within a PIO send context + * @from: data source + * @nbytes: number of bytes to copy + */ +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + unsigned long from_align = (unsigned long)from & 0x7; + + if (pbuf->carry_bytes + nbytes < 8) { + /* not enough bytes to fill a QW */ + read_extra_bytes(pbuf, from, nbytes); + return; + } + + if (from_align) { + /* misaligned source pointer - align it */ + unsigned long to_align; + + /* bytes to read to align "from" */ + to_align = 8 - from_align; + + /* + * In the advance-to-alignment logic below, we do not need + * to check if we are using more than nbytes. This is because + * if we are here, we already know that carry+nbytes will + * fill at least one QW. + */ + if (pbuf->carry_bytes + to_align < 8) { + /* not enough align bytes to fill a QW */ + read_extra_bytes(pbuf, from, to_align); + from += to_align; + nbytes -= to_align; + } else { + /* bytes to fill carry */ + unsigned long to_fill = 8 - pbuf->carry_bytes; + /* bytes left over to be read */ + unsigned long extra = to_align - to_fill; + void __iomem *dest; + + /* fill carry... */ + read_extra_bytes(pbuf, from, to_fill); + from += to_fill; + nbytes -= to_fill; + /* may not be enough valid bytes left to align */ + if (extra > nbytes) + extra = nbytes; + + /* ...now write carry */ + dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be + * true, hence the else. If we have wrapped, we + * cannot still be within the first block. + * Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first + * as that is more likely. + */ + /* adjust if we've wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->sc->size; + /* jump to SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + carry8_write8(pbuf->carry, dest); + pbuf->qw_written++; + + /* read any extra bytes to do final alignment */ + /* this will overwrite anything in pbuf->carry */ + read_low_bytes(pbuf, from, extra); + from += extra; + nbytes -= extra; + /* + * If no bytes are left, return early - we are done. + * NOTE: This short-circuit is *required* because + * "extra" may have been reduced in size and "from" + * is not aligned, as required when leaving this + * if block. + */ + if (nbytes == 0) + return; + } + + /* at this point, from is QW aligned */ + } + + if (pbuf->carry_bytes) + mid_copy_mix(pbuf, from, nbytes); + else + mid_copy_straight(pbuf, from, nbytes); +} + +/* + * Segmented PIO Copy - end + * + * Write any remainder (in pbuf->carry) and finish writing the whole block. + * + * @pbuf: a number of blocks allocated within a PIO send context + */ +void seg_pio_copy_end(struct pio_buf *pbuf) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be true, hence the + * else. If we have wrapped, we cannot still be within the first + * block. Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first as that is + * more likely. + */ + /* adjust if we have wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->sc->size; + /* jump to the SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + /* write final bytes, if any */ + if (carry_write8(pbuf, dest)) { + dest += sizeof(u64); + /* + * NOTE: We do not need to recalculate whether dest needs + * SOP_DISTANCE or not. + * + * If we are in the first block and the dangle write + * keeps us in the same block, dest will need + * to retain SOP_DISTANCE in the loop below. + * + * If we are in the first block and the dangle write pushes + * us to the next block, then loop below will not run + * and dest is not used. Hence we do not need to update + * it. + * + * If we are past the first block, then SOP_DISTANCE + * was never added, so there is nothing to do. + */ + } + + /* fill in rest of block */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + this_cpu_dec(*pbuf->sc->buffers_allocated); + preempt_enable(); +} diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c new file mode 100644 index 0000000000..54cbd8f1a6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -0,0 +1,1035 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include <linux/firmware.h> + +#include "hfi.h" +#include "efivar.h" +#include "eprom.h" + +#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat" + +static int validate_scratch_checksum(struct hfi1_devdata *dd) +{ + u64 checksum = 0, temp_scratch = 0; + int i, j, version; + + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH); + version = (temp_scratch & BITMAP_VERSION_SMASK) >> BITMAP_VERSION_SHIFT; + + /* Prevent power on default of all zeroes from passing checksum */ + if (!version) { + dd_dev_err(dd, "%s: Config bitmap uninitialized\n", __func__); + dd_dev_err(dd, + "%s: Please update your BIOS to support active channels\n", + __func__); + return 0; + } + + /* + * ASIC scratch 0 only contains the checksum and bitmap version as + * fields of interest, both of which are handled separately from the + * loop below, so skip it + */ + checksum += version; + for (i = 1; i < ASIC_NUM_SCRATCH; i++) { + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH + (8 * i)); + for (j = sizeof(u64); j != 0; j -= 2) { + checksum += (temp_scratch & 0xFFFF); + temp_scratch >>= 16; + } + } + + while (checksum >> 16) + checksum = (checksum & CHECKSUM_MASK) + (checksum >> 16); + + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH); + temp_scratch &= CHECKSUM_SMASK; + temp_scratch >>= CHECKSUM_SHIFT; + + if (checksum + temp_scratch == 0xFFFF) + return 1; + + dd_dev_err(dd, "%s: Configuration bitmap corrupted\n", __func__); + return 0; +} + +static void save_platform_config_fields(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 temp_scratch = 0, temp_dest = 0; + + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH_1); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_PORT_TYPE_SMASK : + PORT0_PORT_TYPE_SMASK); + ppd->port_type = temp_dest >> + (dd->hfi1_id ? PORT1_PORT_TYPE_SHIFT : + PORT0_PORT_TYPE_SHIFT); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_LOCAL_ATTEN_SMASK : + PORT0_LOCAL_ATTEN_SMASK); + ppd->local_atten = temp_dest >> + (dd->hfi1_id ? PORT1_LOCAL_ATTEN_SHIFT : + PORT0_LOCAL_ATTEN_SHIFT); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_REMOTE_ATTEN_SMASK : + PORT0_REMOTE_ATTEN_SMASK); + ppd->remote_atten = temp_dest >> + (dd->hfi1_id ? PORT1_REMOTE_ATTEN_SHIFT : + PORT0_REMOTE_ATTEN_SHIFT); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_DEFAULT_ATTEN_SMASK : + PORT0_DEFAULT_ATTEN_SMASK); + ppd->default_atten = temp_dest >> + (dd->hfi1_id ? PORT1_DEFAULT_ATTEN_SHIFT : + PORT0_DEFAULT_ATTEN_SHIFT); + + temp_scratch = read_csr(dd, dd->hfi1_id ? ASIC_CFG_SCRATCH_3 : + ASIC_CFG_SCRATCH_2); + + ppd->tx_preset_eq = (temp_scratch & TX_EQ_SMASK) >> TX_EQ_SHIFT; + ppd->tx_preset_noeq = (temp_scratch & TX_NO_EQ_SMASK) >> TX_NO_EQ_SHIFT; + ppd->rx_preset = (temp_scratch & RX_SMASK) >> RX_SHIFT; + + ppd->max_power_class = (temp_scratch & QSFP_MAX_POWER_SMASK) >> + QSFP_MAX_POWER_SHIFT; + + ppd->config_from_scratch = true; +} + +void get_platform_config(struct hfi1_devdata *dd) +{ + int ret = 0; + u8 *temp_platform_config = NULL; + u32 esize; + const struct firmware *platform_config_file = NULL; + + if (is_integrated(dd)) { + if (validate_scratch_checksum(dd)) { + save_platform_config_fields(dd); + return; + } + } else { + ret = eprom_read_platform_config(dd, + (void **)&temp_platform_config, + &esize); + if (!ret) { + /* success */ + dd->platform_config.data = temp_platform_config; + dd->platform_config.size = esize; + return; + } + } + dd_dev_err(dd, + "%s: Failed to get platform config, falling back to sub-optimal default file\n", + __func__); + + ret = request_firmware(&platform_config_file, + DEFAULT_PLATFORM_CONFIG_NAME, + &dd->pcidev->dev); + if (ret) { + dd_dev_err(dd, + "%s: No default platform config file found\n", + __func__); + return; + } + + /* + * Allocate separate memory block to store data and free firmware + * structure. This allows free_platform_config to treat EPROM and + * fallback configs in the same manner. + */ + dd->platform_config.data = kmemdup(platform_config_file->data, + platform_config_file->size, + GFP_KERNEL); + dd->platform_config.size = platform_config_file->size; + release_firmware(platform_config_file); +} + +void free_platform_config(struct hfi1_devdata *dd) +{ + /* Release memory allocated for eprom or fallback file read. */ + kfree(dd->platform_config.data); + dd->platform_config.data = NULL; +} + +void get_port_type(struct hfi1_pportdata *ppd) +{ + int ret; + u32 temp; + + ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_PORT_TYPE, &temp, + 4); + if (ret) { + ppd->port_type = PORT_TYPE_UNKNOWN; + return; + } + ppd->port_type = temp; +} + +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on) +{ + u8 tx_ctrl_byte = on ? 0x0 : 0xF; + int ret = 0; + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS, + &tx_ctrl_byte, 1); + /* we expected 1, so consider 0 an error */ + if (ret == 0) + ret = -EIO; + else if (ret == 1) + ret = 0; + return ret; +} + +static int qual_power(struct hfi1_pportdata *ppd) +{ + u32 cable_power_class = 0, power_class_max = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret = 0; + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4); + if (ret) + return ret; + + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class > power_class_max) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { + dd_dev_err( + ppd->dd, + "%s: Port disabled due to system power restrictions\n", + __func__); + ret = -EPERM; + } + return ret; +} + +static int qual_bitrate(struct hfi1_pportdata *ppd) +{ + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) && + cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) && + cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) { + dd_dev_err( + ppd->dd, + "%s: Cable failed bitrate check, disabling port\n", + __func__); + return -EPERM; + } + return 0; +} + +static int set_qsfp_high_power(struct hfi1_pportdata *ppd) +{ + u8 cable_power_class = 0, power_ctrl_byte = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret; + + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class > QSFP_POWER_CLASS_1) { + power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS]; + + power_ctrl_byte |= 1; + power_ctrl_byte &= ~(0x2); + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + + if (cable_power_class > QSFP_POWER_CLASS_4) { + power_ctrl_byte |= (1 << 2); + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + } + + /* SFF 8679 rev 1.7 LPMode Deassert time */ + msleep(300); + } + return 0; +} + +static void apply_rx_cdr(struct hfi1_pportdata *ppd, + u32 rx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 rx_preset; + u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) && + (cache[QSFP_CDR_INFO_OFFS] & 0x40))) + return; + + /* RX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn RX CDR on */ + *cdr_ctrl_byte |= 0xF; + return; + } + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR, + &rx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + rx_preset = (rx_preset | (rx_preset << 1) | + (rx_preset << 2) | (rx_preset << 3)); + + if (rx_preset) { + *cdr_ctrl_byte |= rx_preset; + } else { + *cdr_ctrl_byte &= rx_preset; + /* Preserve current TX CDR status */ + *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0); + } +} + +static void apply_tx_cdr(struct hfi1_pportdata *ppd, + u32 tx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 tx_preset; + u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) && + (cache[QSFP_CDR_INFO_OFFS] & 0x80))) + return; + + /* TX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn TX CDR on */ + *cdr_ctrl_byte |= 0xF0; + return; + } + + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4); + + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + tx_preset = (tx_preset | (tx_preset << 1) | + (tx_preset << 2) | (tx_preset << 3)); + + if (tx_preset) + *cdr_ctrl_byte |= (tx_preset << 4); + else + /* Preserve current/determined RX CDR status */ + *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF); +} + +static void apply_cdr_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + + apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte); + + apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte); + + qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, + &cdr_ctrl_byte, 1); +} + +static void apply_tx_eq_auto(struct hfi1_pportdata *ppd) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8)) + return; + /* Disable adaptive TX EQ if present */ + tx_eq = cache[(128 * 3) + 241]; + tx_eq &= 0xF0; + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1); +} + +static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u32 tx_preset; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4)) + return; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + &tx_preset, 4); + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_EQ_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ, + &tx_preset, 4); + + if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX EQ %x unsupported\n", + __func__, tx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying EQ %x\n", + __func__, cache[608] & 0xF0); + + tx_preset = (cache[608] & 0xF0) >> 4; + } + + tx_eq = tx_preset | (tx_preset << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1); +} + +static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index) +{ + u32 rx_preset; + u8 rx_eq, *cache = ppd->qsfp_info.cache; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2)) + return; + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_EMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP, + &rx_preset, 4); + + if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) { + dd_dev_info( + ppd->dd, + "%s: Requested RX EMP %x\n", + __func__, rx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying supported EMP %x\n", + __func__, cache[608] & 0xF); + + rx_preset = cache[608] & 0xF; + } + + rx_eq = rx_preset | (rx_preset << 4); + + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1); +} + +static void apply_eq_settings(struct hfi1_pportdata *ppd, + u32 rx_preset_index, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + + apply_tx_eq_auto(ppd); + + apply_tx_eq_prog(ppd, tx_preset_index); + + apply_rx_eq_emp(ppd, rx_preset_index); +} + +static void apply_rx_amplitude_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u32 rx_preset; + u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP, + &rx_preset, 4); + + dd_dev_info(ppd->dd, + "%s: Requested RX AMP %x\n", + __func__, + rx_preset); + + for (i = 0; i < 4; i++) { + if (cache[(128 * 3) + 225] & (1 << i)) { + preferred = i; + if (preferred == rx_preset) + break; + } + } + + /* + * Verify that preferred RX amplitude is not just a + * fall through of the default + */ + if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) { + dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n"); + return; + } + + dd_dev_info(ppd->dd, + "%s: Applying RX AMP %x\n", __func__, preferred); + + rx_amp = preferred | (preferred << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1); +} + +#define OPA_INVALID_INDEX 0xFFF + +static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id, + u32 config_data, const char *message) +{ + u8 i; + int ret; + + for (i = 0; i < 4; i++) { + ret = load_8051_config(ppd->dd, field_id, i, config_data); + if (ret != HCMD_SUCCESS) { + dd_dev_err( + ppd->dd, + "%s: %s for lane %u failed\n", + message, __func__, i); + } + } +} + +/* + * Return a special SerDes setting for low power AOC cables. The power class + * threshold and setting being used were all found by empirical testing. + * + * Summary of the logic: + * + * if (QSFP and QSFP_TYPE == AOC and QSFP_POWER_CLASS < 4) + * return 0xe + * return 0; // leave at default + */ +static u8 aoc_low_power_setting(struct hfi1_pportdata *ppd) +{ + u8 *cache = ppd->qsfp_info.cache; + int power_class; + + /* QSFP only */ + if (ppd->port_type != PORT_TYPE_QSFP) + return 0; /* leave at default */ + + /* active optical cables only */ + switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) { + case 0x0 ... 0x9: fallthrough; + case 0xC: fallthrough; + case 0xE: + /* active AOC */ + power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + if (power_class < QSFP_POWER_CLASS_4) + return 0xe; + } + return 0; /* leave at default */ +} + +static void apply_tunings( + struct hfi1_pportdata *ppd, u32 tx_preset_index, + u8 tuning_method, u32 total_atten, u8 limiting_active) +{ + int ret = 0; + u32 config_data = 0, tx_preset = 0; + u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0; + u8 *cache = ppd->qsfp_info.cache; + + /* Pass tuning method to 8051 */ + read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + &config_data); + config_data &= ~(0xff << TUNING_METHOD_SHIFT); + config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT); + ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n", + __func__); + + /* Set same channel loss for both TX and RX */ + config_data = 0 | (total_atten << 16) | (total_atten << 24); + apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data, + "Setting channel loss"); + + /* Inform 8051 of cable capabilities */ + if (ppd->qsfp_info.cache_valid) { + external_device_config = + ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) | + ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) | + ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) | + (cache[QSFP_EQ_INFO_OFFS] & 0x4); + ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, &config_data); + /* Clear, then set the external device config field */ + config_data &= ~(u32)0xFF; + config_data |= external_device_config; + ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err(ppd->dd, + "%s: Failed set ext device config params\n", + __func__); + } + + if (tx_preset_index == OPA_INVALID_INDEX) { + if (ppd->port_type == PORT_TYPE_QSFP && limiting_active) + dd_dev_err(ppd->dd, "%s: Invalid Tx preset index\n", + __func__); + return; + } + + /* Following for limiting active channels only */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_PRECUR, &tx_preset, 4); + precur = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4); + attn = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4); + postcur = tx_preset; + + /* + * NOTES: + * o The aoc_low_power_setting is applied to all lanes even + * though only lane 0's value is examined by the firmware. + * o A lingering low power setting after a cable swap does + * not occur. On cable unplug the 8051 is reset and + * restarted on cable insert. This resets all settings to + * their default, erasing any previous low power setting. + */ + config_data = precur | (attn << 8) | (postcur << 16) | + (aoc_low_power_setting(ppd) << 24); + + apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data, + "Applying TX settings"); +} + +/* Must be holding the QSFP i2c resource */ +static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, + u32 *ptr_rx_preset, u32 *ptr_total_atten) +{ + int ret; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + ppd->qsfp_info.limiting_active = 1; + + ret = set_qsfp_tx(ppd, 0); + if (ret) + return ret; + + ret = qual_power(ppd); + if (ret) + return ret; + + ret = qual_bitrate(ppd); + if (ret) + return ret; + + /* + * We'll change the QSFP memory contents from here on out, thus we set a + * flag here to remind ourselves to reset the QSFP module. This prevents + * reuse of stale settings established in our previous pass through. + */ + if (ppd->qsfp_info.reset_needed) { + ret = reset_qsfp(ppd); + if (ret) + return ret; + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + } else { + ppd->qsfp_info.reset_needed = 1; + } + + ret = set_qsfp_high_power(ppd); + if (ret) + return ret; + + if (cache[QSFP_EQ_INFO_OFFS] & 0x4) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + return ret; + } + } else { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + return ret; + } + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4); + if (ret) { + *ptr_rx_preset = OPA_INVALID_INDEX; + return ret; + } + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4); + else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4); + + apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + ret = set_qsfp_tx(ppd, 1); + + return ret; +} + +static int tune_qsfp(struct hfi1_pportdata *ppd, + u32 *ptr_tx_preset, u32 *ptr_rx_preset, + u8 *ptr_tuning_method, u32 *ptr_total_atten) +{ + u32 cable_atten = 0, remote_atten = 0, platform_atten = 0; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + int ret = 0; + u8 *cache = ppd->qsfp_info.cache; + + switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) { + case 0xA ... 0xB: + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + if (ret) + return ret; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS]; + else if ((lss & OPA_LINK_SPEED_12_5G) && + (lse & OPA_LINK_SPEED_12_5G)) + cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS]; + + /* Fallback to configured attenuation if cable memory is bad */ + if (cable_atten == 0 || cable_atten > 36) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + &cable_atten, 4); + if (ret) + return ret; + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + if (ret) + return ret; + + *ptr_total_atten = platform_atten + cable_atten + remote_atten; + + *ptr_tuning_method = OPA_PASSIVE_TUNING; + break; + case 0x0 ... 0x9: fallthrough; + case 0xC: fallthrough; + case 0xE: + ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset, + ptr_total_atten); + if (ret) + return ret; + + *ptr_tuning_method = OPA_ACTIVE_TUNING; + break; + case 0xD: fallthrough; + case 0xF: + default: + dd_dev_warn(ppd->dd, "%s: Unknown/unsupported cable\n", + __func__); + break; + } + return ret; +} + +/* + * This function communicates its success or failure via ppd->driver_link_ready + * Thus, it depends on its association with start_link(...) which checks + * driver_link_ready before proceeding with the link negotiation and + * initialization process. + */ +void tune_serdes(struct hfi1_pportdata *ppd) +{ + int ret = 0; + u32 total_atten = 0; + u32 remote_atten = 0, platform_atten = 0; + u32 rx_preset_index, tx_preset_index; + u8 tuning_method = 0, limiting_active = 0; + struct hfi1_devdata *dd = ppd->dd; + + rx_preset_index = OPA_INVALID_INDEX; + tx_preset_index = OPA_INVALID_INDEX; + + /* the link defaults to enabled */ + ppd->link_enabled = 1; + /* the driver link ready state defaults to not ready */ + ppd->driver_link_ready = 0; + ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* Skip the tuning for testing (loopback != none) and simulations */ + if (loopback != LOOPBACK_NONE || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + ppd->driver_link_ready = 1; + + if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } + + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + } + + return; + } + + switch (ppd->port_type) { + case PORT_TYPE_DISCONNECTED: + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED); + dd_dev_warn(dd, "%s: Port disconnected, disabling port\n", + __func__); + goto bail; + case PORT_TYPE_FIXED: + /* platform_atten, remote_atten pre-zeroed to catch error */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + break; + case PORT_TYPE_VARIABLE: + if (qsfp_mod_present(ppd)) { + /* + * platform_atten, remote_atten pre-zeroed to + * catch error + */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, + &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + } else { + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG); + goto bail; + } + break; + case PORT_TYPE_QSFP: + if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + + if (ppd->qsfp_info.cache_valid) { + ret = tune_qsfp(ppd, + &tx_preset_index, + &rx_preset_index, + &tuning_method, + &total_atten); + + /* + * We may have modified the QSFP memory, so + * update the cache to reflect the changes + */ + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + limiting_active = + ppd->qsfp_info.limiting_active; + } else { + dd_dev_err(dd, + "%s: Reading QSFP memory failed\n", + __func__); + ret = -EINVAL; /* a fail indication */ + } + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + if (ret) + goto bail; + } else { + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + goto bail; + } + break; + default: + dd_dev_warn(ppd->dd, "%s: Unknown port type\n", __func__); + ppd->port_type = PORT_TYPE_UNKNOWN; + tuning_method = OPA_UNKNOWN_TUNING; + total_atten = 0; + limiting_active = 0; + tx_preset_index = OPA_INVALID_INDEX; + break; + } + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + apply_tunings(ppd, tx_preset_index, tuning_method, + total_atten, limiting_active); + + if (!ret) + ppd->driver_link_ready = 1; + + return; +bail: + ppd->driver_link_ready = 0; +} diff --git a/drivers/infiniband/hw/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h new file mode 100644 index 0000000000..1d51dca1bc --- /dev/null +++ b/drivers/infiniband/hw/hfi1/platform.h @@ -0,0 +1,371 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#ifndef __PLATFORM_H +#define __PLATFORM_H + +#define METADATA_TABLE_FIELD_START_SHIFT 0 +#define METADATA_TABLE_FIELD_START_LEN_BITS 15 +#define METADATA_TABLE_FIELD_LEN_SHIFT 16 +#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16 + +/* Header structure */ +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0 +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4 + +enum platform_config_table_type_encoding { + PLATFORM_CONFIG_TABLE_RESERVED, + PLATFORM_CONFIG_SYSTEM_TABLE, + PLATFORM_CONFIG_PORT_TABLE, + PLATFORM_CONFIG_RX_PRESET_TABLE, + PLATFORM_CONFIG_TX_PRESET_TABLE, + PLATFORM_CONFIG_QSFP_ATTEN_TABLE, + PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE, + PLATFORM_CONFIG_TABLE_MAX +}; + +enum platform_config_system_table_fields { + SYSTEM_TABLE_RESERVED, + SYSTEM_TABLE_NODE_STRING, + SYSTEM_TABLE_SYSTEM_IMAGE_GUID, + SYSTEM_TABLE_NODE_GUID, + SYSTEM_TABLE_REVISION, + SYSTEM_TABLE_VENDOR_OUI, + SYSTEM_TABLE_META_VERSION, + SYSTEM_TABLE_DEVICE_ID, + SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT, + SYSTEM_TABLE_MAX +}; + +enum platform_config_port_table_fields { + PORT_TABLE_RESERVED, + PORT_TABLE_PORT_TYPE, + PORT_TABLE_LOCAL_ATTEN_12G, + PORT_TABLE_LOCAL_ATTEN_25G, + PORT_TABLE_LINK_SPEED_SUPPORTED, + PORT_TABLE_LINK_WIDTH_SUPPORTED, + PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED, + PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED, + PORT_TABLE_VL_CAP, + PORT_TABLE_MTU_CAP, + PORT_TABLE_TX_LANE_ENABLE_MASK, + PORT_TABLE_LOCAL_MAX_TIMEOUT, + PORT_TABLE_REMOTE_ATTEN_12G, + PORT_TABLE_REMOTE_ATTEN_25G, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + PORT_TABLE_RX_PRESET_IDX, + PORT_TABLE_CABLE_REACH_CLASS, + PORT_TABLE_MAX +}; + +enum platform_config_rx_preset_table_fields { + RX_PRESET_TABLE_RESERVED, + RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_CDR, + RX_PRESET_TABLE_QSFP_RX_EMP, + RX_PRESET_TABLE_QSFP_RX_AMP, + RX_PRESET_TABLE_MAX +}; + +enum platform_config_tx_preset_table_fields { + TX_PRESET_TABLE_RESERVED, + TX_PRESET_TABLE_PRECUR, + TX_PRESET_TABLE_ATTN, + TX_PRESET_TABLE_POSTCUR, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, + TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + TX_PRESET_TABLE_QSFP_TX_CDR, + TX_PRESET_TABLE_QSFP_TX_EQ, + TX_PRESET_TABLE_MAX +}; + +enum platform_config_qsfp_attn_table_fields { + QSFP_ATTEN_TABLE_RESERVED, + QSFP_ATTEN_TABLE_TX_PRESET_IDX, + QSFP_ATTEN_TABLE_RX_PRESET_IDX, + QSFP_ATTEN_TABLE_MAX +}; + +enum platform_config_variable_settings_table_fields { + VARIABLE_SETTINGS_TABLE_RESERVED, + VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +struct platform_config { + size_t size; + const u8 *data; +}; + +struct platform_config_data { + u32 *table; + u32 *table_metadata; + u32 num_table; +}; + +/* + * This struct acts as a quick reference into the platform_data binary image + * and is populated by parse_platform_config(...) depending on the specific + * META_VERSION + */ +struct platform_config_cache { + u8 cache_valid; + struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX]; +}; + +/* This section defines default values and encodings for the + * fields defined for each table above + */ + +/* + * ===================================================== + * System table encodings + * ===================================================== + */ +#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041 +#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4 + +/* + * These power classes are the same as defined in SFF 8636 spec rev 2.4 + * describing byte 129 in table 6-16, except enumerated in a different order + */ +enum platform_config_qsfp_power_class_encoding { + QSFP_POWER_CLASS_1 = 1, + QSFP_POWER_CLASS_2, + QSFP_POWER_CLASS_3, + QSFP_POWER_CLASS_4, + QSFP_POWER_CLASS_5, + QSFP_POWER_CLASS_6, + QSFP_POWER_CLASS_7 +}; + +/* + * ==================================================== + * Port table encodings + * ==================================================== + */ +enum platform_config_port_type_encoding { + PORT_TYPE_UNKNOWN, + PORT_TYPE_DISCONNECTED, + PORT_TYPE_FIXED, + PORT_TYPE_VARIABLE, + PORT_TYPE_QSFP, + PORT_TYPE_MAX +}; + +enum platform_config_link_speed_supported_encoding { + LINK_SPEED_SUPP_12G = 1, + LINK_SPEED_SUPP_25G, + LINK_SPEED_SUPP_12G_25G, + LINK_SPEED_SUPP_MAX +}; + +/* + * This is a subset (not strict) of the link downgrades + * supported. The link downgrades supported are expected + * to be supplied to the driver by another entity such as + * the fabric manager + */ +enum platform_config_link_width_supported_encoding { + LINK_WIDTH_SUPP_1X = 1, + LINK_WIDTH_SUPP_2X, + LINK_WIDTH_SUPP_2X_1X, + LINK_WIDTH_SUPP_3X, + LINK_WIDTH_SUPP_3X_1X, + LINK_WIDTH_SUPP_3X_2X, + LINK_WIDTH_SUPP_3X_2X_1X, + LINK_WIDTH_SUPP_4X, + LINK_WIDTH_SUPP_4X_1X, + LINK_WIDTH_SUPP_4X_2X, + LINK_WIDTH_SUPP_4X_2X_1X, + LINK_WIDTH_SUPP_4X_3X, + LINK_WIDTH_SUPP_4X_3X_1X, + LINK_WIDTH_SUPP_4X_3X_2X, + LINK_WIDTH_SUPP_4X_3X_2X_1X, + LINK_WIDTH_SUPP_MAX +}; + +enum platform_config_virtual_lane_capability_encoding { + VL_CAP_VL0 = 1, + VL_CAP_VL0_1, + VL_CAP_VL0_2, + VL_CAP_VL0_3, + VL_CAP_VL0_4, + VL_CAP_VL0_5, + VL_CAP_VL0_6, + VL_CAP_VL0_7, + VL_CAP_VL0_8, + VL_CAP_VL0_9, + VL_CAP_VL0_10, + VL_CAP_VL0_11, + VL_CAP_VL0_12, + VL_CAP_VL0_13, + VL_CAP_VL0_14, + VL_CAP_MAX +}; + +/* Max MTU */ +enum platform_config_mtu_capability_encoding { + MTU_CAP_256 = 1, + MTU_CAP_512 = 2, + MTU_CAP_1024 = 3, + MTU_CAP_2048 = 4, + MTU_CAP_4096 = 5, + MTU_CAP_8192 = 6, + MTU_CAP_10240 = 7 +}; + +enum platform_config_local_max_timeout_encoding { + LOCAL_MAX_TIMEOUT_10_MS = 1, + LOCAL_MAX_TIMEOUT_100_MS, + LOCAL_MAX_TIMEOUT_1_S, + LOCAL_MAX_TIMEOUT_10_S, + LOCAL_MAX_TIMEOUT_100_S, + LOCAL_MAX_TIMEOUT_1000_S +}; + +enum link_tuning_encoding { + OPA_PASSIVE_TUNING, + OPA_ACTIVE_TUNING, + OPA_UNKNOWN_TUNING +}; + +/* + * Shifts and masks for the link SI tuning values stuffed into the ASIC scratch + * registers for integrated platforms + */ +#define PORT0_PORT_TYPE_SHIFT 0 +#define PORT0_LOCAL_ATTEN_SHIFT 4 +#define PORT0_REMOTE_ATTEN_SHIFT 10 +#define PORT0_DEFAULT_ATTEN_SHIFT 32 + +#define PORT1_PORT_TYPE_SHIFT 16 +#define PORT1_LOCAL_ATTEN_SHIFT 20 +#define PORT1_REMOTE_ATTEN_SHIFT 26 +#define PORT1_DEFAULT_ATTEN_SHIFT 40 + +#define PORT0_PORT_TYPE_MASK 0xFUL +#define PORT0_LOCAL_ATTEN_MASK 0x3FUL +#define PORT0_REMOTE_ATTEN_MASK 0x3FUL +#define PORT0_DEFAULT_ATTEN_MASK 0xFFUL + +#define PORT1_PORT_TYPE_MASK 0xFUL +#define PORT1_LOCAL_ATTEN_MASK 0x3FUL +#define PORT1_REMOTE_ATTEN_MASK 0x3FUL +#define PORT1_DEFAULT_ATTEN_MASK 0xFFUL + +#define PORT0_PORT_TYPE_SMASK (PORT0_PORT_TYPE_MASK << \ + PORT0_PORT_TYPE_SHIFT) +#define PORT0_LOCAL_ATTEN_SMASK (PORT0_LOCAL_ATTEN_MASK << \ + PORT0_LOCAL_ATTEN_SHIFT) +#define PORT0_REMOTE_ATTEN_SMASK (PORT0_REMOTE_ATTEN_MASK << \ + PORT0_REMOTE_ATTEN_SHIFT) +#define PORT0_DEFAULT_ATTEN_SMASK (PORT0_DEFAULT_ATTEN_MASK << \ + PORT0_DEFAULT_ATTEN_SHIFT) + +#define PORT1_PORT_TYPE_SMASK (PORT1_PORT_TYPE_MASK << \ + PORT1_PORT_TYPE_SHIFT) +#define PORT1_LOCAL_ATTEN_SMASK (PORT1_LOCAL_ATTEN_MASK << \ + PORT1_LOCAL_ATTEN_SHIFT) +#define PORT1_REMOTE_ATTEN_SMASK (PORT1_REMOTE_ATTEN_MASK << \ + PORT1_REMOTE_ATTEN_SHIFT) +#define PORT1_DEFAULT_ATTEN_SMASK (PORT1_DEFAULT_ATTEN_MASK << \ + PORT1_DEFAULT_ATTEN_SHIFT) + +#define QSFP_MAX_POWER_SHIFT 0 +#define TX_NO_EQ_SHIFT 4 +#define TX_EQ_SHIFT 25 +#define RX_SHIFT 46 + +#define QSFP_MAX_POWER_MASK 0xFUL +#define TX_NO_EQ_MASK 0x1FFFFFUL +#define TX_EQ_MASK 0x1FFFFFUL +#define RX_MASK 0xFFFFUL + +#define QSFP_MAX_POWER_SMASK (QSFP_MAX_POWER_MASK << \ + QSFP_MAX_POWER_SHIFT) +#define TX_NO_EQ_SMASK (TX_NO_EQ_MASK << TX_NO_EQ_SHIFT) +#define TX_EQ_SMASK (TX_EQ_MASK << TX_EQ_SHIFT) +#define RX_SMASK (RX_MASK << RX_SHIFT) + +#define TX_PRECUR_SHIFT 0 +#define TX_ATTN_SHIFT 4 +#define QSFP_TX_CDR_APPLY_SHIFT 9 +#define QSFP_TX_EQ_APPLY_SHIFT 10 +#define QSFP_TX_CDR_SHIFT 11 +#define QSFP_TX_EQ_SHIFT 12 +#define TX_POSTCUR_SHIFT 16 + +#define TX_PRECUR_MASK 0xFUL +#define TX_ATTN_MASK 0x1FUL +#define QSFP_TX_CDR_APPLY_MASK 0x1UL +#define QSFP_TX_EQ_APPLY_MASK 0x1UL +#define QSFP_TX_CDR_MASK 0x1UL +#define QSFP_TX_EQ_MASK 0xFUL +#define TX_POSTCUR_MASK 0x1FUL + +#define TX_PRECUR_SMASK (TX_PRECUR_MASK << TX_PRECUR_SHIFT) +#define TX_ATTN_SMASK (TX_ATTN_MASK << TX_ATTN_SHIFT) +#define QSFP_TX_CDR_APPLY_SMASK (QSFP_TX_CDR_APPLY_MASK << \ + QSFP_TX_CDR_APPLY_SHIFT) +#define QSFP_TX_EQ_APPLY_SMASK (QSFP_TX_EQ_APPLY_MASK << \ + QSFP_TX_EQ_APPLY_SHIFT) +#define QSFP_TX_CDR_SMASK (QSFP_TX_CDR_MASK << QSFP_TX_CDR_SHIFT) +#define QSFP_TX_EQ_SMASK (QSFP_TX_EQ_MASK << QSFP_TX_EQ_SHIFT) +#define TX_POSTCUR_SMASK (TX_POSTCUR_MASK << TX_POSTCUR_SHIFT) + +#define QSFP_RX_CDR_APPLY_SHIFT 0 +#define QSFP_RX_EMP_APPLY_SHIFT 1 +#define QSFP_RX_AMP_APPLY_SHIFT 2 +#define QSFP_RX_CDR_SHIFT 3 +#define QSFP_RX_EMP_SHIFT 4 +#define QSFP_RX_AMP_SHIFT 8 + +#define QSFP_RX_CDR_APPLY_MASK 0x1UL +#define QSFP_RX_EMP_APPLY_MASK 0x1UL +#define QSFP_RX_AMP_APPLY_MASK 0x1UL +#define QSFP_RX_CDR_MASK 0x1UL +#define QSFP_RX_EMP_MASK 0xFUL +#define QSFP_RX_AMP_MASK 0x3UL + +#define QSFP_RX_CDR_APPLY_SMASK (QSFP_RX_CDR_APPLY_MASK << \ + QSFP_RX_CDR_APPLY_SHIFT) +#define QSFP_RX_EMP_APPLY_SMASK (QSFP_RX_EMP_APPLY_MASK << \ + QSFP_RX_EMP_APPLY_SHIFT) +#define QSFP_RX_AMP_APPLY_SMASK (QSFP_RX_AMP_APPLY_MASK << \ + QSFP_RX_AMP_APPLY_SHIFT) +#define QSFP_RX_CDR_SMASK (QSFP_RX_CDR_MASK << QSFP_RX_CDR_SHIFT) +#define QSFP_RX_EMP_SMASK (QSFP_RX_EMP_MASK << QSFP_RX_EMP_SHIFT) +#define QSFP_RX_AMP_SMASK (QSFP_RX_AMP_MASK << QSFP_RX_AMP_SHIFT) + +#define BITMAP_VERSION 1 +#define BITMAP_VERSION_SHIFT 44 +#define BITMAP_VERSION_MASK 0xFUL +#define BITMAP_VERSION_SMASK (BITMAP_VERSION_MASK << \ + BITMAP_VERSION_SHIFT) +#define CHECKSUM_SHIFT 48 +#define CHECKSUM_MASK 0xFFFFUL +#define CHECKSUM_SMASK (CHECKSUM_MASK << CHECKSUM_SHIFT) + +/* platform.c */ +void get_platform_config(struct hfi1_devdata *dd); +void free_platform_config(struct hfi1_devdata *dd); +void get_port_type(struct hfi1_pportdata *ppd); +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on); +void tune_serdes(struct hfi1_pportdata *ppd); + +#endif /*__PLATFORM_H*/ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c new file mode 100644 index 0000000000..6193d48b2c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -0,0 +1,925 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ + +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/hash.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> +#include <rdma/ib_verbs.h> + +#include "hfi.h" +#include "qp.h" +#include "trace.h" +#include "verbs_txreq.h" + +unsigned int hfi1_qp_table_size = 256; +module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +static void flush_tx_list(struct rvt_qp *qp); +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *stx, + unsigned int seq, + bool pkts_sent); +static void iowait_wakeup(struct iowait *wait, int reason); +static void iowait_sdma_drained(struct iowait *wait); +static void qp_pio_drain(struct rvt_qp *qp); + +const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_REG_MR] = { + .length = sizeof(struct ib_reg_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_LOCAL_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_SEND_WITH_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_RC), +}, + +[IB_WR_OPFN] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_USE_RESERVE, +}, + +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + +}; + +static void flush_list_head(struct list_head *l) +{ + while (!list_empty(l)) { + struct sdma_txreq *tx; + + tx = list_first_entry( + l, + struct sdma_txreq, + list); + list_del_init(&tx->list); + hfi1_put_txreq( + container_of(tx, struct verbs_txreq, txreq)); + } +} + +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head); + flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head); +} + +static void flush_iowait(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + seqlock_t *lock = priv->s_iowait.lock; + + if (!lock) + return; + write_seqlock_irqsave(lock, flags); + if (!list_empty(&priv->s_iowait.list)) { + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + rvt_put_qp(qp); + } + write_sequnlock_irqrestore(lock, flags); +} + +/* + * This function is what we would push to the core layer if we wanted to be a + * "first class citizen". Instead we hide this here and rely on Verbs ULPs + * to blindly pass the MTU enum value from the PathRecord to us. + */ +static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) +{ + /* Constraining 10KB packets to 8KB packets */ + if (mtu == (enum ib_mtu)OPA_MTU_10240) + mtu = (enum ib_mtu)OPA_MTU_8192; + return opa_mtu_enum_to_int((enum opa_mtu)mtu); +} + +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_ibdev *dev = to_idev(ibqp->device); + struct hfi1_devdata *dd = dd_from_dev(dev); + u8 sc; + + if (attr_mask & IB_QP_AV) { + sc = ah_to_sc(ibqp->device, &attr->ah_attr); + if (sc == 0xf) + return -EINVAL; + + if (!qp_to_sdma_engine(qp, sc) && + dd->flags & HFI1_HAS_SEND_DMA) + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } + + if (attr_mask & IB_QP_ALT_PATH) { + sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr); + if (sc == 0xf) + return -EINVAL; + + if (!qp_to_sdma_engine(qp, sc) && + dd->flags & HFI1_HAS_SEND_DMA) + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } + + return 0; +} + +/* + * qp_set_16b - Set the hdr_type based on whether the slid or the + * dlid in the connection is extended. Only applicable for RC and UC + * QPs. UD QPs determine this on the fly from the ah in the wqe + */ +static inline void qp_set_16b(struct rvt_qp *qp) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct hfi1_qp_priv *priv = qp->priv; + + /* Update ah_attr to account for extended LIDs */ + hfi1_update_ah_attr(qp->ibqp.device, &qp->remote_ah_attr); + + /* Create 32 bit LIDs */ + hfi1_make_opa_lid(&qp->remote_ah_attr); + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) + return; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, &qp->remote_ah_attr); +} + +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + + if (attr_mask & IB_QP_AV) { + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); + } + + if (attr_mask & IB_QP_PATH_MIG_STATE && + attr->path_mig_state == IB_MIG_MIGRATED && + qp->s_mig_state == IB_MIG_ARMED) { + qp->s_flags |= HFI1_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); + } + + opfn_qp_init(qp, attr, attr_mask); +} + +/** + * hfi1_setup_wqe - set up the wqe + * @qp: The qp + * @wqe: The built wqe + * @call_send: Determine if the send should be posted or scheduled. + * + * Perform setup of the wqe. This is called + * prior to inserting the wqe into the ring but after + * the wqe has been setup by RDMAVT. This function + * allows the driver the opportunity to perform + * validation and additional setup of the wqe. + * + * Returns 0 on success, -EINVAL on failure + * + */ +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct rvt_ah *ah; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + hfi1_setup_tid_rdma_wqe(qp, wqe); + fallthrough; + case IB_QPT_UC: + if (wqe->length > 0x80000000U) + return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; + break; + case IB_QPT_SMI: + /* + * SM packets should exclusively use VL15 and their SL is + * ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when ah + * is created, SL is 0 in most cases and as a result some + * fields (vl and pmtu) in ah may not be set correctly, + * depending on the SL2SC and SC2VL tables at the time. + */ + ppd = ppd_from_ibp(ibp); + dd = dd_from_ppd(ppd); + if (wqe->length > dd->vld[15].mtu) + return -EINVAL; + break; + case IB_QPT_GSI: + case IB_QPT_UD: + ah = rvt_get_swqe_ah(wqe); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) + return -EINVAL; + break; + default: + break; + } + + /* + * System latency between send and schedule is large enough that + * forcing call_send to true for piothreshold packets is necessary. + */ + if (wqe->length <= piothreshold) + *call_send = true; + return 0; +} + +/** + * _hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress w/o regard to the s_flags. + * + * It is only used in the post send, which doesn't hold + * the s_lock. + */ +bool _hfi1_schedule_send(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = ppd->dd; + + if (dd->flags & HFI1_SHUTDOWN) + return true; + + return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); +} + +static void qp_pio_drain(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->s_sendcontext) + return; + while (iowait_pio_pending(&priv->s_iowait)) { + write_seqlock_irq(&priv->s_sendcontext->waitlock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1); + write_sequnlock_irq(&priv->s_sendcontext->waitlock); + iowait_pio_drain(&priv->s_iowait); + write_seqlock_irq(&priv->s_sendcontext->waitlock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0); + write_sequnlock_irq(&priv->s_sendcontext->waitlock); + } +} + +/** + * hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress and caller should hold + * the s_lock. + * @return true if the first leg is scheduled; + * false if the first leg is not scheduled. + */ +bool hfi1_schedule_send(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + if (hfi1_send_ok(qp)) { + _hfi1_schedule_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_IB); + return false; +} + +static void hfi1_qp_schedule(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + bool ret; + + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) { + ret = hfi1_schedule_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + } + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) { + ret = hfi1_schedule_tid_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } +} + +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & flag) { + qp->s_flags &= ~flag; + trace_hfi1_qpwakeup(qp, flag); + hfi1_qp_schedule(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + /* Notify hfi1_destroy_qp() if it is waiting. */ + rvt_put_qp(qp); +} + +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) { + qp->s_flags &= ~RVT_S_BUSY; + /* + * If we are sending a first-leg packet from the second leg, + * we need to clear the busy flag from priv->s_flags to + * avoid a race condition when the qp wakes up before + * the call to hfi1_verbs_send() returns to the second + * leg. In that case, the second leg will terminate without + * being re-scheduled, resulting in failure to send TID RDMA + * WRITE DATA and TID RDMA ACK packets. + */ + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + priv->s_flags &= ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } + } else { + priv->s_flags &= ~RVT_S_BUSY; + } +} + +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *stx, + uint seq, + bool pkts_sent) +{ + struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq); + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + unsigned long flags; + int ret = 0; + + qp = tx->qp; + priv = qp->priv; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + /* Make a common routine? */ + list_add_tail(&stx->list, &wait->tx_head); + write_seqlock(&sde->waitlock); + if (sdma_progress(sde, seq, stx)) + goto eagain; + if (list_empty(&priv->s_iowait.list)) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + + ibp->rvp.n_dmawait++; + qp->s_flags |= RVT_S_WAIT_DMA_DESC; + iowait_get_priority(&priv->s_iowait); + iowait_queue(pkts_sent, &priv->s_iowait, + &sde->dmawait); + priv->s_iowait.lock = &sde->waitlock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC); + rvt_get_qp(qp); + } + write_sequnlock(&sde->waitlock); + hfi1_qp_unbusy(qp, wait); + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EBUSY; + } else { + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_put_txreq(tx); + } + return ret; +eagain: + write_sequnlock(&sde->waitlock); + spin_unlock_irqrestore(&qp->s_lock, flags); + list_del_init(&stx->list); + return -EAGAIN; +} + +static void iowait_wakeup(struct iowait *wait, int reason) +{ + struct rvt_qp *qp = iowait_to_qp(wait); + + WARN_ON(reason != SDMA_AVAIL_REASON); + hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC); +} + +static void iowait_sdma_drained(struct iowait *wait) +{ + struct rvt_qp *qp = iowait_to_qp(wait); + unsigned long flags; + + /* + * This happens when the send engine notes + * a QP in the error state and cannot + * do the flush work until that QP's + * sdma work has finished. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & RVT_S_WAIT_DMA) { + qp->s_flags &= ~RVT_S_WAIT_DMA; + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +static void hfi1_init_priority(struct iowait *w) +{ + struct rvt_qp *qp = iowait_to_qp(w); + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->s_flags & RVT_S_ACK_PENDING) + w->priority++; + if (priv->s_flags & RVT_S_ACK_PENDING) + w->priority++; +} + +/** + * qp_to_sdma_engine - map a qp to a send engine + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send engine for the qp or NULL for SMI type qp. + */ +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct sdma_engine *sde; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return NULL; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return NULL; + default: + break; + } + sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5); + return sde; +} + +/** + * qp_to_send_context - map a qp to a send context + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send context for the qp + */ +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + /* SMA packets to VL15 */ + return dd->vld[15].sc; + default: + break; + } + + return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, + sc5); +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +static int qp_idle(struct rvt_qp *qp) +{ + return + qp->s_last == qp->s_acked && + qp->s_acked == qp->s_cur && + qp->s_cur == qp->s_tail && + qp->s_tail == qp->s_head; +} + +/** + * qp_iter_print - print the qp information to seq_file + * @s: the seq_file to emit the qp information on + * @iter: the iterator for the qp hash list + */ +void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) +{ + struct rvt_swqe *wqe; + struct rvt_qp *qp = iter->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct sdma_engine *sde; + struct send_context *send_context; + struct rvt_ack_entry *e = NULL; + struct rvt_srq *srq = qp->ibqp.srq ? + ibsrq_to_rvtsrq(qp->ibqp.srq) : NULL; + + sde = qp_to_sdma_engine(qp, priv->s_sc); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + send_context = qp_to_send_context(qp, priv->s_sc); + if (qp->s_ack_queue) + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + seq_printf(s, + "N %d %s QP %x R %u %s %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x RNR %d %s %d\n", + iter->n, + qp_idle(qp) ? "I" : "B", + qp->ibqp.qp_num, + atomic_read(&qp->refcount), + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe ? wqe->wr.opcode : 0, + qp->s_flags, + iowait_sdma_pending(&priv->s_iowait), + iowait_pio_pending(&priv->s_iowait), + !list_empty(&priv->s_iowait.list), + qp->timeout, + wqe ? wqe->ssn : 0, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->r_psn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->s_avail, + /* ack_queue ring pointers, size */ + qp->s_tail_ack_queue, qp->r_head_ack_queue, + rvt_max_atomic(&to_idev(qp->ibqp.device)->rdi), + /* remote QP info */ + qp->remote_qpn, + rdma_ah_get_dlid(&qp->remote_ah_attr), + rdma_ah_get_sl(&qp->remote_ah_attr), + qp->pmtu, + qp->s_retry, + qp->s_retry_cnt, + qp->s_rnr_retry_cnt, + qp->s_rnr_retry, + sde, + sde ? sde->this_idx : 0, + send_context, + send_context ? send_context->sw_index : 0, + ib_cq_head(qp->ibqp.send_cq), + ib_cq_tail(qp->ibqp.send_cq), + qp->pid, + qp->s_state, + qp->s_ack_state, + /* ack queue information */ + e ? e->opcode : 0, + e ? e->psn : 0, + e ? e->lpsn : 0, + qp->r_min_rnr_timer, + srq ? "SRQ" : "RQ", + srq ? srq->rq.size : qp->r_rq.size + ); +} + +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, rdi->dparms.node); + if (!priv) + return ERR_PTR(-ENOMEM); + + priv->owner = qp; + + priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), GFP_KERNEL, + rdi->dparms.node); + if (!priv->s_ahg) { + kfree(priv); + return ERR_PTR(-ENOMEM); + } + iowait_init( + &priv->s_iowait, + 1, + _hfi1_do_send, + _hfi1_do_tid_send, + iowait_sleep, + iowait_wakeup, + iowait_sdma_drained, + hfi1_init_priority); + /* Init to a value to start the running average correctly */ + priv->s_running_pkt_size = piothreshold / 2; + return priv; +} + +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + hfi1_qp_priv_tid_free(rdi, qp); + kfree(priv->s_ahg); + kfree(priv); +} + +unsigned free_all_qps(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + int n; + unsigned qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct hfi1_ibport *ibp = &dd->pport[n].ibport_data; + + rcu_read_lock(); + if (rcu_dereference(ibp->rvp.qp[0])) + qp_inuse++; + if (rcu_dereference(ibp->rvp.qp[1])) + qp_inuse++; + rcu_read_unlock(); + } + + return qp_inuse; +} + +void flush_qp_waiters(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + flush_iowait(qp); + hfi1_tid_rdma_flush_wait(qp); +} + +void stop_send_queue(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + iowait_cancel_work(&priv->s_iowait); + if (cancel_work_sync(&priv->tid_rdma.trigger_work)) + rvt_put_qp(qp); +} + +void quiesce_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); + iowait_sdma_drain(&priv->s_iowait); + qp_pio_drain(qp); + flush_tx_list(qp); +} + +void notify_qp_reset(struct rvt_qp *qp) +{ + hfi1_qp_kern_exp_rcv_clear_all(qp); + qp->r_adefered = 0; + clear_ahg(qp); + + /* Clear any OPFN state */ + if (qp->ibqp.qp_type == IB_QPT_RC) + opfn_conn_error(qp); +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. + */ +void hfi1_migrate_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr); + qp->s_pkey_index = qp->s_alt_pkey_index; + qp->s_flags |= HFI1_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + qp_set_16b(qp); + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +int mtu_to_path_mtu(u32 mtu) +{ + return mtu_to_enum(mtu, OPA_MTU_8192); +} + +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) +{ + u32 mtu; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + struct hfi1_ibport *ibp; + u8 sc, vl; + + ibp = &dd->pport[qp->port_num - 1].ibport_data; + sc = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + vl = sc_to_vlt(dd, sc); + + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu); + if (vl < PER_VL_SEND_CONTEXTS) + mtu = min_t(u32, mtu, dd->vld[vl].mtu); + return mtu; +} + +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr) +{ + int mtu, pidx = qp->port_num - 1; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu); + if (mtu == -1) + return -1; /* values less than 0 are error */ + + if (mtu > dd->pport[pidx].ibmtu) + return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048); + else + return attr->path_mtu; +} + +void notify_error_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + seqlock_t *lock = priv->s_iowait.lock; + + if (lock) { + write_seqlock(lock); + if (!list_empty(&priv->s_iowait.list) && + !(qp->s_flags & RVT_S_BUSY) && + !(priv->s_flags & RVT_S_BUSY)) { + qp->s_flags &= ~HFI1_S_ANY_WAIT_IO; + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + rvt_put_qp(qp); + } + write_sequnlock(lock); + } + + if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + flush_tx_list(qp); + } +} + +/** + * hfi1_qp_iter_cb - callback for iterator + * @qp: the qp + * @v: the sl in low bits of v + * + * This is called from the iterator callback to work + * on an individual qp. + */ +static void hfi1_qp_iter_cb(struct rvt_qp *qp, u64 v) +{ + int lastwqe; + struct ib_event ev; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u8 sl = (u8)v; + + if (qp->port_num != ppd->port || + (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) || + rdma_ah_get_sl(&qp->remote_ah_attr) != sl || + !(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK)) + return; + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +/** + * hfi1_error_port_qps - put a port's RC/UC qps into error state + * @ibp: the ibport. + * @sl: the service level. + * + * This function places all RC/UC qps with a given service level into error + * state. It is generally called to force upper lay apps to abandon stale qps + * after an sl->sc mapping change. + */ +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_ibdev *dev = &ppd->dd->verbs_dev; + + rvt_qp_iter(&dev->rdi, sl, hfi1_qp_iter_cb); +} diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h new file mode 100644 index 0000000000..cdf87bc6ad --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#ifndef _QP_H +#define _QP_H +#include <linux/hash.h> +#include <rdma/rdmavt_qp.h> +#include "verbs.h" +#include "sdma.h" +#include "verbs_txreq.h" + +extern unsigned int hfi1_qp_table_size; + +extern const struct rvt_operation_params hfi1_post_parms[]; + +/* + * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK + * + * HFI1_S_AHG_VALID - ahg header valid on chip + * HFI1_S_AHG_CLEAR - have send engine clear ahg state + * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain + * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource + * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response + * HFI1_S_WAIT_HALT - halt the first leg send engine + * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 + */ +#define HFI1_S_AHG_VALID 0x80000000 +#define HFI1_S_AHG_CLEAR 0x40000000 +#define HFI1_S_WAIT_PIO_DRAIN 0x20000000 +#define HFI1_S_WAIT_TID_SPACE 0x10000000 +#define HFI1_S_WAIT_TID_RESP 0x08000000 +#define HFI1_S_WAIT_HALT 0x04000000 +#define HFI1_S_MIN_BIT_MASK 0x01000000 + +/* + * overload wait defines + */ + +#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) +#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) +#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA) + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. + */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) && + (verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) || + (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + +/* + * free_ahg - clear ahg from QP + */ +static inline void clear_ahg(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + priv->s_ahg->ahgcount = 0; + qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR); + if (priv->s_sde && qp->s_ahgidx >= 0) + sdma_ahg_free(priv->s_sde, qp->s_ahgidx); + qp->s_ahgidx = -1; +} + +/** + * hfi1_qp_wakeup - wake up on the indicated event + * @qp: the QP + * @flag: flag the qp on which the qp is stalled + */ +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag); + +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5); +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); + +void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); + +bool _hfi1_schedule_send(struct rvt_qp *qp); +bool hfi1_schedule_send(struct rvt_qp *qp); + +void hfi1_migrate_qp(struct rvt_qp *qp); + +/* + * Functions provided by hfi1 driver for rdmavt to use + */ +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +unsigned free_all_qps(struct rvt_dev_info *rdi); +void notify_qp_reset(struct rvt_qp *qp); +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); +void flush_qp_waiters(struct rvt_qp *qp); +void notify_error_qp(struct rvt_qp *qp); +void stop_send_queue(struct rvt_qp *qp); +void quiesce_qp(struct rvt_qp *qp); +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); +int mtu_to_path_mtu(u32 mtu); +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait); +#endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c new file mode 100644 index 0000000000..19d7887a4f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -0,0 +1,816 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ + +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> + +#include "hfi.h" + +/* for the given bus number, return the CSR for reading an i2c line */ +static inline u32 i2c_in_csr(u32 bus_num) +{ + return bus_num ? ASIC_QSFP2_IN : ASIC_QSFP1_IN; +} + +/* for the given bus number, return the CSR for writing an i2c line */ +static inline u32 i2c_oe_csr(u32 bus_num) +{ + return bus_num ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; +} + +static void hfi1_setsda(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. + */ + if (state) + reg &= ~QSFP_HFI0_I2CDAT; + else + reg |= QSFP_HFI0_I2CDAT; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static void hfi1_setscl(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. + */ + if (state) + reg &= ~QSFP_HFI0_I2CCLK; + else + reg |= QSFP_HFI0_I2CCLK; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static int hfi1_getsda(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setsda(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CDAT); +} + +static int hfi1_getscl(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setscl(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CCLK); +} + +/* + * Allocate and initialize the given i2c bus number. + * Returns NULL on failure. + */ +static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad, int num) +{ + struct hfi1_i2c_bus *bus; + int ret; + + bus = kzalloc(sizeof(*bus), GFP_KERNEL); + if (!bus) + return NULL; + + bus->controlling_dd = dd; + bus->num = num; /* our bus number */ + + bus->algo.setsda = hfi1_setsda; + bus->algo.setscl = hfi1_setscl; + bus->algo.getsda = hfi1_getsda; + bus->algo.getscl = hfi1_getscl; + bus->algo.udelay = 5; + bus->algo.timeout = usecs_to_jiffies(100000); + bus->algo.data = bus; + + bus->adapter.owner = THIS_MODULE; + bus->adapter.algo_data = &bus->algo; + bus->adapter.dev.parent = &dd->pcidev->dev; + snprintf(bus->adapter.name, sizeof(bus->adapter.name), + "hfi1_i2c%d", num); + + ret = i2c_bit_add_bus(&bus->adapter); + if (ret) { + dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n", + __func__, num, ret); + kfree(bus); + return NULL; + } + + return bus; +} + +/* + * Initialize i2c buses. + * Return 0 on success, -errno on error. + */ +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + ad->i2c_bus0 = init_i2c_bus(dd, ad, 0); + ad->i2c_bus1 = init_i2c_bus(dd, ad, 1); + if (!ad->i2c_bus0 || !ad->i2c_bus1) + return -ENOMEM; + return 0; +}; + +static void clean_i2c_bus(struct hfi1_i2c_bus *bus) +{ + if (bus) { + i2c_del_adapter(&bus->adapter); + kfree(bus); + } +} + +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + if (!ad) + return; + clean_i2c_bus(ad->i2c_bus0); + ad->i2c_bus0 = NULL; + clean_i2c_bus(ad->i2c_bus1); + ad->i2c_bus1 = NULL; +} + +static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + fallthrough; + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_NOSTART; + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } + + i2c->controlling_dd = dd; + ret = i2c_transfer(&i2c->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret %d\n", + __func__, i2c->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} + +static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = I2C_M_RD; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + fallthrough; + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_RD; + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } + + bus->controlling_dd = dd; + ret = i2c_transfer(&bus->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n", + __func__, bus->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} + +/* + * Raw i2c write. No set-up or lock checking. + * + * Return 0 on success, -errno on error. + */ +static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; + + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len); +} + +/* + * Caller must hold the i2c chain resource. + * + * Return number of bytes written, or -errno. + */ +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + int ret; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len); + if (ret) + return ret; + + return len; +} + +/* + * Raw i2c read. No set-up or lock checking. + * + * Return 0 on success, -errno on error. + */ +static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; + + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len); +} + +/* + * Caller must hold the i2c chain resource. + * + * Return number of bytes read, or -errno. + */ +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + int ret; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len); + if (ret) + return ret; + + return len; +} + +/* + * Write page n, offset m of QSFP memory as defined by SFF 8636 + * by writing @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + * + * Return number of bytes written or -errno. + */ +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nwrite; + int ret = 0; + u8 page; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); + break; + } + + offset = addr % QSFP_PAGESIZE; + nwrite = len - count; + /* truncate write to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY) + nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nwrite); + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) /* stop on error */ + break; + + count += nwrite; + addr += nwrite; + } + + if (ret < 0) + return ret; + return count; +} + +/* + * Perform a stand-alone single QSFP write. Acquire the resource, do the + * write, then release the resource. + */ +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_write(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * by reading @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + * + * Return the number of bytes read or -errno. + */ +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nread; + int ret = 0; + u8 page; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); + break; + } + + offset = addr % QSFP_PAGESIZE; + nread = len - count; + /* truncate read to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY) + nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nread); + if (ret) /* stop on error */ + break; + + count += nread; + addr += nread; + } + + if (ret < 0) + return ret; + return count; +} + +/* + * Perform a stand-alone single QSFP read. Acquire the resource, do the + * read, then release the resource. + */ +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_read(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * This function caches the QSFP memory range in 128 byte chunks. + * As an example, the next byte after address 255 is byte 128 from + * upper page 01H (if existing) rather than byte 0 from lower page 00H. + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * in the cache by reading byte ((128 * n) + m) + * The calls to qsfp_{read,write} in this function correctly handle the + * address map difference between this mapping and the mapping implemented + * by those functions + * + * The caller must be holding the QSFP i2c chain resource. + */ +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) +{ + u32 target = ppd->dd->hfi1_id; + int ret; + unsigned long flags; + u8 *cache = &cp->cache[0]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE); + if (ret != QSFP_PAGESIZE) { + dd_dev_info(ppd->dd, + "%s: Page 0 read failed, expected %d, got %d\n", + __func__, QSFP_PAGESIZE, ret); + goto bail; + } + + /* Is paging enabled? */ + if (!(cache[2] & 4)) { + /* Paging enabled, page 03 required */ + if ((cache[195] & 0xC0) == 0xC0) { + /* all */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x80) == 0x80) { + /* only page 2 and 3 */ + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x40) == 0x40) { + /* only page 1 and 3 */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else { + /* only page 3 */ + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } + } + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 1; + ppd->qsfp_info.cache_refresh_required = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + return 0; + +bail: + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + return ret; +} + +const char * const hfi1_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3) +/* For use with QSFP_HIGH_PWR macro */ +#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */ + +/* + * Takes power class byte [Page 00 Byte 129] in SFF 8636 + * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4) + */ +int get_qsfp_power_class(u8 power_byte) +{ + if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED) + /* power classes count from 1, their bit encodings from 0 */ + return (QSFP_PWR(power_byte) + 1); + /* + * 00 in the high power classes stands for unused, bringing + * balance to the off-by-1 offset above, we add 4 here to + * account for the difference between the low and high power + * groups + */ + return (QSFP_HIGH_PWR(power_byte) + 4); +} + +int qsfp_mod_present(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + + reg = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); + return !(reg & QSFP_HFI0_MODPRST_N); +} + +/* + * This function maps QSFP memory addresses in 128 byte chunks in the following + * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1 + * spec + * For addr 000-127, lower page 00h + * For addr 128-255, upper page 00h + * For addr 256-383, upper page 01h + * For addr 384-511, upper page 02h + * For addr 512-639, upper page 03h + * + * For addresses beyond this range, it returns the invalid range of data buffer + * set to 0. + * For upper pages that are optional, if they are not valid, returns the + * particular range of bytes in the data buffer set to 0. + */ +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, + u8 *data) +{ + struct hfi1_pportdata *ppd; + u32 excess_len = len; + int ret = 0, offset = 0; + + if (port_num > dd->num_pports || port_num < 1) { + dd_dev_info(dd, "%s: Invalid port number %d\n", + __func__, port_num); + ret = -EINVAL; + goto set_zeroes; + } + + ppd = dd->pport + (port_num - 1); + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto set_zeroes; + } + + if (!ppd->qsfp_info.cache_valid) { + ret = -EINVAL; + goto set_zeroes; + } + + if (addr >= (QSFP_MAX_NUM_PAGES * 128)) { + ret = -ERANGE; + goto set_zeroes; + } + + if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) { + excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128); + memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len)); + data += (len - excess_len); + goto set_zeroes; + } + + memcpy(data, &ppd->qsfp_info.cache[addr], len); + + if (addr <= QSFP_MONITOR_VAL_END && + (addr + len) >= QSFP_MONITOR_VAL_START) { + /* Overlap with the dynamic channel monitor range */ + if (addr < QSFP_MONITOR_VAL_START) { + if (addr + len <= QSFP_MONITOR_VAL_END) + len = addr + len - QSFP_MONITOR_VAL_START; + else + len = QSFP_MONITOR_RANGE; + offset = QSFP_MONITOR_VAL_START - addr; + addr = QSFP_MONITOR_VAL_START; + } else if (addr == QSFP_MONITOR_VAL_START) { + offset = 0; + if (addr + len > QSFP_MONITOR_VAL_END) + len = QSFP_MONITOR_RANGE; + } else { + offset = 0; + if (addr + len > QSFP_MONITOR_VAL_END) + len = QSFP_MONITOR_VAL_END - addr + 1; + } + /* Refresh the values of the dynamic monitors from the cable */ + ret = one_qsfp_read(ppd, dd->hfi1_id, addr, data + offset, len); + if (ret != len) { + ret = -EAGAIN; + goto set_zeroes; + } + } + + return 0; + +set_zeroes: + memset(data, 0, excess_len); + return ret; +} + +static const char *pwr_codes[8] = {"N/AW", + "1.5W", + "2.0W", + "2.5W", + "3.5W", + "4.0W", + "4.5W", + "5.0W" + }; + +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) +{ + u8 *cache = &ppd->qsfp_info.cache[0]; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar; + int bidx = 0; + u8 *atten = &cache[QSFP_ATTEN_OFFS]; + u8 *vendor_oui = &cache[QSFP_VOUI_OFFS]; + u8 power_byte = 0; + + sofar = 0; + lenstr[0] = ' '; + lenstr[1] = '\0'; + + if (ppd->qsfp_info.cache_valid) { + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + snprintf(lenstr, sizeof(lenstr), "%dM ", + cache[QSFP_MOD_LEN_OFFS]); + + power_byte = cache[QSFP_MOD_PWR_OFFS]; + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", + pwr_codes[get_qsfp_power_class(power_byte)]); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", + lenstr, + hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + QSFP_OUI(vendor_oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, &cache[QSFP_PN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, &cache[QSFP_REV_OFFS]); + + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + sofar += scnprintf(buf + sofar, len - sofar, + "Atten:%d, %d\n", + QSFP_ATTEN_SDR(atten), + QSFP_ATTEN_DDR(atten)); + + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, &cache[QSFP_SN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + + memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK); + for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) { + sofar += scnprintf(buf + sofar, len - sofar, + " %02X", bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + } + return sofar; +} diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h new file mode 100644 index 0000000000..8f14111eaa --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + */ +/* QSFP support common definitions, for hfi driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 +/* 128 byte pages, per SFF 8636 rev 2.4 */ +#define QSFP_MAX_NUM_PAGES 5 + +/* + * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1. + * _N means asserted low + */ +#define QSFP_HFI0_I2CCLK BIT(0) +#define QSFP_HFI0_I2CDAT BIT(1) +#define QSFP_HFI0_RESET_N BIT(2) +#define QSFP_HFI0_INT_N BIT(3) +#define QSFP_HFI0_MODPRST_N BIT(4) + +/* QSFP is paged at 256 bytes */ +#define QSFP_PAGESIZE 256 +/* Reads/writes cannot cross 128 byte boundaries */ +#define QSFP_RW_BOUNDARY 128 + +/* number of bytes in i2c offset for QSFP devices */ +#define __QSFP_OFFSET_SIZE 1 /* num address bytes */ +#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8) /* shifted value */ + +/* Defined fields that Intel requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +#define QSFP_MONITOR_VAL_START 22 +#define QSFP_MONITOR_VAL_END 81 +#define QSFP_MONITOR_RANGE (QSFP_MONITOR_VAL_END - QSFP_MONITOR_VAL_START + 1) +#define QSFP_TX_CTRL_BYTE_OFFS 86 +#define QSFP_PWR_CTRL_BYTE_OFFS 93 +#define QSFP_CDR_CTRL_BYTE_OFFS 98 + +#define QSFP_PAGE_SELECT_BYTE_OFFS 127 +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". + * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not Intel req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */ +#define QSFP_NOM_BIT_RATE_100_OFFS 140 +/* Byte 141 is Extended Rate Select. Not Intel req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Intel req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const hfi1_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* + * Bytes 188,189 are Wavelength tolerance, if optical + * If copper, they are attenuation in dB: + * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s + */ +#define QSFP_CU_ATTEN_7G_OFFS 188 +#define QSFP_CU_ATTEN_12G_OFFS 189 +/* Byte 190 is Max Case Temp. Not Intel req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */ +#define QSFP_CC_OFFS 191 +#define QSFP_EQ_INFO_OFFS 193 +#define QSFP_CDR_INFO_OFFS 194 +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */ +/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */ +#define QSFP_NOM_BIT_RATE_250_OFFS 222 +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * Interrupt flag masks + */ +#define QSFP_DATA_NOT_READY 0x01 + +#define QSFP_HIGH_TEMP_ALARM 0x80 +#define QSFP_LOW_TEMP_ALARM 0x40 +#define QSFP_HIGH_TEMP_WARNING 0x20 +#define QSFP_LOW_TEMP_WARNING 0x10 + +#define QSFP_HIGH_VCC_ALARM 0x80 +#define QSFP_LOW_VCC_ALARM 0x40 +#define QSFP_HIGH_VCC_WARNING 0x20 +#define QSFP_LOW_VCC_WARNING 0x10 + +#define QSFP_HIGH_POWER_ALARM 0x88 +#define QSFP_LOW_POWER_ALARM 0x44 +#define QSFP_HIGH_POWER_WARNING 0x22 +#define QSFP_LOW_POWER_WARNING 0x11 + +#define QSFP_HIGH_BIAS_ALARM 0x88 +#define QSFP_LOW_BIAS_ALARM 0x44 +#define QSFP_HIGH_BIAS_WARNING 0x22 +#define QSFP_LOW_BIAS_WARNING 0x11 + +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +/* + * struct qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the qsfp_lock arbitrate access to common resources. + * + */ +struct qsfp_data { + /* Helps to find our way */ + struct hfi1_pportdata *ppd; + struct work_struct qsfp_work; + u8 cache[QSFP_MAX_NUM_PAGES * 128]; + /* protect qsfp data */ + spinlock_t qsfp_lock; + u8 check_interrupt_flags; + u8 reset_needed; + u8 limiting_active; + u8 cache_valid; + u8 cache_refresh_required; +}; + +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, + struct qsfp_data *cp); +int get_qsfp_power_class(u8 power_byte); +int qsfp_mod_present(struct hfi1_pportdata *ppd); +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, + u32 len, u8 *data); + +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +struct hfi1_asic_data; +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c new file mode 100644 index 0000000000..acd2b273ea --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -0,0 +1,3244 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#include <linux/io.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> + +#include "hfi.h" +#include "qp.h" +#include "rc.h" +#include "verbs_txreq.h" +#include "trace.h" + +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled) + __must_hold(&qp->s_lock) +{ + struct rvt_ack_entry *e = NULL; + u8 i, p; + bool s = true; + + for (i = qp->r_head_ack_queue; ; i = p) { + if (i == qp->s_tail_ack_queue) + s = false; + if (i) + p = i - 1; + else + p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + if (p == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[p]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (p == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + s = false; + break; + } + } + if (prev) + *prev = p; + if (prev_ack) + *prev_ack = i; + if (scheduled) + *scheduled = s; + return e; +} + +/** + * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @ps: the xmit packet state + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps) +{ + struct rvt_ack_entry *e; + u32 hwords, hdrlen; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); + int middle = 0; + u32 pmtu = qp->pmtu; + struct hfi1_qp_priv *qpriv = qp->priv; + bool last_pkt; + u32 delta; + u8 next = qp->s_tail_ack_queue; + struct tid_rdma_request *req; + + trace_hfi1_rsp_make_rc_ack(qp, 0); + lockdep_assert_held(&qp->s_lock); + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + else + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + release_rdma_sge_mr(e); + fallthrough; + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++next > rvt_size_atomic(&dev->rdi)) + next = 0; + /* + * Only advance the s_acked_ack_queue pointer if there + * have been no TID RDMA requests. + */ + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode != TID_OP(WRITE_REQ) && + qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = next; + qp->s_tail_ack_queue = next; + trace_hfi1_rsp_make_rc_ack(qp, e->psn); + fallthrough; + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & RVT_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + /* Check for tid write fence */ + if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_ack_interlock(qp, e)) { + iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); + goto bail; + } + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; + } else if (e->opcode == TID_OP(READ_REQ)) { + /* + * If a TID RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_ack_state = TID_OP(READ_RESP); + goto read_resp; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + ps->s_txreq->ss = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = rvt_compute_aeth(qp); + ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = mask_psn(e->psn); + e->sent = 1; + } + trace_hfi1_tid_write_rsp_make_rc_ack(qp); + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + fallthrough; + case OP(RDMA_READ_RESPONSE_MIDDLE): + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + } else { + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + break; + + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. Remove RVT_S_RESP_PENDING flags from s_flags + * 4. If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + /* Do not free e->rdma_sge until all data are received */ + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + break; + + case TID_OP(READ_RESP): +read_resp: + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0, + &bth1, &bth2, &len, + &last_pkt); + if (delta == 0) + goto error_qp; + hwords += delta; + if (last_pkt) { + e->sent = 1; + /* + * Increment qp->s_tail_ack_queue through s_ack_state + * transition. + */ + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + } + break; + case TID_OP(READ_REQ): + goto bail; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); +normal_no_state: + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->s_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = mask_psn(qp->s_ack_psn); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; + ps->s_txreq->ss = NULL; + } + qp->s_rdma_ack_cnt++; + ps->s_txreq->sde = qpriv->s_sde; + ps->s_txreq->s_cur_size = len; + ps->s_txreq->hdr_dwords = hwords; + hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); + return 1; +error_qp: + spin_unlock_irqrestore(&qp->s_lock, ps->flags); + spin_lock_irqsave(&qp->r_lock, ps->flags); + spin_lock(&qp->s_lock); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, ps->flags); + spin_lock_irqsave(&qp->s_lock, ps->flags); +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * RVT_S_RESP_PENDING + */ + smp_wmb(); + qp->s_flags &= ~(RVT_S_RESP_PENDING + | RVT_S_ACK_PENDING + | HFI1_S_AHG_VALID); + return 0; +} + +/** + * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * @ps: the current packet state + * + * Assumes s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_other_headers *ohdr; + struct rvt_sge_state *ss = NULL; + struct rvt_swqe *wqe; + struct hfi1_swqe_priv *wpriv; + struct tid_rdma_request *req = NULL; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + u32 hwords = 5; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); + u32 pmtu = qp->pmtu; + char newreq; + int middle = 0; + int delta; + struct tid_rdma_flow *flow = NULL; + struct tid_rdma_params *remote; + + trace_hfi1_sender_make_rc_req(qp); + lockdep_assert_held(&qp->s_lock); + ps->s_txreq = get_txreq(ps->dev, qp); + if (!ps->s_txreq) + goto bail_no_tx; + + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } else { + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + } + + /* Sending responses has higher priority over sending requests. */ + if ((qp->s_flags & RVT_S_RESP_PENDING) && + make_rc_ack(dev, qp, ohdr, ps)) + return 1; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done_free_tx; + } + + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) + goto bail; + + if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= RVT_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* Send a request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); +check_s_state: + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == READ_ONCE(qp->s_head)) { + clear_ahg(qp); + goto bail; + } + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + * However, there is no need to guard against + * TID RDMA READ after TID RDMA READ. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic && + (wqe->wr.opcode != IB_WR_TID_RDMA_READ || + priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { + qp->s_flags |= RVT_S_WAIT_FENCE; + goto bail; + } + /* + * Local operations are processed immediately + * after all prior requests have completed + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (++qp->s_tail == qp->s_size) + qp->s_tail = 0; + if (!(wqe->wr.send_flags & + RVT_SEND_COMPLETION_ONLY)) { + err = rvt_invalidate_rkey( + qp, + wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } + rvt_send_complete(qp, wqe, + err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + if (local_ops) + atomic_dec(&qp->local_ops_pending); + goto done_free_tx; + } + + newreq = 1; + qp->s_psn = wqe->psn; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = mask_psn(qp->s_psn); + + /* + * Interlock between various IB requests and TID RDMA + * if necessary. + */ + if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_wqe_interlock(qp, wqe)) + goto bail; + + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + /* If no credit, return. */ + if (!rvt_rc_credit_avail(qp, wqe)) + goto bail; + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_ONLY); + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } else { + qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); + /* Invalidate rkey comes after the BTH */ + ohdr->u.ieth = cpu_to_be32( + wqe->wr.ex.invalidate_rkey); + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + goto no_flow_control; + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (!rvt_rc_credit_avail(qp, wqe)) + goto bail; +no_flow_control: + put_ib_reth_vaddr( + wqe->rdma_wr.remote_addr, + &ohdr->u.rc.reth); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_ONLY); + } else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. + */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + trace_hfi1_tid_write_sender_make_req(qp, newreq); + trace_hfi1_tid_req_make_req_write(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + put_ib_reth_vaddr( + wqe->rdma_wr.remote_addr, + &ohdr->u.rc.reth); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_TID_RDMA_READ: + trace_hfi1_tid_read_sender_make_req(qp, newreq); + wpriv = wqe->priv; + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_req_read(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); + delta = cmp_psn(qp->s_psn, wqe->psn); + + /* + * Don't allow more operations to be started + * than the QP limits allow. We could get here under + * three conditions; (1) It's a new request; (2) We are + * sending the second or later segment of a request, + * but the qp->s_state is set to OP(RDMA_READ_REQUEST) + * when the last segment of a previous request is + * received just before this; (3) We are re-sending a + * request. + */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + if (newreq) { + struct tid_rdma_flow *flow = + &req->flows[req->setup_head]; + + /* + * Set up s_sge as it is needed for TID + * allocation. However, if the pages have been + * walked and mapped, skip it. An earlier try + * has failed to allocate the TID entries. + */ + if (!flow->npagesets) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + req->isge = 0; + req->clear_tail = req->setup_head; + req->flow_idx = req->setup_head; + req->state = TID_REQUEST_ACTIVE; + } + } else if (delta == 0) { + /* Re-send a request */ + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_pending = 0; + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + } + req->s_next_psn = qp->s_psn; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + fallthrough; + case IB_WR_OPFN: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_OPFN) { + qp->s_state = OP(COMPARE_SWAP); + put_ib_ateth_swap(wqe->atomic_wr.swap, + &ohdr->u.atomic_eth); + put_ib_ateth_compare(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); + } else { + qp->s_state = OP(FETCH_ADD); + put_ib_ateth_swap(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); + put_ib_ateth_compare(0, &ohdr->u.atomic_eth); + } + put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, + &ohdr->u.atomic_eth); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->atomic_wr.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + } + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + qp->s_psn = wqe->lpsn + 1; + else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + qp->s_psn = req->s_next_psn; + else + qp->s_psn++; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + fallthrough; + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + fallthrough; + case OP(SEND_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_LAST); + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } else { + qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); + /* invalidate data comes after the BTH */ + ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + fallthrough; + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + fallthrough; + case OP(RDMA_WRITE_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_LAST); + } else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu; + put_ib_reth_vaddr( + wqe->rdma_wr.remote_addr + len, + &ohdr->u.rc.reth); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). + */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + break; + + case TID_OP(READ_RESP): + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + goto bail; + /* This is used to restart a TID read request */ + req = wqe_to_tid_req(wqe); + wpriv = wqe->priv; + /* + * Back down. The field qp->s_psn has been set to the psn with + * which the request should be restart. It's OK to use division + * as this is on the retry path. + */ + req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps; + + /* + * The following function need to be redefined to return the + * status to make sure that we find the flow. At the same + * time, we can use the req->state change to check if the + * call succeeds or not. + */ + req->state = TID_REQUEST_RESEND; + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + if (req->state != TID_REQUEST_ACTIVE) { + /* + * Failed to find the flow. Release all allocated tid + * resources. + */ + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + + hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR); + goto bail; + } + req->state = TID_REQUEST_RESEND; + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + flow = &req->flows[req->flow_idx]; + len -= flow->sent; + req->s_next_psn = flow->flow_state.ib_lpsn + 1; + delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + break; + case TID_OP(READ_REQ): + req = wqe_to_tid_req(wqe); + delta = cmp_psn(qp->s_psn, wqe->psn); + /* + * If the current WR is not TID RDMA READ, or this is the start + * of a new request, we need to change the qp->s_state so that + * the request can be set up properly. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || + qp->s_cur == qp->s_tail) { + qp->s_state = OP(RDMA_READ_REQUEST); + if (delta == 0 || qp->s_cur == qp->s_tail) + goto check_s_state; + else + goto bail; + } + + /* Rate limiting */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + + wpriv = wqe->priv; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + break; + } + qp->s_sending_hpsn = bth2; + delta = delta_psn(bth2, wqe->psn); + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & RVT_S_SEND_ONE) { + qp->s_flags &= ~RVT_S_SEND_ONE; + qp->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = ss; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header( + qp, + ohdr, + bth0 | (qp->s_state << 24), + bth1, + bth2, + middle, + ps); + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again. Set the flags to indicate which work item to wake + * up. + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + return 0; +} + +static inline void hfi1_make_bth_aeth(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1) +{ + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->r_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); +} + +static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto unlock; + ibp = rcd_to_iport(packet->rcd); + this_cpu_inc(*ibp->rvp.rc_qacks); + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + if (is_fecn) + qp->s_flags |= RVT_S_ECN; + + /* Schedule the send tasklet. */ + hfi1_schedule_send(qp); +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_header *hdr = &opa_hdr->ibh; + struct ib_other_headers *ohdr; + u16 lrh0 = HFI1_LRH_BTH; + u16 pkey; + u32 bth0, bth1; + + opa_hdr->hdr_type = HFI1_PKT_TYPE_9B; + ohdr = &hdr->u.oth; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ + *hwords = 6; + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + *hwords - 2, SIZE_OF_CRC); + ohdr = &hdr->u.l.oth; + lrh0 = HFI1_LRH_GRH; + } + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); + + /* read pkey_index w/o lock (its atomic) */ + pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + + lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT | + (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) << + IB_SL_SHIFT; + + hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), + ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr)); + + bth0 = pkey | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + bth1 = (!!is_fecn) << IB_BECN_SHIFT; + /* + * Inline ACKs go out without the use of the Verbs send engine, so + * we need to set the STL Verbs Extended bit here + */ + bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; + hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); +} + +static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_16b_header *hdr = &opa_hdr->opah; + struct ib_other_headers *ohdr; + u32 bth0, bth1 = 0; + u16 len, pkey; + bool becn = is_fecn; + u8 l4 = OPA_16B_L4_IB_LOCAL; + u8 extra_bytes; + + opa_hdr->hdr_type = HFI1_PKT_TYPE_16B; + ohdr = &hdr->u.oth; + /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */ + *hwords = 8; + extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0); + *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2); + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { + *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + *hwords - 4, *nwords); + ohdr = &hdr->u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } + *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + + /* read pkey_index w/o lock (its atomic) */ + pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + + /* Convert dwords to flits */ + len = (*hwords + *nwords) >> 1; + + hfi1_make_16b_hdr(hdr, ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)), + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), + 16B), len, pkey, becn, 0, l4, sc5); + + bth0 = pkey | (OP(ACKNOWLEDGE) << 24); + bth0 |= extra_bytes << 20; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth1 = OPA_BTH_MIG_REQ; + hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); +} + +typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B +}; + +/* + * hfi1_send_rc_ack - Construct an ACK packet and send it + * + * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). + * Note that RDMA reads and atomics are handled in the + * send side QP state and send engine. + */ +void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + u64 pbc, pbc_flags = 0; + u32 hwords = 0; + u32 nwords = 0; + u32 plen; + struct pio_buf *pbuf; + struct hfi1_opa_header opa_hdr; + + /* clear the defer count */ + qp->r_adefered = 0; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->s_flags & RVT_S_RESP_PENDING) { + hfi1_queue_rc_ack(packet, is_fecn); + return; + } + + /* Ensure s_rdma_ack_cnt changes are committed */ + if (qp->s_rdma_ack_cnt) { + hfi1_queue_rc_ack(packet, is_fecn); + return; + } + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (driver_lstate(ppd) != IB_PORT_ACTIVE) + return; + + /* Make the appropriate header */ + hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn, + &pbc_flags, &hwords, &nwords); + + plen = 2 /* PBC */ + hwords + nwords; + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, + sc_to_vlt(ppd->dd, sc5), plen); + pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL); + if (IS_ERR_OR_NULL(pbuf)) { + /* + * We have no room to send at the moment. Pass + * responsibility for sending the ACK to the send engine + * so that when enough buffer space becomes available, + * the ACK is sent ahead of other outgoing packets. + */ + hfi1_queue_rc_ack(packet, is_fecn); + return; + } + trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &opa_hdr, ib_is_sc5(sc5)); + + /* write the pbc and data */ + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + (priv->hdr_type == HFI1_PKT_TYPE_9B ? + (void *)&opa_hdr.ibh : + (void *)&opa_hdr.opah), hwords); + return; +} + +/** + * update_num_rd_atomic - update the qp->s_num_rd_atomic + * @qp: the QP + * @psn: the packet sequence number to restart at + * @wqe: the wqe + * + * This is called from reset_psn() to update qp->s_num_rd_atomic + * for the current wqe. + * Called at interrupt level with the QP s_lock held. + */ +static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, + struct rvt_swqe *wqe) +{ + u32 opcode = wqe->wr.opcode; + + if (opcode == IB_WR_RDMA_READ || + opcode == IB_WR_ATOMIC_CMP_AND_SWP || + opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + qp->s_num_rd_atomic++; + } else if (opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct hfi1_qp_priv *priv = qp->priv; + + if (cmp_psn(psn, wqe->lpsn) <= 0) { + u32 cur_seg; + + cur_seg = (psn - wqe->psn) / priv->pkts_ps; + req->ack_pending = cur_seg - req->comp_seg; + priv->pending_tid_r_segs += req->ack_pending; + qp->s_num_rd_atomic += req->ack_pending; + trace_hfi1_tid_req_update_num_rd_atomic(qp, 0, + wqe->wr.opcode, + wqe->psn, + wqe->lpsn, + req); + } else { + priv->pending_tid_r_segs += req->total_segs; + qp->s_num_rd_atomic += req->total_segs; + } + } +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from hfi1_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct rvt_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); + u32 opcode; + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + qp->s_cur = n; + priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; + qp->s_num_rd_atomic = 0; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (cmp_psn(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + update_num_rd_atomic(qp, psn, wqe); + + /* Find the work request opcode corresponding to the given PSN. */ + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = rvt_get_swqe_ptr(qp, n); + diff = cmp_psn(psn, wqe->psn); + if (diff < 0) { + /* Point wqe back to the previous one*/ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + break; + } + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + update_num_rd_atomic(qp, psn, wqe); + } + opcode = wqe->wr.opcode; + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See hfi1_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + case IB_WR_TID_RDMA_READ: + qp->s_state = TID_OP(READ_RESP); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; + qp->s_psn = psn; + /* + * Set RVT_S_WAIT_PSN as rc_complete() may start the timer + * asynchronously before the send engine can get scheduled. + * Doing it in hfi1_make_rc_req() is too late. + */ + if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= RVT_S_WAIT_PSN; + qp->s_flags &= ~HFI1_S_AHG_VALID; + trace_hfi1_sender_reset_psn(qp); +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. + */ +void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + struct hfi1_ibport *ibp; + + lockdep_assert_held(&qp->r_lock); + lockdep_assert_held(&qp->s_lock); + trace_hfi1_sender_restart_rc(qp); + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + hfi1_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + /* + * We need special handling for the OPFN request WQEs as + * they are not allowed to generate real user errors + */ + if (wqe->wr.opcode == IB_WR_OPFN) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + /* + * Call opfn_conn_reply() with capcode and + * remaining data as 0 to close out the + * current request + */ + opfn_conn_reply(qp, priv->opfn.curr); + wqe = do_rc_completion(qp, wqe, ibp); + qp->s_flags &= ~RVT_S_WAIT_ACK; + } else { + trace_hfi1_tid_write_sender_restart_rc(qp, 0); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req; + + req = wqe_to_tid_req(wqe); + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + } + + hfi1_trdma_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + return; + } else { /* need to handle delayed completion */ + return; + } + } else { + qp->s_retry--; + } + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ) + ibp->rvp.n_rc_resends++; + else + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); + + qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | + RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); + if (wait) + qp->s_flags |= RVT_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. + */ +static void reset_sending_psn(struct rvt_qp *qp, u32 psn) +{ + struct rvt_swqe *wqe; + u32 n = qp->s_last; + + lockdep_assert_held(&qp->s_lock); + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = rvt_get_swqe_ptr(qp, n); + if (cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/** + * hfi1_rc_verbs_aborted - handle abort status + * @qp: the QP + * @opah: the opa header + * + * This code modifies both ACK bit in BTH[2] + * and the s_flags to go into send one mode. + * + * This serves to throttle the send engine to only + * send a single packet in the likely case the + * a link has gone down. + */ +void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah) +{ + struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah); + u8 opcode = ib_bth_get_opcode(ohdr); + u32 psn; + + /* ignore responses */ + if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) || + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) + return; + + psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK; + ohdr->bth[2] = cpu_to_be32(psn); + qp->s_flags |= RVT_S_SEND_ONE; +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. + */ +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) +{ + struct ib_other_headers *ohdr; + struct hfi1_qp_priv *priv = qp->priv; + struct rvt_swqe *wqe; + u32 opcode, head, tail; + u32 psn; + struct tid_rdma_request *req; + + lockdep_assert_held(&qp->s_lock); + if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) + return; + + ohdr = hfi1_get_rc_ohdr(opah); + opcode = ib_bth_get_opcode(ohdr); + if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) || + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = ib_bth_get_psn(ohdr); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) { + if (tail == 0) + tail = qp->s_size - 1; + else + tail -= 1; + } + } else { + head = qp->s_tail; + tail = qp->s_acked; + } + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && + !(qp->s_flags & + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + if (opcode == TID_OP(READ_REQ)) + rvt_add_retry_timer_ext(qp, priv->timeout_shift); + else + rvt_add_retry_timer(qp); + } + + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } + + while (qp->s_last != qp->s_acked) { + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + trdma_clean_swqe(qp, wqe); + trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); + rvt_qp_complete_swqe(qp, + wqe, + ib_hfi1_wc_opcode[wqe->wr.opcode], + IB_WC_SUCCESS); + } + /* + * If we were waiting for sends to complete before re-sending, + * and they are now complete, restart sending. + */ + trace_hfi1_sendcomplete(qp, psn); + if (qp->s_flags & RVT_S_WAIT_PSN && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~RVT_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + hfi1_schedule_send(qp); + } +} + +static inline void update_last_psn(struct rvt_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. + * This is similar to hfi1_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + trace_hfi1_rc_completion(qp, wqe->lpsn); + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + trdma_clean_swqe(qp, wqe); + trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); + rvt_qp_complete_swqe(qp, + wqe, + ib_hfi1_wc_opcode[wqe->wr.opcode], + IB_WC_SUCCESS); + } else { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + this_cpu_inc(*ibp->rvp.rc_delayed_comp); + /* + * If send progress not running attempt to progress + * SDMA queue. + */ + if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { + struct sdma_engine *engine; + u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr); + u8 sc5; + + /* For now use sc to find engine */ + sc5 = ibp->sl_to_sc[sl]; + engine = qp_to_sdma_engine(qp, sc5); + sdma_engine_progress_schedule(engine); + } + } + + qp->s_retry = qp->s_retry_cnt; + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop re-sending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + } + if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; + hfi1_schedule_send(qp); + } + return wqe; +} + +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + +/* + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * May be called at interrupt level, with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp; + enum ib_wc_status status; + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + struct rvt_dev_info *rdi; + + lockdep_assert_held(&qp->s_lock); + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> IB_AETH_NAK_SHIFT) + ack_psn--; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + ibp = rcd_to_iport(rcd); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail_stop; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_READ && + (opcode != TID_OP(READ_RESP) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail_stop; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (wqe->wr.opcode == IB_WR_OPFN) + opfn_conn_reply(qp, val); + + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); + trace_hfi1_sender_do_rc_ack(qp); + switch (aeth >> IB_AETH_NAK_SHIFT) { + case 0: /* ACK */ + this_cpu_inc(*ibp->rvp.rc_acks); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + if (wqe_to_tid_req(wqe)->ack_pending) + rvt_mod_retry_timer_ext(qp, + qpriv->timeout_shift); + else + rvt_stop_rc_timers(qp); + } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + + /* + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. + */ + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } + } else { + /* No more acks - kill all timers */ + rvt_stop_rc_timers(qp); + if (cmp_psn(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + } + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + rvt_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; + update_last_psn(qp, psn); + return 1; + + case 1: /* RNR NAK */ + ibp->rvp.n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail_stop; + if (qp->s_flags & RVT_S_WAIT_RNR) + goto bail_stop; + rdi = ib_to_rvt(qp->ibqp.device); + if (!(rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT)) { + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) + qp->s_rnr_retry--; + } + + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } + + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); + qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); + rvt_stop_rc_timers(qp); + rvt_add_rnr_timer(qp, aeth); + return 0; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail_stop; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->rvp.n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + hfi1_restart_rc(qp, psn, 0); + hfi1_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->rvp.n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + hfi1_kern_read_tid_flow_free(qp); + + hfi1_trdma_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail_stop; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail_stop; + } + /* cannot be reached */ +bail_stop: + rvt_stop_rc_timers(qp); + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, + struct hfi1_ctxtdata *rcd) +{ + struct rvt_swqe *wqe; + + lockdep_assert_held(&qp->s_lock); + /* Remove QP from retry timer */ + rvt_stop_rc_timers(qp); + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + while (cmp_psn(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->rvp.n_rdma_seq++; + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * rc_rcv_resp - process an incoming RC response packet + * @packet: data packet information + * + * This is called from hfi1_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void rc_rcv_resp(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp; + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u64 val; + u32 aeth; + u32 psn = ib_bth_get_psn(packet->ohdr); + u32 pmtu = qp->pmtu; + u16 hdrsize = packet->hlen; + u8 opcode = packet->opcode; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + spin_lock_irqsave(&qp->s_lock, flags); + trace_hfi1_ack(qp, psn); + + /* Ignore invalid responses. */ + if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = cmp_psn(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> IB_AETH_NAK_SHIFT) == 0) + rvt_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) + val = ib_u64_get(&ohdr->u.at.atomic_ack_eth); + else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + rvt_mod_retry_timer(qp); + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + extra_bytes))) + goto ack_len_err; +read_last: + tlen -= hdrsize + extra_bytes; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void)do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + ibp = rcd_to_iport(rcd); + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + rvt_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +static inline void rc_cancel_ack(struct rvt_qp *qp) +{ + qp->r_adefered = 0; + if (list_empty(&qp->rspwait)) + return; + list_del_init(&qp->rspwait); + qp->r_flags &= ~RVT_R_RSP_NAK; + rvt_put_qp(qp); +} + +/** + * rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * @rcd: the receive context + * + * This is called from hfi1_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, + struct rvt_qp *qp, u32 opcode, u32 psn, + int diff, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct rvt_ack_entry *e; + unsigned long flags; + u8 prev; + u8 mra; /* most recent ACK */ + bool old_req; + + trace_hfi1_rcv_error(qp, psn); + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + rc_defered_ack(rcd, qp); + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. + */ + e = NULL; + old_req = true; + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + e = find_prev_entry(qp, psn, &prev, &mra, &old_req); + + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = delta_psn(psn, e->psn) * qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + release_rdma_sge_mr(e); + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = get_ib_reth_vaddr(reth); + int ok; + + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send engine is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8)opcode || old_req) + goto unlock_done; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = prev; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (mra == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = mra; + qp->s_tail_ack_queue = mra; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= RVT_S_RESP_PENDING; + qp->r_nak_state = 0; + hfi1_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, + u32 lqpn, u32 rqpn, u8 svc_type) +{ + struct opa_hfi1_cong_log_event_internal *cc_event; + unsigned long flags; + + if (sl >= OPA_MAX_SLS) + return; + + spin_lock_irqsave(&ppd->cc_log_lock, flags); + + ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); + ppd->threshold_event_counter++; + + cc_event = &ppd->cc_events[ppd->cc_log_idx++]; + if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_log_idx = 0; + cc_event->lqpn = lqpn & RVT_QPN_MASK; + cc_event->rqpn = rqpn & RVT_QPN_MASK; + cc_event->sl = sl; + cc_event->svc_type = svc_type; + cc_event->rlid = rlid; + /* keep timestamp in units of 1.024 usec */ + cc_event->timestamp = ktime_get_ns() / 1024; + + spin_unlock_irqrestore(&ppd->cc_log_lock, flags); +} + +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, + u32 rqpn, u8 svc_type) +{ + struct cca_timer *cca_timer; + u16 ccti, ccti_incr, ccti_timer, ccti_limit; + u8 trigger_threshold; + struct cc_state *cc_state; + unsigned long flags; + + if (sl >= OPA_MAX_SLS) + return; + + cc_state = get_cc_state(ppd); + + if (!cc_state) + return; + + /* + * 1) increase CCTI (for this SL) + * 2) select IPG (i.e., call set_link_ipg()) + * 3) start timer + */ + ccti_limit = cc_state->cct.ccti_limit; + ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + trigger_threshold = + cc_state->cong_setting.entries[sl].trigger_threshold; + + spin_lock_irqsave(&ppd->cca_timer_lock, flags); + + cca_timer = &ppd->cca_timer[sl]; + if (cca_timer->ccti < ccti_limit) { + if (cca_timer->ccti + ccti_incr <= ccti_limit) + cca_timer->ccti += ccti_incr; + else + cca_timer->ccti = ccti_limit; + set_link_ipg(ppd); + } + + ccti = cca_timer->ccti; + + if (!hrtimer_active(&cca_timer->hrtimer)) { + /* ccti_timer is in units of 1.024 usec */ + unsigned long nsec = 1024 * ccti_timer; + + hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), + HRTIMER_MODE_REL_PINNED); + } + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + + if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) + log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); +} + +/** + * hfi1_rc_rcv - process an incoming RC packet + * @packet: data packet information + * + * This is called from qp_rcv() to process an incoming RC packet + * for the given QP. + * May be called at interrupt level. + */ +void hfi1_rc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct ib_other_headers *ohdr = packet->ohdr; + u32 opcode = packet->opcode; + u32 hdrsize = packet->hlen; + u32 psn = ib_bth_get_psn(packet->ohdr); + u32 pad = packet->pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + int ret; + bool copy_last = false, fecn; + u32 rkey; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + lockdep_assert_held(&qp->r_lock); + + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + fecn = process_ecn(qp, packet); + opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + rc_rcv_resp(packet); + return; + } + + /* Compute 24 bits worth of difference. */ + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + fallthrough; + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + /* + * There will be no padding for 9B packet but 16B packets + * will come in with some padding since we always add + * CRC and LT bytes which will need to be flit aligned + */ + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + case OP(SEND_ONLY_WITH_INVALIDATE): + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) + goto send_last_inv; + fallthrough; /* for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST_WITH_INVALIDATE): +send_last_inv: + rkey = be32_to_cpu(ohdr->u.ieth); + if (rvt_invalidate_rkey(qp, rkey)) + goto no_immediate_data; + wc.ex.invalidate_rkey = rkey; + wc.wc_flags = IB_WC_WITH_INVALIDATE; + goto send_last; + case OP(RDMA_WRITE_LAST): + copy_last = rvt_is_user_qp(qp); + fallthrough; + case OP(SEND_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto nack_inv; + /* Don't count the CRC(and padding and LT byte for 16B). */ + tlen -= (hdrsize + extra_bytes); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); + rvt_put_ss(&qp->r_sge); + qp->r_msn++; + if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); + break; + + case OP(RDMA_WRITE_ONLY): + copy_last = rvt_is_user_qp(qp); + fallthrough; + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = get_ib_reth_vaddr(reth); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto nack_op_err; + if (!ret) { + /* peer will send again */ + rvt_put_ss(&qp->r_sge); + goto rnr_nak; + } + wc.ex.imm_data = ohdr->u.rc.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct rvt_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_acked_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + release_rdma_sge_mr(e); + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = get_ib_reth_vaddr(reth); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + qp->r_psn += rvt_div_mtu(qp, len - 1); + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; + + /* Schedule the send engine. */ + qp->s_flags |= RVT_S_RESP_PENDING; + if (fecn) + qp->s_flags |= RVT_S_ECN; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; + u64 vaddr = get_ib_ateth_vaddr(ateth); + bool opfn = opcode == OP(COMPARE_SWAP) && + vaddr == HFI1_VERBS_E_ATOMIC_VADDR; + struct rvt_ack_entry *e; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !opfn)) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_acked_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + release_rdma_sge_mr(e); + /* Process OPFN special virtual address */ + if (opfn) { + opfn_conn_response(qp, e, ateth); + goto ack; + } + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = get_ib_ateth_swap(ateth); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + get_ib_ateth_compare(ateth), + sdata); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; +ack: + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; + + /* Schedule the send engine. */ + qp->s_flags |= RVT_S_RESP_PENDING; + if (fecn) + qp->s_flags |= RVT_S_ECN; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & IB_BTH_REQ_ACK || fecn) { + if (packet->numpkt == 0 || fecn || + qp->r_adefered >= HFI1_PSN_CREDIT) { + rc_cancel_ack(qp); + goto send_ack; + } + qp->r_adefered++; + rc_defered_ack(rcd, qp); + } + return; + +rnr_nak: + qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_op_err: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, fecn); +} + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, + struct rvt_qp *qp) +{ + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + int diff; + u32 opcode; + u32 psn; + + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + psn = ib_bth_get_psn(packet->ohdr); + opcode = ib_bth_get_opcode(packet->ohdr); + + /* Only deal with RDMA Writes for now */ + if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = delta_psn(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + rc_defered_ack(rcd, qp); + } /* Out of sequence NAK */ + } /* QP Request NAKs */ +} diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h new file mode 100644 index 0000000000..5ed5e85d58 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ + +#ifndef HFI1_RC_H +#define HFI1_RC_H + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) +{ + unsigned int next; + + next = n + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_acked_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + return rvt_restart_sge(ss, wqe, len); +} + +static inline void release_rdma_sge_mr(struct rvt_ack_entry *e) +{ + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } +} + +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled); +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, + struct hfi1_ctxtdata *rcd); +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + +#endif /* HFI1_RC_H */ diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c new file mode 100644 index 0000000000..b0151b7293 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#include <linux/spinlock.h> + +#include "hfi.h" +#include "mad.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "trace.h" + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the hfi1_migrate_qp() call. + */ +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) +{ + __be64 guid; + unsigned long flags; + struct rvt_qp *qp = packet->qp; + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + u32 dlid = packet->dlid; + u32 slid = packet->slid; + u32 sl = packet->sl; + bool migrated = packet->migrated; + u16 pkey = packet->pkey; + + if (qp->s_mig_state == IB_MIG_ARMED && migrated) { + if (!packet->grh) { + if ((rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH) && + (packet->etype != RHF_RCV_TYPE_BYPASS)) + return 1; + } else { + const struct ib_global_route *grh; + + if (!(rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH)) + return 1; + grh = rdma_ah_read_grh(&qp->alt_ah_attr); + guid = get_sguid(ibp, grh->sgid_index); + if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix, + guid)) + return 1; + if (!gid_ok( + &packet->grh->sgid, + grh->dgid.global.subnet_prefix, + grh->dgid.global.interface_id)) + return 1; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), pkey, + sc5, slid))) { + hfi1_bad_pkey(ibp, pkey, sl, 0, qp->ibqp.qp_num, + slid, dlid); + return 1; + } + /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ + if (slid != rdma_ah_get_dlid(&qp->alt_ah_attr) || + ppd_from_ibp(ibp)->port != + rdma_ah_get_port_num(&qp->alt_ah_attr)) + return 1; + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!packet->grh) { + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH) && + (packet->etype != RHF_RCV_TYPE_BYPASS)) + return 1; + } else { + const struct ib_global_route *grh; + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH)) + return 1; + grh = rdma_ah_read_grh(&qp->remote_ah_attr); + guid = get_sguid(ibp, grh->sgid_index); + if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix, + guid)) + return 1; + if (!gid_ok( + &packet->grh->sgid, + grh->dgid.global.subnet_prefix, + grh->dgid.global.interface_id)) + return 1; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), pkey, + sc5, slid))) { + hfi1_bad_pkey(ibp, pkey, sl, 0, qp->ibqp.qp_num, + slid, dlid); + return 1; + } + /* Validate the SLID. See Ch. 9.6.1.5 */ + if ((slid != rdma_ah_get_dlid(&qp->remote_ah_attr)) || + ppd_from_ibp(ibp)->port != qp->port_num) + return 1; + if (qp->s_mig_state == IB_MIG_REARM && !migrated) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; +} + +/** + * hfi1_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: size of header after grh being sent in dwords + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + const struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords + nwords) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + hdr->sgid.global.interface_id = + grh->sgid_index < HFI1_GUIDS_PER_PORT ? + get_sguid(ibp, grh->sgid_index) : + get_sguid(ibp, HFI1_PORT_GUID_INDEX); + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, \ + hdr.ibh.u.oth.bth[2]) / 4) + +/** + * build_ahg - create ahg in s_ahg + * @qp: a pointer to QP + * @npsn: the next PSN for the request/response + * + * This routine handles the AHG by allocating an ahg entry and causing the + * copy of the first middle. + * + * Subsequent middles use the copied entry, editing the + * PSN with 1 or 2 edits. + */ +static inline void build_ahg(struct rvt_qp *qp, u32 npsn) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR)) + clear_ahg(qp); + if (!(qp->s_flags & HFI1_S_AHG_VALID)) { + /* first middle that needs copy */ + if (qp->s_ahgidx < 0) + qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); + if (qp->s_ahgidx >= 0) { + qp->s_ahgpsn = npsn; + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + /* save to protect a change in another thread */ + priv->s_ahg->ahgidx = qp->s_ahgidx; + qp->s_flags |= HFI1_S_AHG_VALID; + } + } else { + /* subsequent middle after valid */ + if (qp->s_ahgidx >= 0) { + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG; + priv->s_ahg->ahgidx = qp->s_ahgidx; + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[0] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16((u16)npsn), + BTH2_OFFSET, + 16, + 16); + if ((npsn & 0xffff0000) != + (qp->s_ahgpsn & 0xffff0000)) { + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[1] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16( + (u16)(npsn >> 16)), + BTH2_OFFSET, + 0, + 16); + } + } + } +} + +static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2) +{ + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(bth1); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/** + * hfi1_make_ruc_header_16B - build a 16B header + * @qp: the queue pair + * @ohdr: a pointer to the destination header memory + * @bth0: bth0 passed in from the RC/UC builder + * @bth1: bth1 passed in from the RC/UC builder + * @bth2: bth2 passed in from the RC/UC builder + * @middle: non zero implies indicates ahg "could" be used + * @ps: the current packet state + * + * This routine may disarm ahg under these situations: + * - packet needs a GRH + * - BECN needed + * - migration state not IB_MIG_MIGRATED + */ +static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2, + int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 slid; + u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + u8 l4 = OPA_16B_L4_IB_LOCAL; + u8 extra_bytes = hfi1_get_16b_padding( + (ps->s_txreq->hdr_dwords << 2), + ps->s_txreq->s_cur_size); + u32 nwords = SIZE_OF_CRC + ((ps->s_txreq->s_cur_size + + extra_bytes + SIZE_OF_LT) >> 2); + bool becn = false; + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = + rdma_ah_retrieve_grh(&qp->remote_ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) + grd->sgid_index = 0; + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + l4 = OPA_16B_L4_IB_GLOBAL; + ps->s_txreq->hdr_dwords += + hfi1_make_grh(ibp, grh, grd, + ps->s_txreq->hdr_dwords - LRH_16B_DWORDS, + nwords); + middle = 0; + } + + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth1 |= OPA_BTH_MIG_REQ; + else + middle = 0; + + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; + /* we recently received a FECN, so return a BECN */ + becn = true; + middle = 0; + } + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~HFI1_S_AHG_VALID; + + bth0 |= pkey; + bth0 |= extra_bytes << 20; + hfi1_make_ruc_bth(qp, ohdr, bth0, bth1, bth2); + + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), + 16B), + (ps->s_txreq->hdr_dwords + nwords) >> 1, + pkey, becn, 0, l4, priv->s_sc); +} + +/** + * hfi1_make_ruc_header_9B - build a 9B header + * @qp: the queue pair + * @ohdr: a pointer to the destination header memory + * @bth0: bth0 passed in from the RC/UC builder + * @bth1: bth1 passed in from the RC/UC builder + * @bth2: bth2 passed in from the RC/UC builder + * @middle: non zero implies indicates ahg "could" be used + * @ps: the current packet state + * + * This routine may disarm ahg under these situations: + * - packet needs a GRH + * - BECN needed + * - migration state not IB_MIG_MIGRATED + */ +static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2, + int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; + u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + u16 lrh0 = HFI1_LRH_BTH; + u8 extra_bytes = -ps->s_txreq->s_cur_size & 3; + u32 nwords = SIZE_OF_CRC + ((ps->s_txreq->s_cur_size + + extra_bytes) >> 2); + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + struct ib_grh *grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + + lrh0 = HFI1_LRH_GRH; + ps->s_txreq->hdr_dwords += + hfi1_make_grh(ibp, grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + ps->s_txreq->hdr_dwords - LRH_9B_DWORDS, + nwords); + middle = 0; + } + lrh0 |= (priv->s_sc & 0xf) << 12 | + (rdma_ah_get_sl(&qp->remote_ah_attr) & 0xf) << 4; + + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + else + middle = 0; + + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; + /* we recently received a FECN, so return a BECN */ + bth1 |= (IB_BECN_MASK << IB_BECN_SHIFT); + middle = 0; + } + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~HFI1_S_AHG_VALID; + + bth0 |= pkey; + bth0 |= extra_bytes << 20; + hfi1_make_ruc_bth(qp, ohdr, bth0, bth1, bth2); + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, + ps->s_txreq->hdr_dwords + nwords, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), + ppd_from_ibp(ibp)->lid | + rdma_ah_get_path_bits(&qp->remote_ah_attr)); +} + +typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2, int middle, + struct hfi1_pkt_state *ps); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ruc_header_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ruc_header_16B +}; + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2, int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + + /* + * reset s_ahg/AHG fields + * + * This insures that the ahgentry/ahgcount + * are at a non-AHG default to protect + * build_verbs_tx_desc() from using + * an include ahgidx. + * + * build_ahg() will modify as appropriate + * to use the AHG feature. + */ + priv->s_ahg->tx_flags = 0; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + + /* Make the appropriate header */ + hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle, + ps); +} + +/* when sending, force a reschedule every one of these periods */ +#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ + +/** + * hfi1_schedule_send_yield - test for a yield required for QP + * send engine + * @qp: a pointer to QP + * @ps: a pointer to a structure with commonly lookup values for + * the send engine progress + * @tid: true if it is the tid leg + * + * This routine checks if the time slice for the QP has expired + * for RC QPs, if so an additional work entry is queued. At this + * point, other QPs have an opportunity to be scheduled. It + * returns true if a yield is required, otherwise, false + * is returned. + */ +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid) +{ + ps->pkts_sent = true; + + if (unlikely(time_after(jiffies, ps->timeout))) { + if (!ps->in_thread || + workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) { + spin_lock_irqsave(&qp->s_lock, ps->flags); + if (!tid) { + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + } else { + struct hfi1_qp_priv *priv = qp->priv; + + if (priv->s_flags & + HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= + ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + } else { + priv->s_flags &= ~RVT_S_BUSY; + } + hfi1_schedule_tid_send(qp); + } + + spin_unlock_irqrestore(&qp->s_lock, ps->flags); + this_cpu_inc(*ps->ppd->dd->send_schedule); + trace_hfi1_rc_expired_time_slice(qp, true); + return true; + } + + cond_resched(); + this_cpu_inc(*ps->ppd->dd->send_schedule); + ps->timeout = jiffies + ps->timeout_int; + } + + trace_hfi1_rc_expired_time_slice(qp, false); + return false; +} + +void hfi1_do_send_from_rvt(struct rvt_qp *qp) +{ + hfi1_do_send(qp, false); +} + +void _hfi1_do_send(struct work_struct *work) +{ + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); + + hfi1_do_send(qp, true); +} + +/** + * hfi1_do_send - perform a send on a QP + * @qp: a pointer to the QP + * @in_thread: true if in a workqueue thread + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP. + * Otherwise, two threads could send packets out of order. + */ +void hfi1_do_send(struct rvt_qp *qp, bool in_thread) +{ + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; + int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + + ps.dev = to_idev(qp->ibqp.device); + ps.ibp = to_iport(qp->ibqp.device, qp->port_num); + ps.ppd = ppd_from_ibp(ps.ibp); + ps.in_thread = in_thread; + ps.wait = iowait_get_ib_work(&priv->s_iowait); + + trace_hfi1_rc_do_send(qp, in_thread); + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & + ~((1 << ps.ppd->lmc) - 1)) == + ps.ppd->lid)) { + rvt_ruc_loopback(qp); + return; + } + make_req = hfi1_make_rc_req; + ps.timeout_int = qp->timeout_jiffies; + break; + case IB_QPT_UC: + if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & + ~((1 << ps.ppd->lmc) - 1)) == + ps.ppd->lid)) { + rvt_ruc_loopback(qp); + return; + } + make_req = hfi1_make_uc_req; + ps.timeout_int = SEND_RESCHED_TIMEOUT; + break; + default: + make_req = hfi1_make_ud_req; + ps.timeout_int = SEND_RESCHED_TIMEOUT; + } + + spin_lock_irqsave(&qp->s_lock, ps.flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + return; + } + + qp->s_flags |= RVT_S_BUSY; + + ps.timeout_int = ps.timeout_int / 8; + ps.timeout = jiffies + ps.timeout_int; + ps.cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + ps.pkts_sent = false; + + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); + do { + /* Check for a constructed packet to be sent. */ + if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) + qp->s_flags |= RVT_S_BUSY; + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + /* + * If the packet cannot be sent now, return and + * the send engine will be woken up later. + */ + if (hfi1_verbs_send(qp, &ps)) + return; + + /* allow other tasks to run */ + if (hfi1_schedule_send_yield(qp, &ps, false)) + return; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + } + } while (make_req(qp, &ps)); + iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); +} diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c new file mode 100644 index 0000000000..26c6216275 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -0,0 +1,3381 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#include <linux/spinlock.h> +#include <linux/seqlock.h> +#include <linux/netdevice.h> +#include <linux/moduleparam.h> +#include <linux/bitops.h> +#include <linux/timer.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> + +#include "hfi.h" +#include "common.h" +#include "qp.h" +#include "sdma.h" +#include "iowait.h" +#include "trace.h" + +/* must be a power of 2 >= 64 <= 32768 */ +#define SDMA_DESCQ_CNT 2048 +#define SDMA_DESC_INTR 64 +#define INVALID_TAIL 0xffff +#define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(u32)) + +static uint sdma_descq_cnt = SDMA_DESCQ_CNT; +module_param(sdma_descq_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +static uint sdma_idle_cnt = 250; +module_param(sdma_idle_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)"); + +uint mod_num_sdma; +module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO); +MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use"); + +static uint sdma_desct_intr = SDMA_DESC_INTR; +module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt"); + +#define SDMA_WAIT_BATCH_SIZE 20 +/* max wait time for a SDMA engine to indicate it has halted */ +#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */ +/* all SDMA engine errors that cause a halt */ + +#define SD(name) SEND_DMA_##name +#define ALL_SDMA_ENG_HALT_ERRS \ + (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK)) + +/* sdma_sendctrl operations */ +#define SDMA_SENDCTRL_OP_ENABLE BIT(0) +#define SDMA_SENDCTRL_OP_INTENABLE BIT(1) +#define SDMA_SENDCTRL_OP_HALT BIT(2) +#define SDMA_SENDCTRL_OP_CLEANUP BIT(3) + +/* handle long defines */ +#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK +#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT + +static const char * const sdma_state_names[] = { + [sdma_state_s00_hw_down] = "s00_HwDown", + [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait", + [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait", + [sdma_state_s20_idle] = "s20_Idle", + [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait", + [sdma_state_s80_hw_freeze] = "s80_HwFreeze", + [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean", + [sdma_state_s99_running] = "s99_Running", +}; + +#ifdef CONFIG_SDMA_VERBOSITY +static const char * const sdma_event_names[] = { + [sdma_event_e00_go_hw_down] = "e00_GoHwDown", + [sdma_event_e10_go_hw_start] = "e10_GoHwStart", + [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone", + [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone", + [sdma_event_e30_go_running] = "e30_GoRunning", + [sdma_event_e40_sw_cleaned] = "e40_SwCleaned", + [sdma_event_e50_hw_cleaned] = "e50_HwCleaned", + [sdma_event_e60_hw_halted] = "e60_HwHalted", + [sdma_event_e70_go_idle] = "e70_GoIdle", + [sdma_event_e80_hw_freeze] = "e80_HwFreeze", + [sdma_event_e81_hw_frozen] = "e81_HwFrozen", + [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze", + [sdma_event_e85_link_down] = "e85_LinkDown", + [sdma_event_e90_sw_halted] = "e90_SwHalted", +}; +#endif + +static const struct sdma_set_state_action sdma_action_table[] = { + [sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s10_hw_start_up_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s15_hw_start_up_clean_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s20_idle] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s50_hw_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s60_idle_halt_wait] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s80_hw_freeze] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s82_freeze_sw_clean] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + .go_s99_running_totrue = 1, + }, +}; + +#define SDMA_TAIL_UPDATE_THRESH 0x1F + +/* declare all statics here rather than keep sorting */ +static void sdma_complete(struct kref *); +static void sdma_finalput(struct sdma_state *); +static void sdma_get(struct sdma_state *); +static void sdma_hw_clean_up_task(struct tasklet_struct *); +static void sdma_put(struct sdma_state *); +static void sdma_set_state(struct sdma_engine *, enum sdma_states); +static void sdma_start_hw_clean_up(struct sdma_engine *); +static void sdma_sw_clean_up_task(struct tasklet_struct *); +static void sdma_sendctrl(struct sdma_engine *, unsigned); +static void init_sdma_regs(struct sdma_engine *, u32, uint); +static void sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void __sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void dump_sdma_state(struct sdma_engine *sde); +static void sdma_make_progress(struct sdma_engine *sde, u64 status); +static void sdma_desc_avail(struct sdma_engine *sde, uint avail); +static void sdma_flush_descq(struct sdma_engine *sde); + +/** + * sdma_state_name() - return state string from enum + * @state: state + */ +static const char *sdma_state_name(enum sdma_states state) +{ + return sdma_state_names[state]; +} + +static void sdma_get(struct sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct sdma_state *ss = + container_of(kref, struct sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct sdma_state *ss) +{ + kref_put(&ss->kref, sdma_complete); +} + +static void sdma_finalput(struct sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +static inline void write_sde_csr( + struct sdma_engine *sde, + u32 offset0, + u64 value) +{ + write_kctxt_csr(sde->dd, sde->this_idx, offset0, value); +} + +static inline u64 read_sde_csr( + struct sdma_engine *sde, + u32 offset0) +{ + return read_kctxt_csr(sde->dd, sde->this_idx, offset0); +} + +/* + * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for + * sdma engine 'sde' to drop to 0. + */ +static void sdma_wait_for_packet_egress(struct sdma_engine *sde, + int pause) +{ + u64 off = 8 * sde->this_idx; + struct hfi1_devdata *dd = sde->dd; + int lcnt = 0; + u64 reg_prev; + u64 reg = 0; + + while (1) { + reg_prev = reg; + reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS); + + reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK; + reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT; + if (reg == 0) + break; + /* counter is reest if accupancy count changes */ + if (reg != reg_prev) + lcnt = 0; + if (lcnt++ > 500) { + /* timed out - bounce the link */ + dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sde->this_idx, (u32)reg); + queue_work(dd->pport->link_wq, + &dd->pport->link_bounce_work); + break; + } + udelay(1); + } +} + +/* + * sdma_wait() - wait for packet egress to complete for all SDMA engines, + * and pause for credit return. + */ +void sdma_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + sdma_wait_for_packet_egress(sde, 0); + } +} + +static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt) +{ + u64 reg; + + if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT)) + return; + reg = cnt; + reg &= SD(DESC_CNT_CNT_MASK); + reg <<= SD(DESC_CNT_CNT_SHIFT); + write_sde_csr(sde, SD(DESC_CNT), reg); +} + +static inline void complete_tx(struct sdma_engine *sde, + struct sdma_txreq *tx, + int res) +{ + /* protect against complete modifying */ + struct iowait *wait = tx->wait; + callback_t complete = tx->complete; + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, tx->sn); + if (WARN_ON_ONCE(sde->head_sn != tx->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, tx->sn); + sde->head_sn++; +#endif + __sdma_txclean(sde->dd, tx); + if (complete) + (*complete)(tx, res); + if (iowait_sdma_dec(wait)) + iowait_drain_wakeup(wait); +} + +/* + * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status + * + * Depending on timing there can be txreqs in two places: + * - in the descq ring + * - in the flush list + * + * To avoid ordering issues the descq ring needs to be flushed + * first followed by the flush list. + * + * This routine is called from two places + * - From a work queue item + * - Directly from the state machine just before setting the + * state to running + * + * Must be called with head_lock held + * + */ +static void sdma_flush(struct sdma_engine *sde) +{ + struct sdma_txreq *txp, *txp_next; + LIST_HEAD(flushlist); + unsigned long flags; + uint seq; + + /* flush from head to tail */ + sdma_flush_descq(sde); + spin_lock_irqsave(&sde->flushlist_lock, flags); + /* copy flush list */ + list_splice_init(&sde->flushlist, &flushlist); + spin_unlock_irqrestore(&sde->flushlist_lock, flags); + /* flush from flush list */ + list_for_each_entry_safe(txp, txp_next, &flushlist, list) + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); + /* wakeup QPs orphaned on the dmawait list */ + do { + struct iowait *w, *nw; + + seq = read_seqbegin(&sde->waitlock); + if (!list_empty(&sde->dmawait)) { + write_seqlock(&sde->waitlock); + list_for_each_entry_safe(w, nw, &sde->dmawait, list) { + if (w->wakeup) { + w->wakeup(w, SDMA_AVAIL_REASON); + list_del_init(&w->list); + } + } + write_sequnlock(&sde->waitlock); + } + } while (read_seqretry(&sde->waitlock, seq)); +} + +/* + * Fields a work request for flushing the descq ring + * and the flush list + * + * If the engine has been brought to running during + * the scheduling delay, the flush is ignored, assuming + * that the process of bringing the engine to running + * would have done this flush prior to going to running. + * + */ +static void sdma_field_flush(struct work_struct *work) +{ + unsigned long flags; + struct sdma_engine *sde = + container_of(work, struct sdma_engine, flush_worker); + + write_seqlock_irqsave(&sde->head_lock, flags); + if (!__sdma_running(sde)) + sdma_flush(sde); + write_sequnlock_irqrestore(&sde->head_lock, flags); +} + +static void sdma_err_halt_wait(struct work_struct *work) +{ + struct sdma_engine *sde = container_of(work, struct sdma_engine, + err_halt_worker); + u64 statuscsr; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT); + while (1) { + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_HALTED_SMASK); + if (statuscsr) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(sde->dd, + "SDMA engine %d - timeout waiting for engine to halt\n", + sde->this_idx); + /* + * Continue anyway. This could happen if there was + * an uncorrectable error in the wrong spot. + */ + break; + } + usleep_range(80, 120); + } + + sdma_process_event(sde, sdma_event_e15_hw_halt_done); +} + +static void sdma_err_progress_check_schedule(struct sdma_engine *sde) +{ + if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) { + unsigned index; + struct hfi1_devdata *dd = sde->dd; + + for (index = 0; index < dd->num_sdma; index++) { + struct sdma_engine *curr_sdma = &dd->per_sdma[index]; + + if (curr_sdma != sde) + curr_sdma->progress_check_head = + curr_sdma->descq_head; + } + dd_dev_err(sde->dd, + "SDMA engine %d - check scheduled\n", + sde->this_idx); + mod_timer(&sde->err_progress_check_timer, jiffies + 10); + } +} + +static void sdma_err_progress_check(struct timer_list *t) +{ + unsigned index; + struct sdma_engine *sde = from_timer(sde, t, err_progress_check_timer); + + dd_dev_err(sde->dd, "SDE progress check event\n"); + for (index = 0; index < sde->dd->num_sdma; index++) { + struct sdma_engine *curr_sde = &sde->dd->per_sdma[index]; + unsigned long flags; + + /* check progress on each engine except the current one */ + if (curr_sde == sde) + continue; + /* + * We must lock interrupts when acquiring sde->lock, + * to avoid a deadlock if interrupt triggers and spins on + * the same lock on same CPU + */ + spin_lock_irqsave(&curr_sde->tail_lock, flags); + write_seqlock(&curr_sde->head_lock); + + /* skip non-running queues */ + if (curr_sde->state.current_state != sdma_state_s99_running) { + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + continue; + } + + if ((curr_sde->descq_head != curr_sde->descq_tail) && + (curr_sde->descq_head == + curr_sde->progress_check_head)) + __sdma_process_event(curr_sde, + sdma_event_e90_sw_halted); + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + } + schedule_work(&sde->err_halt_worker); +} + +static void sdma_hw_clean_up_task(struct tasklet_struct *t) +{ + struct sdma_engine *sde = from_tasklet(sde, t, + sdma_hw_clean_up_task); + u64 statuscsr; + + while (1) { +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, + __func__); +#endif + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK); + if (statuscsr) + break; + udelay(10); + } + + sdma_process_event(sde, sdma_event_e25_hw_clean_up_done); +} + +static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde) +{ + return sde->tx_ring[sde->tx_head & sde->sdma_mask]; +} + +/* + * flush ring for recovery + */ +static void sdma_flush_descq(struct sdma_engine *sde) +{ + u16 head, tail; + int progress = 0; + struct sdma_txreq *txp = get_txhead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + while (head != tail) { + /* advance head, wrap if needed */ + head = ++sde->descq_head & sde->sdma_mask; + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == head) { + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); + trace_hfi1_sdma_progress(sde, head, tail, txp); + txp = get_txhead(sde); + } + progress++; + } + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +static void sdma_sw_clean_up_task(struct tasklet_struct *t) +{ + struct sdma_engine *sde = from_tasklet(sde, t, sdma_sw_clean_up_task); + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + /* + * In the error clean up sequence, software clean must be called + * before the hardware clean so we can use the hardware head in + * the progress routine. A hardware clean or SPC unfreeze will + * reset the hardware head. + * + * Process all retired requests. The progress routine will use the + * latest physical hardware head - we are not running so speed does + * not matter. + */ + sdma_make_progress(sde, 0); + + sdma_flush(sde); + + /* + * Reset our notion of head and tail. + * Note that the HW registers have been reset via an earlier + * clean up. + */ + sde->descq_tail = 0; + sde->descq_head = 0; + sde->desc_avail = sdma_descq_freecnt(sde); + *sde->head_dma = 0; + + __sdma_process_event(sde, sdma_event_e40_sw_cleaned); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sw_tear_down(struct sdma_engine *sde) +{ + struct sdma_state *ss = &sde->state; + + /* Releasing this reference means the state machine has stopped. */ + sdma_put(ss); + + /* stop waiting for all unfreeze events to complete */ + atomic_set(&sde->dd->sdma_unfreeze_count, -1); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); +} + +static void sdma_start_hw_clean_up(struct sdma_engine *sde) +{ + tasklet_hi_schedule(&sde->sdma_hw_clean_up_task); +} + +static void sdma_set_state(struct sdma_engine *sde, + enum sdma_states next_state) +{ + struct sdma_state *ss = &sde->state; + const struct sdma_set_state_action *action = sdma_action_table; + unsigned op = 0; + + trace_hfi1_sdma_state( + sde, + sdma_state_names[ss->current_state], + sdma_state_names[next_state]); + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + ss->current_state = next_state; + + if (ss->previous_state != sdma_state_s99_running && + next_state == sdma_state_s99_running) + sdma_flush(sde); + + if (action[next_state].op_enable) + op |= SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_cleanup) + op |= SDMA_SENDCTRL_OP_CLEANUP; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + sdma_sendctrl(sde, ss->current_op); +} + +/** + * sdma_get_descq_cnt() - called when device probed + * + * Return a validated descq count. + * + * This is currently only used in the verbs initialization to build the tx + * list. + * + * This will probably be deleted in favor of a more scalable approach to + * alloc tx's. + * + */ +u16 sdma_get_descq_cnt(void) +{ + u16 count = sdma_descq_cnt; + + if (!count) + return SDMA_DESCQ_CNT; + /* count must be a power of 2 greater than 64 and less than + * 32768. Otherwise return default. + */ + if (!is_power_of_2(count)) + return SDMA_DESCQ_CNT; + if (count < 64 || count > 32768) + return SDMA_DESCQ_CNT; + return count; +} + +/** + * sdma_engine_get_vl() - return vl for a given sdma engine + * @sde: sdma engine + * + * This function returns the vl mapped to a given engine, or an error if + * the mapping can't be found. The mapping fields are protected by RCU. + */ +int sdma_engine_get_vl(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + struct sdma_vl_map *m; + u8 vl; + + if (sde->this_idx >= TXE_NUM_SDMA_ENGINES) + return -EINVAL; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return -EINVAL; + } + vl = m->engine_to_vl[sde->this_idx]; + rcu_read_unlock(); + + return vl; +} + +/** + * sdma_select_engine_vl() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * + * This function returns an engine based on the selector and a vl. The + * mapping fields are protected by RCU. + */ +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl) +{ + struct sdma_vl_map *m; + struct sdma_map_elem *e; + struct sdma_engine *rval; + + /* NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return engine 0 below + */ + if (vl >= num_vls) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return &dd->per_sdma[0]; + } + e = m->map[vl & m->mask]; + rval = e->sde[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? &dd->per_sdma[0] : rval; + trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx); + return rval; +} + +/** + * sdma_select_engine_sc() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * + * This function returns an engine based on the selector and an sc. + */ +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return sdma_select_engine_vl(dd, selector, vl); +} + +struct sdma_rht_map_elem { + u32 mask; + u8 ctr; + struct sdma_engine *sde[]; +}; + +struct sdma_rht_node { + unsigned long cpu_id; + struct sdma_rht_map_elem *map[HFI1_MAX_VLS_SUPPORTED]; + struct rhash_head node; +}; + +#define NR_CPUS_HINT 192 + +static const struct rhashtable_params sdma_rht_params = { + .nelem_hint = NR_CPUS_HINT, + .head_offset = offsetof(struct sdma_rht_node, node), + .key_offset = offsetof(struct sdma_rht_node, cpu_id), + .key_len = sizeof_field(struct sdma_rht_node, cpu_id), + .max_size = NR_CPUS, + .min_size = 8, + .automatic_shrinking = true, +}; + +/* + * sdma_select_user_engine() - select sdma engine based on user setup + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * This function returns an sdma engine for a user sdma request. + * User defined sdma engine affinity setting is honored when applicable, + * otherwise system default sdma engine mapping is used. To ensure correct + * ordering, the mapping from <selector, vl> to sde must remain unchanged. + */ +struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, + u32 selector, u8 vl) +{ + struct sdma_rht_node *rht_node; + struct sdma_engine *sde = NULL; + unsigned long cpu_id; + + /* + * To ensure that always the same sdma engine(s) will be + * selected make sure the process is pinned to this CPU only. + */ + if (current->nr_cpus_allowed != 1) + goto out; + + rcu_read_lock(); + cpu_id = smp_processor_id(); + rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id, + sdma_rht_params); + + if (rht_node && rht_node->map[vl]) { + struct sdma_rht_map_elem *map = rht_node->map[vl]; + + sde = map->sde[selector & map->mask]; + } + rcu_read_unlock(); + + if (sde) + return sde; + +out: + return sdma_select_engine_vl(dd, selector, vl); +} + +static void sdma_populate_sde_map(struct sdma_rht_map_elem *map) +{ + int i; + + for (i = 0; i < roundup_pow_of_two(map->ctr ? : 1) - map->ctr; i++) + map->sde[map->ctr + i] = map->sde[i]; +} + +static void sdma_cleanup_sde_map(struct sdma_rht_map_elem *map, + struct sdma_engine *sde) +{ + unsigned int i, pow; + + /* only need to check the first ctr entries for a match */ + for (i = 0; i < map->ctr; i++) { + if (map->sde[i] == sde) { + memmove(&map->sde[i], &map->sde[i + 1], + (map->ctr - i - 1) * sizeof(map->sde[0])); + map->ctr--; + pow = roundup_pow_of_two(map->ctr ? : 1); + map->mask = pow - 1; + sdma_populate_sde_map(map); + break; + } + } +} + +/* + * Prevents concurrent reads and writes of the sdma engine cpu_mask + */ +static DEFINE_MUTEX(process_to_sde_mutex); + +ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, + size_t count) +{ + struct hfi1_devdata *dd = sde->dd; + cpumask_var_t mask, new_mask; + unsigned long cpu; + int ret, vl, sz; + struct sdma_rht_node *rht_node; + + vl = sdma_engine_get_vl(sde); + if (unlikely(vl < 0 || vl >= ARRAY_SIZE(rht_node->map))) + return -EINVAL; + + ret = zalloc_cpumask_var(&mask, GFP_KERNEL); + if (!ret) + return -ENOMEM; + + ret = zalloc_cpumask_var(&new_mask, GFP_KERNEL); + if (!ret) { + free_cpumask_var(mask); + return -ENOMEM; + } + ret = cpulist_parse(buf, mask); + if (ret) + goto out_free; + + if (!cpumask_subset(mask, cpu_online_mask)) { + dd_dev_warn(sde->dd, "Invalid CPU mask\n"); + ret = -EINVAL; + goto out_free; + } + + sz = sizeof(struct sdma_rht_map_elem) + + (TXE_NUM_SDMA_ENGINES * sizeof(struct sdma_engine *)); + + mutex_lock(&process_to_sde_mutex); + + for_each_cpu(cpu, mask) { + /* Check if we have this already mapped */ + if (cpumask_test_cpu(cpu, &sde->cpu_mask)) { + cpumask_set_cpu(cpu, new_mask); + continue; + } + + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu, + sdma_rht_params); + if (!rht_node) { + rht_node = kzalloc(sizeof(*rht_node), GFP_KERNEL); + if (!rht_node) { + ret = -ENOMEM; + goto out; + } + + rht_node->map[vl] = kzalloc(sz, GFP_KERNEL); + if (!rht_node->map[vl]) { + kfree(rht_node); + ret = -ENOMEM; + goto out; + } + rht_node->cpu_id = cpu; + rht_node->map[vl]->mask = 0; + rht_node->map[vl]->ctr = 1; + rht_node->map[vl]->sde[0] = sde; + + ret = rhashtable_insert_fast(dd->sdma_rht, + &rht_node->node, + sdma_rht_params); + if (ret) { + kfree(rht_node->map[vl]); + kfree(rht_node); + dd_dev_err(sde->dd, "Failed to set process to sde affinity for cpu %lu\n", + cpu); + goto out; + } + + } else { + int ctr, pow; + + /* Add new user mappings */ + if (!rht_node->map[vl]) + rht_node->map[vl] = kzalloc(sz, GFP_KERNEL); + + if (!rht_node->map[vl]) { + ret = -ENOMEM; + goto out; + } + + rht_node->map[vl]->ctr++; + ctr = rht_node->map[vl]->ctr; + rht_node->map[vl]->sde[ctr - 1] = sde; + pow = roundup_pow_of_two(ctr); + rht_node->map[vl]->mask = pow - 1; + + /* Populate the sde map table */ + sdma_populate_sde_map(rht_node->map[vl]); + } + cpumask_set_cpu(cpu, new_mask); + } + + /* Clean up old mappings */ + for_each_cpu(cpu, cpu_online_mask) { + struct sdma_rht_node *rht_node; + + /* Don't cleanup sdes that are set in the new mask */ + if (cpumask_test_cpu(cpu, mask)) + continue; + + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu, + sdma_rht_params); + if (rht_node) { + bool empty = true; + int i; + + /* Remove mappings for old sde */ + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) + if (rht_node->map[i]) + sdma_cleanup_sde_map(rht_node->map[i], + sde); + + /* Free empty hash table entries */ + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) { + if (!rht_node->map[i]) + continue; + + if (rht_node->map[i]->ctr) { + empty = false; + break; + } + } + + if (empty) { + ret = rhashtable_remove_fast(dd->sdma_rht, + &rht_node->node, + sdma_rht_params); + WARN_ON(ret); + + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) + kfree(rht_node->map[i]); + + kfree(rht_node); + } + } + } + + cpumask_copy(&sde->cpu_mask, new_mask); +out: + mutex_unlock(&process_to_sde_mutex); +out_free: + free_cpumask_var(mask); + free_cpumask_var(new_mask); + return ret ? : strnlen(buf, PAGE_SIZE); +} + +ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf) +{ + mutex_lock(&process_to_sde_mutex); + if (cpumask_empty(&sde->cpu_mask)) + snprintf(buf, PAGE_SIZE, "%s\n", "empty"); + else + cpumap_print_to_pagebuf(true, buf, &sde->cpu_mask); + mutex_unlock(&process_to_sde_mutex); + return strnlen(buf, PAGE_SIZE); +} + +static void sdma_rht_free(void *ptr, void *arg) +{ + struct sdma_rht_node *rht_node = ptr; + int i; + + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) + kfree(rht_node->map[i]); + + kfree(rht_node); +} + +/** + * sdma_seqfile_dump_cpu_list() - debugfs dump the cpu to sdma mappings + * @s: seq file + * @dd: hfi1_devdata + * @cpuid: cpu id + * + * This routine dumps the process to sde mappings per cpu + */ +void sdma_seqfile_dump_cpu_list(struct seq_file *s, + struct hfi1_devdata *dd, + unsigned long cpuid) +{ + struct sdma_rht_node *rht_node; + int i, j; + + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpuid, + sdma_rht_params); + if (!rht_node) + return; + + seq_printf(s, "cpu%3lu: ", cpuid); + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) { + if (!rht_node->map[i] || !rht_node->map[i]->ctr) + continue; + + seq_printf(s, " vl%d: [", i); + + for (j = 0; j < rht_node->map[i]->ctr; j++) { + if (!rht_node->map[i]->sde[j]) + continue; + + if (j > 0) + seq_puts(s, ","); + + seq_printf(s, " sdma%2d", + rht_node->map[i]->sde[j]->this_idx); + } + seq_puts(s, " ]"); + } + + seq_puts(s, "\n"); +} + +/* + * Free the indicated map struct + */ +static void sdma_map_free(struct sdma_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void sdma_map_rcu_callback(struct rcu_head *list) +{ + struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list); + + sdma_map_free(m); +} + +/** + * sdma_map_init - called when # vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_engines: per vl engine mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_engines is used to specify a non-uniform vl/engine loading. NULL + * implies auto computing the loading and giving each VLs a uniform + * distribution of engines per VL. + * + * The auto algorithm computes the sde_per_vl and the number of extra + * engines. Any extra engines are added from the last VL on down. + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_sdma are non-power of 2, the array sizes + * in the struct sdma_vl_map and the struct sdma_map_elem are rounded + * up to the next highest power of 2 and the first entry is reused + * in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is + * not changed. + * + */ +int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) +{ + int i, j; + int extra, sde_per_vl; + int engine = 0; + u8 lvl_engines[OPA_MAX_VLS]; + struct sdma_vl_map *oldmap, *newmap; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return 0; + + if (!vl_engines) { + /* truncate divide */ + sde_per_vl = dd->num_sdma / num_vls; + /* extras */ + extra = dd->num_sdma % num_vls; + vl_engines = lvl_engines; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0); + } + /* build new map */ + newmap = kzalloc( + sizeof(struct sdma_vl_map) + + roundup_pow_of_two(num_vls) * + sizeof(struct sdma_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + /* initialize back-map */ + for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++) + newmap->engine_to_vl[i] = -1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_engine = engine; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_engines[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc( + sizeof(struct sdma_map_elem) + + sz * sizeof(struct sdma_engine *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* assign engines */ + for (j = 0; j < sz; j++) { + newmap->map[i]->sde[j] = + &dd->per_sdma[engine]; + if (++engine >= first_engine + vl_engines[i]) + /* wrap back to first engine */ + engine = first_engine; + } + /* assign back-map */ + for (j = 0; j < vl_engines[i]; j++) + newmap->engine_to_vl[first_engine + j] = i; + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + engine = first_engine + vl_engines[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->sde_map_lock); + oldmap = rcu_dereference_protected(dd->sdma_map, + lockdep_is_held(&dd->sde_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->sdma_map, newmap); + + spin_unlock_irq(&dd->sde_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, sdma_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + sdma_map_free(newmap); + return -ENOMEM; +} + +/** + * sdma_clean - Clean up allocated memory + * @dd: struct hfi1_devdata + * @num_engines: num sdma engines + * + * This routine can be called regardless of the success of + * sdma_init() + */ +void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) +{ + size_t i; + struct sdma_engine *sde; + + if (dd->sdma_pad_dma) { + dma_free_coherent(&dd->pcidev->dev, SDMA_PAD, + (void *)dd->sdma_pad_dma, + dd->sdma_pad_phys); + dd->sdma_pad_dma = NULL; + dd->sdma_pad_phys = 0; + } + if (dd->sdma_heads_dma) { + dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size, + (void *)dd->sdma_heads_dma, + dd->sdma_heads_phys); + dd->sdma_heads_dma = NULL; + dd->sdma_heads_phys = 0; + } + for (i = 0; dd->per_sdma && i < num_engines; ++i) { + sde = &dd->per_sdma[i]; + + sde->head_dma = NULL; + sde->head_phys = 0; + + if (sde->descq) { + dma_free_coherent( + &dd->pcidev->dev, + sde->descq_cnt * sizeof(u64[2]), + sde->descq, + sde->descq_phys + ); + sde->descq = NULL; + sde->descq_phys = 0; + } + kvfree(sde->tx_ring); + sde->tx_ring = NULL; + } + if (rcu_access_pointer(dd->sdma_map)) { + spin_lock_irq(&dd->sde_map_lock); + sdma_map_free(rcu_access_pointer(dd->sdma_map)); + RCU_INIT_POINTER(dd->sdma_map, NULL); + spin_unlock_irq(&dd->sde_map_lock); + synchronize_rcu(); + } + kfree(dd->per_sdma); + dd->per_sdma = NULL; + + if (dd->sdma_rht) { + rhashtable_free_and_destroy(dd->sdma_rht, sdma_rht_free, NULL); + kfree(dd->sdma_rht); + dd->sdma_rht = NULL; + } +} + +/** + * sdma_init() - called when device probed + * @dd: hfi1_devdata + * @port: port number (currently only zero) + * + * Initializes each sde and its csrs. + * Interrupts are not required to be enabled. + * + * Returns: + * 0 - success, -errno on failure + */ +int sdma_init(struct hfi1_devdata *dd, u8 port) +{ + unsigned this_idx; + struct sdma_engine *sde; + struct rhashtable *tmp_sdma_rht; + u16 descq_cnt; + void *curr_head; + struct hfi1_pportdata *ppd = dd->pport + port; + u32 per_sdma_credits; + uint idle_cnt = sdma_idle_cnt; + size_t num_engines = chip_sdma_engines(dd); + int ret = -ENOMEM; + + if (!HFI1_CAP_IS_KSET(SDMA)) { + HFI1_CAP_CLEAR(SDMA_AHG); + return 0; + } + if (mod_num_sdma && + /* can't exceed chip support */ + mod_num_sdma <= chip_sdma_engines(dd) && + /* count must be >= vls */ + mod_num_sdma >= num_vls) + num_engines = mod_num_sdma; + + dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma); + dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", chip_sdma_engines(dd)); + dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n", + chip_sdma_mem_size(dd)); + + per_sdma_credits = + chip_sdma_mem_size(dd) / (num_engines * SDMA_BLOCK_SIZE); + + /* set up freeze waitqueue */ + init_waitqueue_head(&dd->sdma_unfreeze_wq); + atomic_set(&dd->sdma_unfreeze_count, 0); + + descq_cnt = sdma_get_descq_cnt(); + dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n", + num_engines, descq_cnt); + + /* alloc memory for array of send engines */ + dd->per_sdma = kcalloc_node(num_engines, sizeof(*dd->per_sdma), + GFP_KERNEL, dd->node); + if (!dd->per_sdma) + return ret; + + idle_cnt = ns_to_cclock(dd, idle_cnt); + if (idle_cnt) + dd->default_desc1 = + SDMA_DESC1_HEAD_TO_HOST_FLAG; + else + dd->default_desc1 = + SDMA_DESC1_INT_REQ_FLAG; + + if (!sdma_desct_intr) + sdma_desct_intr = SDMA_DESC_INTR; + + /* Allocate memory for SendDMA descriptor FIFOs */ + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + sde = &dd->per_sdma[this_idx]; + sde->dd = dd; + sde->ppd = ppd; + sde->this_idx = this_idx; + sde->descq_cnt = descq_cnt; + sde->desc_avail = sdma_descq_freecnt(sde); + sde->sdma_shift = ilog2(descq_cnt); + sde->sdma_mask = (1 << sde->sdma_shift) - 1; + + /* Create a mask specifically for each interrupt source */ + sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES + + this_idx); + /* Create a combined mask to cover all 3 interrupt sources */ + sde->imask = sde->int_mask | sde->progress_mask | + sde->idle_mask; + + spin_lock_init(&sde->tail_lock); + seqlock_init(&sde->head_lock); + spin_lock_init(&sde->senddmactrl_lock); + spin_lock_init(&sde->flushlist_lock); + seqlock_init(&sde->waitlock); + /* insure there is always a zero bit */ + sde->ahg_bits = 0xfffffffe00000000ULL; + + sdma_set_state(sde, sdma_state_s00_hw_down); + + /* set up reference counting */ + kref_init(&sde->state.kref); + init_completion(&sde->state.comp); + + INIT_LIST_HEAD(&sde->flushlist); + INIT_LIST_HEAD(&sde->dmawait); + + sde->tail_csr = + get_kctxt_csr_addr(dd, this_idx, SD(TAIL)); + + tasklet_setup(&sde->sdma_hw_clean_up_task, + sdma_hw_clean_up_task); + tasklet_setup(&sde->sdma_sw_clean_up_task, + sdma_sw_clean_up_task); + INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait); + INIT_WORK(&sde->flush_worker, sdma_field_flush); + + sde->progress_check_head = 0; + + timer_setup(&sde->err_progress_check_timer, + sdma_err_progress_check, 0); + + sde->descq = dma_alloc_coherent(&dd->pcidev->dev, + descq_cnt * sizeof(u64[2]), + &sde->descq_phys, GFP_KERNEL); + if (!sde->descq) + goto bail; + sde->tx_ring = + kvzalloc_node(array_size(descq_cnt, + sizeof(struct sdma_txreq *)), + GFP_KERNEL, dd->node); + if (!sde->tx_ring) + goto bail; + } + + dd->sdma_heads_size = L1_CACHE_BYTES * num_engines; + /* Allocate memory for DMA of head registers to memory */ + dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev, + dd->sdma_heads_size, + &dd->sdma_heads_phys, + GFP_KERNEL); + if (!dd->sdma_heads_dma) { + dd_dev_err(dd, "failed to allocate SendDMA head memory\n"); + goto bail; + } + + /* Allocate memory for pad */ + dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, SDMA_PAD, + &dd->sdma_pad_phys, GFP_KERNEL); + if (!dd->sdma_pad_dma) { + dd_dev_err(dd, "failed to allocate SendDMA pad memory\n"); + goto bail; + } + + /* assign each engine to different cacheline and init registers */ + curr_head = (void *)dd->sdma_heads_dma; + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + unsigned long phys_offset; + + sde = &dd->per_sdma[this_idx]; + + sde->head_dma = curr_head; + curr_head += L1_CACHE_BYTES; + phys_offset = (unsigned long)sde->head_dma - + (unsigned long)dd->sdma_heads_dma; + sde->head_phys = dd->sdma_heads_phys + phys_offset; + init_sdma_regs(sde, per_sdma_credits, idle_cnt); + } + dd->flags |= HFI1_HAS_SEND_DMA; + dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0; + dd->num_sdma = num_engines; + ret = sdma_map_init(dd, port, ppd->vls_operational, NULL); + if (ret < 0) + goto bail; + + tmp_sdma_rht = kzalloc(sizeof(*tmp_sdma_rht), GFP_KERNEL); + if (!tmp_sdma_rht) { + ret = -ENOMEM; + goto bail; + } + + ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params); + if (ret < 0) { + kfree(tmp_sdma_rht); + goto bail; + } + + dd->sdma_rht = tmp_sdma_rht; + + dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma); + return 0; + +bail: + sdma_clean(dd, num_engines); + return ret; +} + +/** + * sdma_all_running() - called when the link goes up + * @dd: hfi1_devdata + * + * This routine moves all engines to the running state. + */ +void sdma_all_running(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* move all engines to running */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e30_go_running); + } +} + +/** + * sdma_all_idle() - called when the link goes down + * @dd: hfi1_devdata + * + * This routine moves all engines to the idle state. + */ +void sdma_all_idle(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* idle all engines */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e70_go_idle); + } +} + +/** + * sdma_start() - called to kick off state processing for all engines + * @dd: hfi1_devdata + * + * This routine is for kicking off the state processing for all required + * sdma engines. Interrupts need to be working at this point. + * + */ +void sdma_start(struct hfi1_devdata *dd) +{ + unsigned i; + struct sdma_engine *sde; + + /* kick off the engines state processing */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e10_go_hw_start); + } +} + +/** + * sdma_exit() - used when module is removed + * @dd: hfi1_devdata + */ +void sdma_exit(struct hfi1_devdata *dd) +{ + unsigned this_idx; + struct sdma_engine *sde; + + for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma; + ++this_idx) { + sde = &dd->per_sdma[this_idx]; + if (!list_empty(&sde->dmawait)) + dd_dev_err(dd, "sde %u: dmawait list not empty!\n", + sde->this_idx); + sdma_process_event(sde, sdma_event_e00_go_hw_down); + + del_timer_sync(&sde->err_progress_check_timer); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. + */ + sdma_finalput(&sde->state); + } +} + +/* + * unmap the indicated descriptor + */ +static inline void sdma_unmap_desc( + struct hfi1_devdata *dd, + struct sdma_desc *descp) +{ + switch (sdma_mapping_type(descp)) { + case SDMA_MAP_SINGLE: + dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp), + sdma_mapping_len(descp), DMA_TO_DEVICE); + break; + case SDMA_MAP_PAGE: + dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp), + sdma_mapping_len(descp), DMA_TO_DEVICE); + break; + } + + if (descp->pinning_ctx && descp->ctx_put) + descp->ctx_put(descp->pinning_ctx); + descp->pinning_ctx = NULL; +} + +/* + * return the mode as indicated by the first + * descriptor in the tx. + */ +static inline u8 ahg_mode(struct sdma_txreq *tx) +{ + return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK) + >> SDMA_DESC1_HEADER_MODE_SHIFT; +} + +/** + * __sdma_txclean() - clean tx of mappings, descp *kmalloc's + * @dd: hfi1_devdata for unmapping + * @tx: tx request to clean + * + * This is used in the progress routine to clean the tx or + * by the ULP to toss an in-process tx build. + * + * The code can be called multiple times without issue. + * + */ +void __sdma_txclean( + struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + u16 i; + + if (tx->num_desc) { + u8 skip = 0, mode = ahg_mode(tx); + + /* unmap first */ + sdma_unmap_desc(dd, &tx->descp[0]); + /* determine number of AHG descriptors to skip */ + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1 + skip; i < tx->num_desc; i++) + sdma_unmap_desc(dd, &tx->descp[i]); + tx->num_desc = 0; + } + kfree(tx->coalesce_buf); + tx->coalesce_buf = NULL; + /* kmalloc'ed descp */ + if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) { + tx->desc_limit = ARRAY_SIZE(tx->descs); + kfree(tx->descp); + } +} + +static inline u16 sdma_gethead(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + int use_dmahead; + u16 hwhead; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + +retry: + use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) && + (dd->flags & HFI1_HAS_SDMA_TIMEOUT); + hwhead = use_dmahead ? + (u16)le64_to_cpu(*sde->head_dma) : + (u16)read_sde_csr(sde, SD(HEAD)); + + if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) { + u16 cnt; + u16 swtail; + u16 swhead; + int sane; + + swhead = sde->descq_head & sde->sdma_mask; + /* this code is really bad for cache line trading */ + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; + cnt = sde->descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%u swhd=%u swtl=%u cnt=%u\n", + sde->this_idx, + use_dmahead ? "dma" : "kreg", + hwhead, swhead, swtail, cnt); + if (use_dmahead) { + /* try one more time, using csr */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + } + return hwhead; +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with head_lock held. + */ +static void sdma_desc_avail(struct sdma_engine *sde, uint avail) +{ + struct iowait *wait, *nw, *twait; + struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; + uint i, n = 0, seq, tidx = 0; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "avail: %u\n", avail); +#endif + + do { + seq = read_seqbegin(&sde->waitlock); + if (!list_empty(&sde->dmawait)) { + /* at least one item */ + write_seqlock(&sde->waitlock); + /* Harvest waiters wanting DMA descriptors */ + list_for_each_entry_safe( + wait, + nw, + &sde->dmawait, + list) { + u32 num_desc; + + if (!wait->wakeup) + continue; + if (n == ARRAY_SIZE(waits)) + break; + iowait_init_priority(wait); + num_desc = iowait_get_all_desc(wait); + if (num_desc > avail) + break; + avail -= num_desc; + /* Find the top-priority wait memeber */ + if (n) { + twait = waits[tidx]; + tidx = + iowait_priority_update_top(wait, + twait, + n, + tidx); + } + list_del_init(&wait->list); + waits[n++] = wait; + } + write_sequnlock(&sde->waitlock); + break; + } + } while (read_seqretry(&sde->waitlock, seq)); + + /* Schedule the top-priority entry first */ + if (n) + waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON); + + for (i = 0; i < n; i++) + if (i != tidx) + waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); +} + +/* head_lock must be held */ +static void sdma_make_progress(struct sdma_engine *sde, u64 status) +{ + struct sdma_txreq *txp = NULL; + int progress = 0; + u16 hwhead, swhead; + int idle_check_done = 0; + + hwhead = sdma_gethead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + +retry: + txp = get_txhead(sde); + swhead = sde->descq_head & sde->sdma_mask; + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + while (swhead != hwhead) { + /* advance head, wrap if needed */ + swhead = ++sde->descq_head & sde->sdma_mask; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == swhead) { + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + complete_tx(sde, txp, SDMA_TXREQ_S_OK); + /* see if there is another txp */ + txp = get_txhead(sde); + } + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + progress++; + } + + /* + * The SDMA idle interrupt is not guaranteed to be ordered with respect + * to updates to the dma_head location in host memory. The head + * value read might not be fully up to date. If there are pending + * descriptors and the SDMA idle interrupt fired then read from the + * CSR SDMA head instead to get the latest value from the hardware. + * The hardware SDMA head should be read at most once in this invocation + * of sdma_make_progress(..) which is ensured by idle_check_done flag + */ + if ((status & sde->idle_mask) && !idle_check_done) { + u16 swtail; + + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; + if (swtail != hwhead) { + hwhead = (u16)read_sde_csr(sde, SD(HEAD)); + idle_check_done = 1; + goto retry; + } + } + + sde->last_status = status; + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +/* + * sdma_engine_interrupt() - interrupt handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + * + * Status is a mask of the 3 possible interrupts for this engine. It will + * contain bits _only_ for this SDMA engine. It will contain at least one + * bit, it may contain more. + */ +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) +{ + trace_hfi1_sdma_engine_interrupt(sde, status); + write_seqlock(&sde->head_lock); + sdma_set_desc_cnt(sde, sdma_desct_intr); + if (status & sde->idle_mask) + sde->idle_int_cnt++; + else if (status & sde->progress_mask) + sde->progress_int_cnt++; + else if (status & sde->int_mask) + sde->sdma_int_cnt++; + sdma_make_progress(sde, status); + write_sequnlock(&sde->head_lock); +} + +/** + * sdma_engine_error() - error handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + */ +void sdma_engine_error(struct sdma_engine *sde, u64 status) +{ + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); +#endif + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + if (status & ALL_SDMA_ENG_HALT_ERRS) + __sdma_process_event(sde, sdma_event_e60_hw_halted); + if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) { + dd_dev_err(sde->dd, + "SDMA (%u) engine error: 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); + dump_sdma_state(sde); + } + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sendctrl(struct sdma_engine *sde, unsigned op) +{ + u64 set_senddmactrl = 0; + u64 clr_senddmactrl = 0; + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n", + sde->this_idx, + (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0); +#endif + + if (op & SDMA_SENDCTRL_OP_ENABLE) + set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_INTENABLE) + set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_HALT) + set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + + spin_lock_irqsave(&sde->senddmactrl_lock, flags); + + sde->p_senddmactrl |= set_senddmactrl; + sde->p_senddmactrl &= ~clr_senddmactrl; + + if (op & SDMA_SENDCTRL_OP_CLEANUP) + write_sde_csr(sde, SD(CTRL), + sde->p_senddmactrl | + SD(CTRL_SDMA_CLEANUP_SMASK)); + else + write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl); + + spin_unlock_irqrestore(&sde->senddmactrl_lock, flags); + +#ifdef CONFIG_SDMA_VERBOSITY + sdma_dumpstate(sde); +#endif +} + +static void sdma_setlengen(struct sdma_engine *sde) +{ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + /* + * Set SendDmaLenGen and clear-then-set the MSB of the generation + * count to enable generation checking and load the internal + * generation counter. + */ + write_sde_csr(sde, SD(LEN_GEN), + (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)); + write_sde_csr(sde, SD(LEN_GEN), + ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) | + (4ULL << SD(LEN_GEN_GENERATION_SHIFT))); +} + +static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + smp_wmb(); /* see get_txhead() */ + writeq(tail, sde->tail_csr); +} + +/* + * This is called when changing to state s10_hw_start_up_halt_wait as + * a result of send buffer errors or send DMA descriptor errors. + */ +static void sdma_hw_start_up(struct sdma_engine *sde) +{ + u64 reg; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + *sde->head_dma = 0; + + reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) << + SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT); + write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg); +} + +/* + * set_sdma_integrity + * + * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'. + */ +static void set_sdma_integrity(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + + write_sde_csr(sde, SD(CHECK_ENABLE), + hfi1_pkt_base_sdma_integrity(dd)); +} + +static void init_sdma_regs( + struct sdma_engine *sde, + u32 credits, + uint idle_cnt) +{ + u8 opval, opmask; +#ifdef CONFIG_SDMA_VERBOSITY + struct hfi1_devdata *dd = sde->dd; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys); + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt); + write_sde_csr(sde, SD(DESC_CNT), 0); + write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys); + write_sde_csr(sde, SD(MEMORY), + ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) | + ((u64)(credits * sde->this_idx) << + SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT))); + write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull); + set_sdma_integrity(sde); + opmask = OPCODE_CHECK_MASK_DISABLED; + opval = OPCODE_CHECK_VAL_DISABLED; + write_sde_csr(sde, SD(CHECK_OPCODE), + (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) | + (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT)); +} + +#ifdef CONFIG_SDMA_VERBOSITY + +#define sdma_dumpstate_helper0(reg) do { \ + csr = read_csr(sde->dd, reg); \ + dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \ + } while (0) + +#define sdma_dumpstate_helper(reg) do { \ + csr = read_sde_csr(sde, reg); \ + dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \ + #reg, sde->this_idx, csr); \ + } while (0) + +#define sdma_dumpstate_helper2(reg) do { \ + csr = read_csr(sde->dd, reg + (8 * i)); \ + dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \ + #reg, i, csr); \ + } while (0) + +void sdma_dumpstate(struct sdma_engine *sde) +{ + u64 csr; + unsigned i; + + sdma_dumpstate_helper(SD(CTRL)); + sdma_dumpstate_helper(SD(STATUS)); + sdma_dumpstate_helper0(SD(ERR_STATUS)); + sdma_dumpstate_helper0(SD(ERR_MASK)); + sdma_dumpstate_helper(SD(ENG_ERR_STATUS)); + sdma_dumpstate_helper(SD(ENG_ERR_MASK)); + + for (i = 0; i < CCE_NUM_INT_CSRS; ++i) { + sdma_dumpstate_helper2(CCE_INT_STATUS); + sdma_dumpstate_helper2(CCE_INT_MASK); + sdma_dumpstate_helper2(CCE_INT_BLOCKED); + } + + sdma_dumpstate_helper(SD(TAIL)); + sdma_dumpstate_helper(SD(HEAD)); + sdma_dumpstate_helper(SD(PRIORITY_THLD)); + sdma_dumpstate_helper(SD(IDLE_CNT)); + sdma_dumpstate_helper(SD(RELOAD_CNT)); + sdma_dumpstate_helper(SD(DESC_CNT)); + sdma_dumpstate_helper(SD(DESC_FETCHED_CNT)); + sdma_dumpstate_helper(SD(MEMORY)); + sdma_dumpstate_helper0(SD(ENGINES)); + sdma_dumpstate_helper0(SD(MEM_SIZE)); + /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */ + sdma_dumpstate_helper(SD(BASE_ADDR)); + sdma_dumpstate_helper(SD(LEN_GEN)); + sdma_dumpstate_helper(SD(HEAD_ADDR)); + sdma_dumpstate_helper(SD(CHECK_ENABLE)); + sdma_dumpstate_helper(SD(CHECK_VL)); + sdma_dumpstate_helper(SD(CHECK_JOB_KEY)); + sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY)); + sdma_dumpstate_helper(SD(CHECK_SLID)); + sdma_dumpstate_helper(SD(CHECK_OPCODE)); +} +#endif + +static void dump_sdma_state(struct sdma_engine *sde) +{ + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + u16 head, tail, cnt; + + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + cnt = sdma_descq_freecnt(sde); + + dd_dev_err(sde->dd, + "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n", + sde->this_idx, head, tail, cnt, + !list_empty(&sde->flushlist)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + dd_dev_err(sde->dd, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + dd_dev_err(sde->dd, + "\tdesc0:0x%016llx desc1 0x%016llx\n", + desc[0], desc[1]); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + dd_dev_err(sde->dd, + "\taidx: %u amode: %u alen: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_DWS_SMASK) >> + SDMA_DESC1_HEADER_DWS_SHIFT)); + head++; + head &= sde->sdma_mask; + } +} + +#define SDE_FMT \ + "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n" +/** + * sdma_seqfile_dump_sde() - debugfs dump of sde + * @s: seq file + * @sde: send dma engine to dump + * + * This routine dumps the sde to the indicated seq file. + */ +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) +{ + u16 head, tail; + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + + head = sde->descq_head & sde->sdma_mask; + tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; + seq_printf(s, SDE_FMT, sde->this_idx, + sde->cpu, + sdma_state_name(sde->state.current_state), + (unsigned long long)read_sde_csr(sde, SD(CTRL)), + (unsigned long long)read_sde_csr(sde, SD(STATUS)), + (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)), + (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail, + (unsigned long long)read_sde_csr(sde, SD(HEAD)), head, + (unsigned long long)le64_to_cpu(*sde->head_dma), + (unsigned long long)read_sde_csr(sde, SD(MEMORY)), + (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)), + (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)), + (unsigned long long)sde->last_status, + (unsigned long long)sde->ahg_bits, + sde->tx_tail, + sde->tx_head, + sde->descq_tail, + sde->descq_head, + !list_empty(&sde->flushlist), + sde->descq_full_count, + (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + seq_printf(s, + "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT)); + head = (head + 1) & sde->sdma_mask; + } +} + +/* + * add the generation number into + * the qw1 and return + */ +static inline u64 add_gen(struct sdma_engine *sde, u64 qw1) +{ + u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3; + + qw1 &= ~SDMA_DESC1_GENERATION_SMASK; + qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + return qw1; +} + +/* + * This routine submits the indicated tx + * + * Space has already been guaranteed and + * tail side of ring is locked. + * + * The hardware tail update is done + * in the caller and that is facilitated + * by returning the new tail. + * + * There is special case logic for ahg + * to not add the generation number for + * up to 2 descriptors that follow the + * first descriptor. + * + */ +static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) +{ + int i; + u16 tail; + struct sdma_desc *descp = tx->descp; + u8 skip = 0, mode = ahg_mode(tx); + + tail = sde->descq_tail & sde->sdma_mask; + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1])); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1], + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + descp++; + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1; i < tx->num_desc; i++, descp++) { + u64 qw1; + + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + if (skip) { + /* edits don't have generation */ + qw1 = descp->qw[1]; + skip--; + } else { + /* replace generation with real one for non-edits */ + qw1 = add_gen(sde, descp->qw[1]); + } + sde->descq[tail].qw[1] = cpu_to_le64(qw1); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1, + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + } + tx->next_descq_idx = tail; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); + WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]); +#endif + sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx; + sde->desc_avail -= tx->num_desc; + return tail; +} + +/* + * Check for progress + */ +static int sdma_check_progress( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + bool pkts_sent) +{ + int ret; + + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc <= sde->desc_avail) + return -EAGAIN; + /* pulse the head_lock */ + if (wait && iowait_ioww_to_iow(wait)->sleep) { + unsigned seq; + + seq = raw_seqcount_begin( + (const seqcount_t *)&sde->head_lock.seqcount); + ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent); + if (ret == -EAGAIN) + sde->desc_avail = sdma_descq_freecnt(sde); + } else { + ret = -EBUSY; + } + return ret; +} + +/** + * sdma_send_txreq() - submit a tx req to ring + * @sde: sdma engine to use + * @wait: SE wait structure to use when full (may be NULL) + * @tx: sdma_txreq to submit + * @pkts_sent: has any packet been sent yet? + * + * The call submits the tx into the ring. If a iowait structure is non-NULL + * the packet will be queued to the list in wait. + * + * Return: + * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in + * ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + bool pkts_sent) +{ + int ret = 0; + u16 tail; + unsigned long flags; + + /* user should have supplied entire packet */ + if (unlikely(tx->tlen)) + return -EINVAL; + tx->wait = iowait_ioww_to_iow(wait); + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + tail = submit_tx(sde, tx); + if (wait) + iowait_sdma_inc(iowait_ioww_to_iow(wait)); + sdma_update_tail(sde, tail); +unlock: + spin_unlock_irqrestore(&sde->tail_lock, flags); + return ret; +unlock_noconn: + if (wait) + iowait_sdma_inc(iowait_ioww_to_iow(wait)); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + spin_lock(&sde->flushlist_lock); + list_add_tail(&tx->list, &sde->flushlist); + spin_unlock(&sde->flushlist_lock); + iowait_inc_wait_count(wait, tx->num_desc); + queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker); + ret = -ECOMM; + goto unlock; +nodesc: + ret = sdma_check_progress(sde, wait, tx, pkts_sent); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto unlock; +} + +/** + * sdma_send_txlist() - submit a list of tx req to ring + * @sde: sdma engine to use + * @wait: SE wait structure to use when full (may be NULL) + * @tx_list: list of sdma_txreqs to submit + * @count_out: pointer to a u16 which, after return will contain the total number of + * sdma_txreqs removed from the tx_list. This will include sdma_txreqs + * whose SDMA descriptors are submitted to the ring and the sdma_txreqs + * which are added to SDMA engine flush list if the SDMA engine state is + * not running. + * + * The call submits the list into the ring. + * + * If the iowait structure is non-NULL and not equal to the iowait list + * the unprocessed part of the list will be appended to the list in wait. + * + * In all cases, the tx_list will be updated so the head of the tx_list is + * the list of descriptors that have yet to be transmitted. + * + * The intent of this call is to provide a more efficient + * way of submitting multiple packets to SDMA while holding the tail + * side locking. + * + * Return: + * 0 - Success, + * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait, + struct list_head *tx_list, u16 *count_out) +{ + struct sdma_txreq *tx, *tx_next; + int ret = 0; + unsigned long flags; + u16 tail = INVALID_TAIL; + u32 submit_count = 0, flush_count = 0, total_count; + + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = iowait_ioww_to_iow(wait); + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + if (unlikely(tx->tlen)) { + ret = -EINVAL; + goto update_tail; + } + list_del_init(&tx->list); + tail = submit_tx(sde, tx); + submit_count++; + if (tail != INVALID_TAIL && + (submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) { + sdma_update_tail(sde, tail); + tail = INVALID_TAIL; + } + } +update_tail: + total_count = submit_count + flush_count; + if (wait) { + iowait_sdma_add(iowait_ioww_to_iow(wait), total_count); + iowait_starve_clear(submit_count > 0, + iowait_ioww_to_iow(wait)); + } + if (tail != INVALID_TAIL) + sdma_update_tail(sde, tail); + spin_unlock_irqrestore(&sde->tail_lock, flags); + *count_out = total_count; + return ret; +unlock_noconn: + spin_lock(&sde->flushlist_lock); + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = iowait_ioww_to_iow(wait); + list_del_init(&tx->list); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + list_add_tail(&tx->list, &sde->flushlist); + flush_count++; + iowait_inc_wait_count(wait, tx->num_desc); + } + spin_unlock(&sde->flushlist_lock); + queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker); + ret = -ECOMM; + goto update_tail; +nodesc: + ret = sdma_check_progress(sde, wait, tx, submit_count > 0); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto update_tail; +} + +static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + __sdma_process_event(sde, event); + + if (sde->state.current_state == sdma_state_s99_running) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void __sdma_process_event(struct sdma_engine *sde, + enum sdma_events event) +{ + struct sdma_state *ss = &sde->state; + int need_progress = 0; + + /* CONFIG SDMA temporary */ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx, + sdma_state_names[ss->current_state], + sdma_event_names[event]); +#endif + + switch (ss->current_state) { + case sdma_state_s00_hw_down: + switch (event) { + case sdma_event_e00_go_hw_down: + break; + case sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. + */ + ss->go_s99_running = 1; + fallthrough; /* and start dma engine */ + case sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&sde->state); + sdma_set_state(sde, + sdma_state_s10_hw_start_up_halt_wait); + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s10_hw_start_up_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, + sdma_state_s15_hw_start_up_clean_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s15_hw_start_up_clean_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s20_idle: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + sdma_set_state(sde, sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e85_link_down: + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s30_sw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s40_hw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s50_hw_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s60_idle_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s80_hw_freeze: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + sdma_set_state(sde, sdma_state_s82_freeze_sw_clean); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s82_freeze_sw_clean: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + /* notify caller this engine is done cleaning */ + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s99_running: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + need_progress = 1; + sdma_err_progress_check_schedule(sde); + fallthrough; + case sdma_event_e90_sw_halted: + /* + * SW initiated halt does not perform engines + * progress check + */ + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + sdma_set_state(sde, sdma_state_s60_idle_halt_wait); + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + fallthrough; + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + } + break; + } + + ss->last_event = event; + if (need_progress) + sdma_make_progress(sde, 0); +} + +/* + * _extend_sdma_tx_descs() - helper to extend txreq + * + * This is called once the initial nominal allocation + * of descriptors in the sdma_txreq is exhausted. + * + * The code will bump the allocation up to the max + * of MAX_DESC (64) descriptors. There doesn't seem + * much point in an interim step. The last descriptor + * is reserved for coalesce buffer in order to support + * cases where input packet has >MAX_DESC iovecs. + * + */ +static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int i; + struct sdma_desc *descp; + + /* Handle last descriptor */ + if (unlikely((tx->num_desc == (MAX_DESC - 1)))) { + /* if tlen is 0, it is for padding, release last descriptor */ + if (!tx->tlen) { + tx->desc_limit = MAX_DESC; + } else if (!tx->coalesce_buf) { + /* allocate coalesce buffer with space for padding */ + tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32), + GFP_ATOMIC); + if (!tx->coalesce_buf) + goto enomem; + tx->coalesce_idx = 0; + } + return 0; + } + + if (unlikely(tx->num_desc == MAX_DESC)) + goto enomem; + + descp = kmalloc_array(MAX_DESC, sizeof(struct sdma_desc), GFP_ATOMIC); + if (!descp) + goto enomem; + tx->descp = descp; + + /* reserve last descriptor for coalescing */ + tx->desc_limit = MAX_DESC - 1; + /* copy ones already built */ + for (i = 0; i < tx->num_desc; i++) + tx->descp[i] = tx->descs[i]; + return 0; +enomem: + __sdma_txclean(dd, tx); + return -ENOMEM; +} + +/* + * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors + * + * This is called once the initial nominal allocation of descriptors + * in the sdma_txreq is exhausted. + * + * This function calls _extend_sdma_tx_descs to extend or allocate + * coalesce buffer. If there is a allocated coalesce buffer, it will + * copy the input packet data into the coalesce buffer. It also adds + * coalesce buffer descriptor once when whole packet is received. + * + * Return: + * <0 - error + * 0 - coalescing, don't populate descriptor + * 1 - continue with populating descriptor + */ +int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, + int type, void *kvaddr, struct page *page, + unsigned long offset, u16 len) +{ + int pad_len, rval; + dma_addr_t addr; + + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) { + __sdma_txclean(dd, tx); + return rval; + } + + /* If coalesce buffer is allocated, copy data into it */ + if (tx->coalesce_buf) { + if (type == SDMA_MAP_NONE) { + __sdma_txclean(dd, tx); + return -EINVAL; + } + + if (type == SDMA_MAP_PAGE) { + kvaddr = kmap_local_page(page); + kvaddr += offset; + } else if (WARN_ON(!kvaddr)) { + __sdma_txclean(dd, tx); + return -EINVAL; + } + + memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len); + tx->coalesce_idx += len; + if (type == SDMA_MAP_PAGE) + kunmap_local(kvaddr); + + /* If there is more data, return */ + if (tx->tlen - tx->coalesce_idx) + return 0; + + /* Whole packet is received; add any padding */ + pad_len = tx->packet_len & (sizeof(u32) - 1); + if (pad_len) { + pad_len = sizeof(u32) - pad_len; + memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len); + /* padding is taken care of for coalescing case */ + tx->packet_len += pad_len; + tx->tlen += pad_len; + } + + /* dma map the coalesce buffer */ + addr = dma_map_single(&dd->pcidev->dev, + tx->coalesce_buf, + tx->tlen, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + __sdma_txclean(dd, tx); + return -ENOSPC; + } + + /* Add descriptor for coalesce buffer */ + tx->desc_limit = MAX_DESC; + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, + addr, tx->tlen, NULL, NULL, NULL); + } + + return 1; +} + +/* Update sdes when the lmc changes */ +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid) +{ + struct sdma_engine *sde; + int i; + u64 sreg; + + sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) << + SD(CHECK_SLID_MASK_SHIFT)) | + (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) << + SD(CHECK_SLID_VALUE_SHIFT)); + + for (i = 0; i < dd->num_sdma; i++) { + hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + sde = &dd->per_sdma[i]; + write_sde_csr(sde, SD(CHECK_SLID), sreg); + } +} + +/* tx not dword sized - pad */ +int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int rval = 0; + + if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) { + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) { + __sdma_txclean(dd, tx); + return rval; + } + } + + /* finish the one just added */ + make_tx_sdma_desc( + tx, + SDMA_MAP_NONE, + dd->sdma_pad_phys, + sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)), + NULL, NULL, NULL); + tx->num_desc++; + _sdma_close_tx(dd, tx); + return rval; +} + +/* + * Add ahg to the sdma_txreq + * + * The logic will consume up to 3 + * descriptors at the beginning of + * sdma_txreq. + */ +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen) +{ + u32 i, shift = 0, desc = 0; + u8 mode; + + WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4); + /* compute mode */ + if (num_ahg == 1) + mode = SDMA_AHG_APPLY_UPDATE1; + else if (num_ahg <= 5) + mode = SDMA_AHG_APPLY_UPDATE2; + else + mode = SDMA_AHG_APPLY_UPDATE3; + tx->num_desc++; + /* initialize to consumed descriptors to zero */ + switch (mode) { + case SDMA_AHG_APPLY_UPDATE3: + tx->num_desc++; + tx->descs[2].qw[0] = 0; + tx->descs[2].qw[1] = 0; + fallthrough; + case SDMA_AHG_APPLY_UPDATE2: + tx->num_desc++; + tx->descs[1].qw[0] = 0; + tx->descs[1].qw[1] = 0; + break; + } + ahg_hlen >>= 2; + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK) + << SDMA_DESC1_HEADER_DWS_SHIFT) | + (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT) | + (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK) + << SDMA_DESC1_HEADER_UPDATE1_SHIFT); + for (i = 0; i < (num_ahg - 1); i++) { + if (!shift && !(i & 2)) + desc++; + tx->descs[desc].qw[!!(i & 2)] |= + (((u64)ahg[i + 1]) + << shift); + shift = (shift + 32) & 63; + } +} + +/** + * sdma_ahg_alloc - allocate an AHG entry + * @sde: engine to allocate from + * + * Return: + * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled, + * -ENOSPC if an entry is not available + */ +int sdma_ahg_alloc(struct sdma_engine *sde) +{ + int nr; + int oldbit; + + if (!sde) { + trace_hfi1_ahg_allocate(sde, -EINVAL); + return -EINVAL; + } + while (1) { + nr = ffz(READ_ONCE(sde->ahg_bits)); + if (nr > 31) { + trace_hfi1_ahg_allocate(sde, -ENOSPC); + return -ENOSPC; + } + oldbit = test_and_set_bit(nr, &sde->ahg_bits); + if (!oldbit) + break; + cpu_relax(); + } + trace_hfi1_ahg_allocate(sde, nr); + return nr; +} + +/** + * sdma_ahg_free - free an AHG entry + * @sde: engine to return AHG entry + * @ahg_index: index to free + * + * This routine frees the indicate AHG entry. + */ +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index) +{ + if (!sde) + return; + trace_hfi1_ahg_deallocate(sde, ahg_index); + if (ahg_index < 0 || ahg_index > 31) + return; + clear_bit(ahg_index, &sde->ahg_bits); +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is going into a freeze but before the freeze is fully + * settled. Generally an error interrupt. + * + * This event will pull the engine out of running so no more entries can be + * added to the engine's queue. + */ +void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down) +{ + int i; + enum sdma_events event = link_down ? sdma_event_e85_link_down : + sdma_event_e80_hw_freeze; + + /* set up the wait but do not wait here */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines to stop running and wait */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], event); + + /* sdma_freeze() will wait for all engines to have stopped */ +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is fully frozen. + */ +void sdma_freeze(struct hfi1_devdata *dd) +{ + int i; + int ret; + + /* + * Make sure all engines have moved out of the running state before + * continuing. + */ + ret = wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= + 0); + /* interrupted or count is negative, then unloading - just exit */ + if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0) + return; + + /* set up the count for the next wait */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines that the SPC is frozen, they can start cleaning */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen); + + /* + * Wait for everyone to finish software clean before exiting. The + * software clean will read engine CSRs, so must be completed before + * the next step, which will clear the engine CSRs. + */ + (void)wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= 0); + /* no need to check results - done no matter what */ +} + +/* + * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen. + * + * The SPC freeze acts like a SDMA halt and a hardware clean combined. All + * that is left is a software clean. We could do it after the SPC is fully + * frozen, but then we'd have to add another state to wait for the unfreeze. + * Instead, just defer the software clean until the unfreeze step. + */ +void sdma_unfreeze(struct hfi1_devdata *dd) +{ + int i; + + /* tell all engines start freeze clean up */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], + sdma_event_e82_hw_unfreeze); +} + +/** + * _sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + */ +void _sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + trace_hfi1_sdma_engine_progress(sde, sde->progress_mask); + /* assume we have selected a good cpu */ + write_csr(sde->dd, + CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)), + sde->progress_mask); +} diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h new file mode 100644 index 0000000000..7fdebab202 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -0,0 +1,1056 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#ifndef _HFI1_SDMA_H +#define _HFI1_SDMA_H + +#include <linux/types.h> +#include <linux/list.h> +#include <asm/byteorder.h> +#include <linux/workqueue.h> +#include <linux/rculist.h> + +#include "hfi.h" +#include "verbs.h" +#include "sdma_txreq.h" + +/* Hardware limit */ +#define MAX_DESC 64 +/* Hardware limit for SDMA packet size */ +#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) + +#define SDMA_MAP_NONE 0 +#define SDMA_MAP_SINGLE 1 +#define SDMA_MAP_PAGE 2 + +#define SDMA_AHG_VALUE_MASK 0xffff +#define SDMA_AHG_VALUE_SHIFT 0 +#define SDMA_AHG_INDEX_MASK 0xf +#define SDMA_AHG_INDEX_SHIFT 16 +#define SDMA_AHG_FIELD_LEN_MASK 0xf +#define SDMA_AHG_FIELD_LEN_SHIFT 20 +#define SDMA_AHG_FIELD_START_MASK 0x1f +#define SDMA_AHG_FIELD_START_SHIFT 24 +#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1 +#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31 + +/* AHG modes */ + +/* + * Be aware the ordering and values + * for SDMA_AHG_APPLY_UPDATE[123] + * are assumed in generating a skip + * count in submit_tx() in sdma.c + */ +#define SDMA_AHG_NO_AHG 0 +#define SDMA_AHG_COPY 1 +#define SDMA_AHG_APPLY_UPDATE1 2 +#define SDMA_AHG_APPLY_UPDATE2 3 +#define SDMA_AHG_APPLY_UPDATE3 4 + +/* + * Bits defined in the send DMA descriptor. + */ +#define SDMA_DESC0_FIRST_DESC_FLAG BIT_ULL(63) +#define SDMA_DESC0_LAST_DESC_FLAG BIT_ULL(62) +#define SDMA_DESC0_BYTE_COUNT_SHIFT 48 +#define SDMA_DESC0_BYTE_COUNT_WIDTH 14 +#define SDMA_DESC0_BYTE_COUNT_MASK \ + ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1) +#define SDMA_DESC0_BYTE_COUNT_SMASK \ + (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT) +#define SDMA_DESC0_PHY_ADDR_SHIFT 0 +#define SDMA_DESC0_PHY_ADDR_WIDTH 48 +#define SDMA_DESC0_PHY_ADDR_MASK \ + ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1) +#define SDMA_DESC0_PHY_ADDR_SMASK \ + (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT) + +#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32 +#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32 +#define SDMA_DESC1_HEADER_UPDATE1_MASK \ + ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1) +#define SDMA_DESC1_HEADER_UPDATE1_SMASK \ + (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT) +#define SDMA_DESC1_HEADER_MODE_SHIFT 13 +#define SDMA_DESC1_HEADER_MODE_WIDTH 3 +#define SDMA_DESC1_HEADER_MODE_MASK \ + ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1) +#define SDMA_DESC1_HEADER_MODE_SMASK \ + (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT) +#define SDMA_DESC1_HEADER_INDEX_SHIFT 8 +#define SDMA_DESC1_HEADER_INDEX_WIDTH 5 +#define SDMA_DESC1_HEADER_INDEX_MASK \ + ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1) +#define SDMA_DESC1_HEADER_INDEX_SMASK \ + (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT) +#define SDMA_DESC1_HEADER_DWS_SHIFT 4 +#define SDMA_DESC1_HEADER_DWS_WIDTH 4 +#define SDMA_DESC1_HEADER_DWS_MASK \ + ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1) +#define SDMA_DESC1_HEADER_DWS_SMASK \ + (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT) +#define SDMA_DESC1_GENERATION_SHIFT 2 +#define SDMA_DESC1_GENERATION_WIDTH 2 +#define SDMA_DESC1_GENERATION_MASK \ + ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1) +#define SDMA_DESC1_GENERATION_SMASK \ + (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT) +#define SDMA_DESC1_INT_REQ_FLAG BIT_ULL(1) +#define SDMA_DESC1_HEAD_TO_HOST_FLAG BIT_ULL(0) + +enum sdma_states { + sdma_state_s00_hw_down, + sdma_state_s10_hw_start_up_halt_wait, + sdma_state_s15_hw_start_up_clean_wait, + sdma_state_s20_idle, + sdma_state_s30_sw_clean_up_wait, + sdma_state_s40_hw_clean_up_wait, + sdma_state_s50_hw_halt_wait, + sdma_state_s60_idle_halt_wait, + sdma_state_s80_hw_freeze, + sdma_state_s82_freeze_sw_clean, + sdma_state_s99_running, +}; + +enum sdma_events { + sdma_event_e00_go_hw_down, + sdma_event_e10_go_hw_start, + sdma_event_e15_hw_halt_done, + sdma_event_e25_hw_clean_up_done, + sdma_event_e30_go_running, + sdma_event_e40_sw_cleaned, + sdma_event_e50_hw_cleaned, + sdma_event_e60_hw_halted, + sdma_event_e70_go_idle, + sdma_event_e80_hw_freeze, + sdma_event_e81_hw_frozen, + sdma_event_e82_hw_unfreeze, + sdma_event_e85_link_down, + sdma_event_e90_sw_halted, +}; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_cleanup:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct sdma_state { + struct kref kref; + struct completion comp; + enum sdma_states current_state; + unsigned current_op; + unsigned go_s99_running; + /* debugging/development */ + enum sdma_states previous_state; + unsigned previous_op; + enum sdma_events last_event; +}; + +/** + * DOC: sdma exported routines + * + * These sdma routines fit into three categories: + * - The SDMA API for building and submitting packets + * to the ring + * + * - Initialization and tear down routines to buildup + * and tear down SDMA + * + * - ISR entrances to handle interrupts, state changes + * and errors + */ + +/** + * DOC: sdma PSM/verbs API + * + * The sdma API is designed to be used by both PSM + * and verbs to supply packets to the SDMA ring. + * + * The usage of the API is as follows: + * + * Embed a struct iowait in the QP or + * PQ. The iowait should be initialized with a + * call to iowait_init(). + * + * The user of the API should create an allocation method + * for their version of the txreq. slabs, pre-allocated lists, + * and dma pools can be used. Once the user's overload of + * the sdma_txreq has been allocated, the sdma_txreq member + * must be initialized with sdma_txinit() or sdma_txinit_ahg(). + * + * The txreq must be declared with the sdma_txreq first. + * + * The tx request, once initialized, is manipulated with calls to + * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr() + * for each disjoint memory location. It is the user's responsibility + * to understand the packet boundaries and page boundaries to do the + * appropriate number of sdma_txadd_* calls.. The user + * must be prepared to deal with failures from these routines due to + * either memory allocation or dma_mapping failures. + * + * The mapping specifics for each memory location are recorded + * in the tx. Memory locations added with sdma_txadd_page() + * and sdma_txadd_kvaddr() are automatically mapped when added + * to the tx and nmapped as part of the progress processing in the + * SDMA interrupt handling. + * + * sdma_txadd_daddr() is used to add an dma_addr_t memory to the + * tx. An example of a use case would be a pre-allocated + * set of headers allocated via dma_pool_alloc() or + * dma_alloc_coherent(). For these memory locations, it + * is the responsibility of the user to handle that unmapping. + * (This would usually be at an unload or job termination.) + * + * The routine sdma_send_txreq() is used to submit + * a tx to the ring after the appropriate number of + * sdma_txadd_* have been done. + * + * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist() + * can be used to submit a list of packets. + * + * The user is free to use the link overhead in the struct sdma_txreq as + * long as the tx isn't in flight. + * + * The extreme degenerate case of the number of descriptors + * exceeding the ring size is automatically handled as + * memory locations are added. An overflow of the descriptor + * array that is part of the sdma_txreq is also automatically + * handled. + * + */ + +/** + * DOC: Infrastructure calls + * + * sdma_init() is used to initialize data structures and + * CSRs for the desired number of SDMA engines. + * + * sdma_start() is used to kick the SDMA engines initialized + * with sdma_init(). Interrupts must be enabled at this + * point since aspects of the state machine are interrupt + * driven. + * + * sdma_engine_error() and sdma_engine_interrupt() are + * entrances for interrupts. + * + * sdma_map_init() is for the management of the mapping + * table when the number of vls is changed. + * + */ + +/* + * struct hw_sdma_desc - raw 128 bit SDMA descriptor + * + * This is the raw descriptor in the SDMA ring + */ +struct hw_sdma_desc { + /* private: don't use directly */ + __le64 qw[2]; +}; + +/** + * struct sdma_engine - Data pertaining to each SDMA engine. + * @dd: a back-pointer to the device data + * @ppd: per port back-pointer + * @imask: mask for irq manipulation + * @idle_mask: mask for determining if an interrupt is due to sdma_idle + * + * This structure has the state for each sdma_engine. + * + * Accessing to non public fields are not supported + * since the private members are subject to change. + */ +struct sdma_engine { + /* read mostly */ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + /* private: */ + void __iomem *tail_csr; + u64 imask; /* clear interrupt mask */ + u64 idle_mask; + u64 progress_mask; + u64 int_mask; + /* private: */ + volatile __le64 *head_dma; /* DMA'ed by chip */ + /* private: */ + dma_addr_t head_phys; + /* private: */ + struct hw_sdma_desc *descq; + /* private: */ + unsigned descq_full_count; + struct sdma_txreq **tx_ring; + /* private: */ + dma_addr_t descq_phys; + /* private */ + u32 sdma_mask; + /* private */ + struct sdma_state state; + /* private */ + int cpu; + /* private: */ + u8 sdma_shift; + /* private: */ + u8 this_idx; /* zero relative engine */ + /* protect changes to senddmactrl shadow */ + spinlock_t senddmactrl_lock; + /* private: */ + u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */ + + /* read/write using tail_lock */ + spinlock_t tail_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 tail_sn; +#endif + /* private: */ + u32 descq_tail; + /* private: */ + unsigned long ahg_bits; + /* private: */ + u16 desc_avail; + /* private: */ + u16 tx_tail; + /* private: */ + u16 descq_cnt; + + /* read/write using head_lock */ + /* private: */ + seqlock_t head_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 head_sn; +#endif + /* private: */ + u32 descq_head; + /* private: */ + u16 tx_head; + /* private: */ + u64 last_status; + /* private */ + u64 err_cnt; + /* private */ + u64 sdma_int_cnt; + u64 idle_int_cnt; + u64 progress_int_cnt; + + /* private: */ + seqlock_t waitlock; + struct list_head dmawait; + + /* CONFIG SDMA for now, just blindly duplicate */ + /* private: */ + struct tasklet_struct sdma_hw_clean_up_task + ____cacheline_aligned_in_smp; + + /* private: */ + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + /* private: */ + struct work_struct err_halt_worker; + /* private */ + struct timer_list err_progress_check_timer; + u32 progress_check_head; + /* private: */ + struct work_struct flush_worker; + /* protect flush list */ + spinlock_t flushlist_lock; + /* private: */ + struct list_head flushlist; + struct cpumask cpu_mask; + struct kobject kobj; + u32 msix_intr; +}; + +int sdma_init(struct hfi1_devdata *dd, u8 port); +void sdma_start(struct hfi1_devdata *dd); +void sdma_exit(struct hfi1_devdata *dd); +void sdma_clean(struct hfi1_devdata *dd, size_t num_engines); +void sdma_all_running(struct hfi1_devdata *dd); +void sdma_all_idle(struct hfi1_devdata *dd); +void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); +void sdma_freeze(struct hfi1_devdata *dd); +void sdma_unfreeze(struct hfi1_devdata *dd); +void sdma_wait(struct hfi1_devdata *dd); + +/** + * sdma_empty() - idle engine test + * @engine: sdma engine + * + * Currently used by verbs as a latency optimization. + * + * Return: + * 1 - empty, 0 - non-empty + */ +static inline int sdma_empty(struct sdma_engine *sde) +{ + return sde->descq_tail == sde->descq_head; +} + +static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) +{ + return sde->descq_cnt - + (sde->descq_tail - + READ_ONCE(sde->descq_head)) - 1; +} + +static inline u16 sdma_descq_inprocess(struct sdma_engine *sde) +{ + return sde->descq_cnt - sdma_descq_freecnt(sde); +} + +/* + * Either head_lock or tail lock required to see + * a steady state. + */ +static inline int __sdma_running(struct sdma_engine *engine) +{ + return engine->state.current_state == sdma_state_s99_running; +} + +/** + * sdma_running() - state suitability test + * @engine: sdma engine + * + * sdma_running probes the internal state to determine if it is suitable + * for submitting packets. + * + * Return: + * 1 - ok to submit, 0 - not ok to submit + * + */ +static inline int sdma_running(struct sdma_engine *engine) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&engine->tail_lock, flags); + ret = __sdma_running(engine); + spin_unlock_irqrestore(&engine->tail_lock, flags); + return ret; +} + +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen); + +/** + * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @ahg_entry: ahg entry to use (0 - 31) + * @num_ahg: ahg descriptor for first descriptor (0 - 9) + * @ahg: array of AHG descriptors (up to 9 entries) + * @ahg_hlen: number of bytes from ASIC entry to use + * @cb: callback + * + * The allocation of the sdma_txreq and it enclosing structure is user + * dependent. This routine must be called to initialize the user independent + * fields. + * + * The currently supported flags are SDMA_TXREQ_F_URGENT, + * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be + * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in + * the AHG descriptors into the first 1 to 3 descriptors. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. Aspects of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. The callback will be provided this tx, a status, and a flag. + * + * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + * The flag, if the is the iowait had been used, indicates the iowait + * sdma_busy count has reached zero. + * + * user data portion of tlen should be precise. The sdma_txadd_* entrances + * will pad with a descriptor references 1 - 3 bytes when the number of bytes + * specified in tlen have been supplied to the sdma_txreq. + * + * ahg_hlen is used to determine the number of on-chip entry bytes to + * use as the header. This is for cases where the stored header is + * larger than the header to be used in a packet. This is typical + * for verbs where an RDMA_WRITE_FIRST is larger than the packet in + * and RDMA_WRITE_MIDDLE. + * + */ +static inline int sdma_txinit_ahg( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + u8 ahg_entry, + u8 num_ahg, + u32 *ahg, + u8 ahg_hlen, + void (*cb)(struct sdma_txreq *, int)) +{ + if (tlen == 0) + return -ENODATA; + if (tlen > MAX_SDMA_PKT_SIZE) + return -EMSGSIZE; + tx->desc_limit = ARRAY_SIZE(tx->descs); + tx->descp = &tx->descs[0]; + INIT_LIST_HEAD(&tx->list); + tx->num_desc = 0; + tx->flags = flags; + tx->complete = cb; + tx->coalesce_buf = NULL; + tx->wait = NULL; + tx->packet_len = tlen; + tx->tlen = tx->packet_len; + tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG; + tx->descs[0].qw[1] = 0; + if (flags & SDMA_TXREQ_F_AHG_COPY) + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT); + else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg) + _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen); + return 0; +} + +/** + * sdma_txinit() - initialize an sdma_txreq struct (no AHG) + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @cb: callback pointer + * + * The allocation of the sdma_txreq and it enclosing structure is user + * dependent. This routine must be called to initialize the user + * independent fields. + * + * The currently supported flags is SDMA_TXREQ_F_URGENT. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. The head size of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. + * + * The callback, if non-NULL, will be provided this tx and a status. The + * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + */ +static inline int sdma_txinit( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + void (*cb)(struct sdma_txreq *, int)) +{ + return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb); +} + +/* helpers - don't use */ +static inline int sdma_mapping_type(struct sdma_desc *d) +{ + return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK) + >> SDMA_DESC1_GENERATION_SHIFT; +} + +static inline size_t sdma_mapping_len(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK) + >> SDMA_DESC0_BYTE_COUNT_SHIFT; +} + +static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK) + >> SDMA_DESC0_PHY_ADDR_SHIFT; +} + +static inline void make_tx_sdma_desc( + struct sdma_txreq *tx, + int type, + dma_addr_t addr, + size_t len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) +{ + struct sdma_desc *desc = &tx->descp[tx->num_desc]; + + if (!tx->num_desc) { + /* qw[0] zero; qw[1] first, ahg mode already in from init */ + desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } else { + desc->qw[0] = 0; + desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } + desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK) + << SDMA_DESC0_PHY_ADDR_SHIFT) | + (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) + << SDMA_DESC0_BYTE_COUNT_SHIFT); + + desc->pinning_ctx = pinning_ctx; + desc->ctx_put = ctx_put; + if (pinning_ctx && ctx_get) + ctx_get(pinning_ctx); +} + +/* helper to extend txreq */ +int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, + int type, void *kvaddr, struct page *page, + unsigned long offset, u16 len); +int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); +void __sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *); + +static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + if (tx->num_desc) + __sdma_txclean(dd, tx); +} + +/* helpers used by public routines */ +static inline void _sdma_close_tx(struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + u16 last_desc = tx->num_desc - 1; + + tx->descp[last_desc].qw[0] |= SDMA_DESC0_LAST_DESC_FLAG; + tx->descp[last_desc].qw[1] |= dd->default_desc1; + if (tx->flags & SDMA_TXREQ_F_URGENT) + tx->descp[last_desc].qw[1] |= (SDMA_DESC1_HEAD_TO_HOST_FLAG | + SDMA_DESC1_INT_REQ_FLAG); +} + +static inline int _sdma_txadd_daddr( + struct hfi1_devdata *dd, + int type, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) +{ + int rval = 0; + + make_tx_sdma_desc( + tx, + type, + addr, len, + pinning_ctx, ctx_get, ctx_put); + WARN_ON(len > tx->tlen); + tx->num_desc++; + tx->tlen -= len; + /* special cases for last */ + if (!tx->tlen) { + if (tx->packet_len & (sizeof(u32) - 1)) { + rval = _pad_sdma_tx_descs(dd, tx); + if (rval) + return rval; + } else { + _sdma_close_tx(dd, tx); + } + } + return rval; +} + +/** + * sdma_txadd_page() - add a page to the sdma_txreq + * @dd: the device to use for mapping + * @tx: tx request to which the page is added + * @page: page to map + * @offset: offset within the page + * @len: length in bytes + * @pinning_ctx: context to be stored on struct sdma_desc .pinning_ctx. Not + * added if coalesce buffer is used. E.g. pointer to pinned-page + * cache entry for the sdma_desc. + * @ctx_get: optional function to take reference to @pinning_ctx. Not called if + * @pinning_ctx is NULL. + * @ctx_put: optional function to release reference to @pinning_ctx after + * sdma_desc completes. May be called in interrupt context so must + * not sleep. Not called if @pinning_ctx is NULL. + * + * This is used to add a page/offset/length descriptor. + * + * The mapping/unmapping of the page/offset/len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't + * extend/coalesce descriptor array + */ +static inline int sdma_txadd_page( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + struct page *page, + unsigned long offset, + u16 len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) +{ + dma_addr_t addr; + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE, + NULL, page, offset, len); + if (rval <= 0) + return rval; + } + + addr = dma_map_page( + &dd->pcidev->dev, + page, + offset, + len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + __sdma_txclean(dd, tx); + return -ENOSPC; + } + + return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, tx, addr, len, + pinning_ctx, ctx_get, ctx_put); +} + +/** + * sdma_txadd_daddr() - add a dma address to the sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @addr: dma address mapped by caller + * @len: length in bytes + * + * This is used to add a descriptor for memory that is already dma mapped. + * + * In this case, there is no unmapping as part of the progress processing for + * this memory location. + * + * Return: + * 0 - success, -ENOMEM - couldn't extend descriptor array + */ + +static inline int sdma_txadd_daddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE, + NULL, NULL, 0, 0); + if (rval <= 0) + return rval; + } + + return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len, + NULL, NULL, NULL); +} + +/** + * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @kvaddr: the kernel virtual address + * @len: length in bytes + * + * This is used to add a descriptor referenced by the indicated kvaddr and + * len. + * + * The mapping/unmapping of the kvaddr and len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce + * descriptor array + */ +static inline int sdma_txadd_kvaddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + void *kvaddr, + u16 len) +{ + dma_addr_t addr; + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE, + kvaddr, NULL, 0, len); + if (rval <= 0) + return rval; + } + + addr = dma_map_single( + &dd->pcidev->dev, + kvaddr, + len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + __sdma_txclean(dd, tx); + return -ENOSPC; + } + + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, addr, len, + NULL, NULL, NULL); +} + +struct iowait_work; + +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + bool pkts_sent); +int sdma_send_txlist(struct sdma_engine *sde, + struct iowait_work *wait, + struct list_head *tx_list, + u16 *count_out); + +int sdma_ahg_alloc(struct sdma_engine *sde); +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); + +/** + * sdma_build_ahg - build ahg descriptor + * @data + * @dwindex + * @startbit + * @bits + * + * Build and return a 32 bit descriptor. + */ +static inline u32 sdma_build_ahg_descriptor( + u16 data, + u8 dwindex, + u8 startbit, + u8 bits) +{ + return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT | + ((startbit & SDMA_AHG_FIELD_START_MASK) << + SDMA_AHG_FIELD_START_SHIFT) | + ((bits & SDMA_AHG_FIELD_LEN_MASK) << + SDMA_AHG_FIELD_LEN_SHIFT) | + ((dwindex & SDMA_AHG_INDEX_MASK) << + SDMA_AHG_INDEX_SHIFT) | + ((data & SDMA_AHG_VALUE_MASK) << + SDMA_AHG_VALUE_SHIFT)); +} + +/** + * sdma_progress - use seq number of detect head progress + * @sde: sdma_engine to check + * @seq: base seq count + * @tx: txreq for which we need to check descriptor availability + * + * This is used in the appropriate spot in the sleep routine + * to check for potential ring progress. This routine gets the + * seqcount before queuing the iowait structure for progress. + * + * If the seqcount indicates that progress needs to be checked, + * re-submission is detected by checking whether the descriptor + * queue has enough descriptor for the txreq. + */ +static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq, + struct sdma_txreq *tx) +{ + if (read_seqretry(&sde->head_lock, seq)) { + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc > sde->desc_avail) + return 0; + return 1; + } + return 0; +} + +/* for use by interrupt handling */ +void sdma_engine_error(struct sdma_engine *sde, u64 status); +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status); + +/* + * + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform engines per vl, the + * number of engines for a vl is either the vl_engines[vl] or + * a computation based on num_sdma/num_vls: + * + * For example: + * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_sdma/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the engines are assigned + * in a round robin fashion wrapping back to the first engine + * for a particular vl. + * + * dd->sdma_map + * | sdma_map_elem[0] + * | +--------------------+ + * v | mask | + * sdma_vl_map |--------------------| + * +--------------------------+ | sde[0] -> eng 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| sde[1] -> eng 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | sde[n-1] -> eng n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +---------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |---------------------| + * | * | \-- | sde[0] -> eng 1+n | + * | * | \---- |---------------------| + * | * | \->| sde[1] -> eng 2+n | + * |--------------------------| |---------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |---------------------| + * \- | sde[m-1] -> eng m+n | + * \ +---------------------+ + * \- + * \ + * \- +----------------------+ + * \- | mask | + * \ |----------------------| + * \- | sde[0] -> eng 1+m+n | + * \- |----------------------| + * >| sde[1] -> eng 2+m+n | + * |----------------------| + * | * | + * |----------------------| + * | sde[o-1] -> eng o+m+n| + * +----------------------+ + * + */ + +/** + * struct sdma_map_elem - mapping for a vl + * @mask - selector mask + * @sde - array of engines for this vl + * + * The mask is used to "mod" the selector + * to produce index into the trailing + * array of sdes. + */ +struct sdma_map_elem { + u32 mask; + struct sdma_engine *sde[]; +}; + +/** + * struct sdma_map_el - mapping for a vl + * @engine_to_vl - map of an engine to a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - number of vls rounded to next power of 2 + * @map - array of sdma_map_elem entries + * + * This is the parent mapping structure. The trailing + * members of the struct point to sdma_map_elem entries, which + * in turn point to an array of sde's for that vl. + */ +struct sdma_vl_map { + s8 engine_to_vl[TXE_NUM_SDMA_ENGINES]; + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct sdma_map_elem *map[]; +}; + +int sdma_map_init( + struct hfi1_devdata *dd, + u8 port, + u8 num_vls, + u8 *vl_engines); + +/* slow path */ +void _sdma_engine_progress_schedule(struct sdma_engine *sde); + +/** + * sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + * This is the fast path. + * + */ +static inline void sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8)) + return; + _sdma_engine_progress_schedule(sde); +} + +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5); + +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl); + +struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, + u32 selector, u8 vl); +ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf); +ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, + size_t count); +int sdma_engine_get_vl(struct sdma_engine *sde); +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *); +void sdma_seqfile_dump_cpu_list(struct seq_file *s, struct hfi1_devdata *dd, + unsigned long cpuid); + +#ifdef CONFIG_SDMA_VERBOSITY +void sdma_dumpstate(struct sdma_engine *); +#endif +static inline char *slashstrip(char *s) +{ + char *r = s; + + while (*s) + if (*s++ == '/') + r = s; + return r; +} + +u16 sdma_get_descq_cnt(void); + +extern uint mod_num_sdma; + +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); +#endif diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h new file mode 100644 index 0000000000..85ae7293c2 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2016 Intel Corporation. + */ + +#ifndef HFI1_SDMA_TXREQ_H +#define HFI1_SDMA_TXREQ_H + +/* increased for AHG */ +#define NUM_DESC 6 + +/* + * struct sdma_desc - canonical fragment descriptor + * + * This is the descriptor carried in the tx request + * corresponding to each fragment. + * + */ +struct sdma_desc { + /* private: don't use directly */ + u64 qw[2]; + void *pinning_ctx; + /* Release reference to @pinning_ctx. May be called in interrupt context. Must not sleep. */ + void (*ctx_put)(void *ctx); +}; + +/** + * struct sdma_txreq - the sdma_txreq structure (one per packet) + * @list: for use by user and by queuing for wait + * + * This is the representation of a packet which consists of some + * number of fragments. Storage is provided to within the structure. + * for all fragments. + * + * The storage for the descriptors are automatically extended as needed + * when the currently allocation is exceeded. + * + * The user (Verbs or PSM) may overload this structure with fields + * specific to their use by putting this struct first in their struct. + * The method of allocation of the overloaded structure is user dependent + * + * The list is the only public field in the structure. + * + */ + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 +#define SDMA_TXREQ_F_VIP 0x0010 + +struct sdma_txreq; +typedef void (*callback_t)(struct sdma_txreq *, int); + +struct iowait; +struct sdma_txreq { + struct list_head list; + /* private: */ + struct sdma_desc *descp; + /* private: */ + void *coalesce_buf; + /* private: */ + struct iowait *wait; + /* private: */ + callback_t complete; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + u64 sn; +#endif + /* private: - used in coalesce/pad processing */ + u16 packet_len; + /* private: - down-counted to trigger last */ + u16 tlen; + /* private: */ + u16 num_desc; + /* private: */ + u16 desc_limit; + /* private: */ + u16 next_descq_idx; + /* private: */ + u16 coalesce_idx; + /* private: flags */ + u16 flags; + /* private: */ + struct sdma_desc descs[NUM_DESC]; +}; + +static inline int sdma_txreq_built(struct sdma_txreq *tx) +{ + return tx->num_desc; +} + +#endif /* HFI1_SDMA_TXREQ_H */ diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c new file mode 100644 index 0000000000..3b3407dc7c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -0,0 +1,697 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015-2017 Intel Corporation. + */ + +#include <linux/ctype.h> +#include <rdma/ib_sysfs.h> + +#include "hfi.h" +#include "mad.h" +#include "trace.h" + +static struct hfi1_pportdata *hfi1_get_pportdata_kobj(struct kobject *kobj) +{ + u32 port_num; + struct ib_device *ibdev = ib_port_sysfs_get_ibdev_kobj(kobj, &port_num); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + return &dd->pport[port_num - 1]; +} + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); + struct cc_state *cc_state; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (!cc_state) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, (void *)&cc_state->cct + pos, count); + rcu_read_unlock(); + + return count; +} +static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. + */ +static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); + int ret; + struct cc_state *cc_state; + + ret = sizeof(struct opa_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (!cc_state) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, (void *)&cc_state->cong_setting + pos, count); + rcu_read_unlock(); + + return count; +} +static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); + +static struct bin_attribute *port_cc_bin_attributes[] = { + &bin_attr_cc_setting_bin, + &bin_attr_cc_table_bin, + NULL +}; + +static ssize_t cc_prescan_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + + return sysfs_emit(buf, "%s\n", ppd->cc_prescan ? "on" : "off"); +} + +static ssize_t cc_prescan_store(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, const char *buf, + size_t count) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + + if (!memcmp(buf, "on", 2)) + ppd->cc_prescan = true; + else if (!memcmp(buf, "off", 3)) + ppd->cc_prescan = false; + + return count; +} +static IB_PORT_ATTR_ADMIN_RW(cc_prescan); + +static struct attribute *port_cc_attributes[] = { + &ib_port_attr_cc_prescan.attr, + NULL +}; + +static const struct attribute_group port_cc_group = { + .name = "CCMgtA", + .attrs = port_cc_attributes, + .bin_attrs = port_cc_bin_attributes, +}; + +/* Start sc2vl */ +struct hfi1_sc2vl_attr { + struct ib_port_attribute attr; + int sc; +}; + +static ssize_t sc2vl_attr_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hfi1_sc2vl_attr *sattr = + container_of(attr, struct hfi1_sc2vl_attr, attr); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + return sysfs_emit(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc)); +} + +#define HFI1_SC2VL_ATTR(N) \ + static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \ + .attr = __ATTR(N, 0444, sc2vl_attr_show, NULL), \ + .sc = N, \ + } + +HFI1_SC2VL_ATTR(0); +HFI1_SC2VL_ATTR(1); +HFI1_SC2VL_ATTR(2); +HFI1_SC2VL_ATTR(3); +HFI1_SC2VL_ATTR(4); +HFI1_SC2VL_ATTR(5); +HFI1_SC2VL_ATTR(6); +HFI1_SC2VL_ATTR(7); +HFI1_SC2VL_ATTR(8); +HFI1_SC2VL_ATTR(9); +HFI1_SC2VL_ATTR(10); +HFI1_SC2VL_ATTR(11); +HFI1_SC2VL_ATTR(12); +HFI1_SC2VL_ATTR(13); +HFI1_SC2VL_ATTR(14); +HFI1_SC2VL_ATTR(15); +HFI1_SC2VL_ATTR(16); +HFI1_SC2VL_ATTR(17); +HFI1_SC2VL_ATTR(18); +HFI1_SC2VL_ATTR(19); +HFI1_SC2VL_ATTR(20); +HFI1_SC2VL_ATTR(21); +HFI1_SC2VL_ATTR(22); +HFI1_SC2VL_ATTR(23); +HFI1_SC2VL_ATTR(24); +HFI1_SC2VL_ATTR(25); +HFI1_SC2VL_ATTR(26); +HFI1_SC2VL_ATTR(27); +HFI1_SC2VL_ATTR(28); +HFI1_SC2VL_ATTR(29); +HFI1_SC2VL_ATTR(30); +HFI1_SC2VL_ATTR(31); + +static struct attribute *port_sc2vl_attributes[] = { + &hfi1_sc2vl_attr_0.attr.attr, + &hfi1_sc2vl_attr_1.attr.attr, + &hfi1_sc2vl_attr_2.attr.attr, + &hfi1_sc2vl_attr_3.attr.attr, + &hfi1_sc2vl_attr_4.attr.attr, + &hfi1_sc2vl_attr_5.attr.attr, + &hfi1_sc2vl_attr_6.attr.attr, + &hfi1_sc2vl_attr_7.attr.attr, + &hfi1_sc2vl_attr_8.attr.attr, + &hfi1_sc2vl_attr_9.attr.attr, + &hfi1_sc2vl_attr_10.attr.attr, + &hfi1_sc2vl_attr_11.attr.attr, + &hfi1_sc2vl_attr_12.attr.attr, + &hfi1_sc2vl_attr_13.attr.attr, + &hfi1_sc2vl_attr_14.attr.attr, + &hfi1_sc2vl_attr_15.attr.attr, + &hfi1_sc2vl_attr_16.attr.attr, + &hfi1_sc2vl_attr_17.attr.attr, + &hfi1_sc2vl_attr_18.attr.attr, + &hfi1_sc2vl_attr_19.attr.attr, + &hfi1_sc2vl_attr_20.attr.attr, + &hfi1_sc2vl_attr_21.attr.attr, + &hfi1_sc2vl_attr_22.attr.attr, + &hfi1_sc2vl_attr_23.attr.attr, + &hfi1_sc2vl_attr_24.attr.attr, + &hfi1_sc2vl_attr_25.attr.attr, + &hfi1_sc2vl_attr_26.attr.attr, + &hfi1_sc2vl_attr_27.attr.attr, + &hfi1_sc2vl_attr_28.attr.attr, + &hfi1_sc2vl_attr_29.attr.attr, + &hfi1_sc2vl_attr_30.attr.attr, + &hfi1_sc2vl_attr_31.attr.attr, + NULL +}; + +static const struct attribute_group port_sc2vl_group = { + .name = "sc2vl", + .attrs = port_sc2vl_attributes, +}; +/* End sc2vl */ + +/* Start sl2sc */ +struct hfi1_sl2sc_attr { + struct ib_port_attribute attr; + int sl; +}; + +static ssize_t sl2sc_attr_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hfi1_sl2sc_attr *sattr = + container_of(attr, struct hfi1_sl2sc_attr, attr); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; + + return sysfs_emit(buf, "%u\n", ibp->sl_to_sc[sattr->sl]); +} + +#define HFI1_SL2SC_ATTR(N) \ + static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \ + .attr = __ATTR(N, 0444, sl2sc_attr_show, NULL), .sl = N \ + } + +HFI1_SL2SC_ATTR(0); +HFI1_SL2SC_ATTR(1); +HFI1_SL2SC_ATTR(2); +HFI1_SL2SC_ATTR(3); +HFI1_SL2SC_ATTR(4); +HFI1_SL2SC_ATTR(5); +HFI1_SL2SC_ATTR(6); +HFI1_SL2SC_ATTR(7); +HFI1_SL2SC_ATTR(8); +HFI1_SL2SC_ATTR(9); +HFI1_SL2SC_ATTR(10); +HFI1_SL2SC_ATTR(11); +HFI1_SL2SC_ATTR(12); +HFI1_SL2SC_ATTR(13); +HFI1_SL2SC_ATTR(14); +HFI1_SL2SC_ATTR(15); +HFI1_SL2SC_ATTR(16); +HFI1_SL2SC_ATTR(17); +HFI1_SL2SC_ATTR(18); +HFI1_SL2SC_ATTR(19); +HFI1_SL2SC_ATTR(20); +HFI1_SL2SC_ATTR(21); +HFI1_SL2SC_ATTR(22); +HFI1_SL2SC_ATTR(23); +HFI1_SL2SC_ATTR(24); +HFI1_SL2SC_ATTR(25); +HFI1_SL2SC_ATTR(26); +HFI1_SL2SC_ATTR(27); +HFI1_SL2SC_ATTR(28); +HFI1_SL2SC_ATTR(29); +HFI1_SL2SC_ATTR(30); +HFI1_SL2SC_ATTR(31); + +static struct attribute *port_sl2sc_attributes[] = { + &hfi1_sl2sc_attr_0.attr.attr, + &hfi1_sl2sc_attr_1.attr.attr, + &hfi1_sl2sc_attr_2.attr.attr, + &hfi1_sl2sc_attr_3.attr.attr, + &hfi1_sl2sc_attr_4.attr.attr, + &hfi1_sl2sc_attr_5.attr.attr, + &hfi1_sl2sc_attr_6.attr.attr, + &hfi1_sl2sc_attr_7.attr.attr, + &hfi1_sl2sc_attr_8.attr.attr, + &hfi1_sl2sc_attr_9.attr.attr, + &hfi1_sl2sc_attr_10.attr.attr, + &hfi1_sl2sc_attr_11.attr.attr, + &hfi1_sl2sc_attr_12.attr.attr, + &hfi1_sl2sc_attr_13.attr.attr, + &hfi1_sl2sc_attr_14.attr.attr, + &hfi1_sl2sc_attr_15.attr.attr, + &hfi1_sl2sc_attr_16.attr.attr, + &hfi1_sl2sc_attr_17.attr.attr, + &hfi1_sl2sc_attr_18.attr.attr, + &hfi1_sl2sc_attr_19.attr.attr, + &hfi1_sl2sc_attr_20.attr.attr, + &hfi1_sl2sc_attr_21.attr.attr, + &hfi1_sl2sc_attr_22.attr.attr, + &hfi1_sl2sc_attr_23.attr.attr, + &hfi1_sl2sc_attr_24.attr.attr, + &hfi1_sl2sc_attr_25.attr.attr, + &hfi1_sl2sc_attr_26.attr.attr, + &hfi1_sl2sc_attr_27.attr.attr, + &hfi1_sl2sc_attr_28.attr.attr, + &hfi1_sl2sc_attr_29.attr.attr, + &hfi1_sl2sc_attr_30.attr.attr, + &hfi1_sl2sc_attr_31.attr.attr, + NULL +}; + +static const struct attribute_group port_sl2sc_group = { + .name = "sl2sc", + .attrs = port_sl2sc_attributes, +}; + +/* End sl2sc */ + +/* Start vl2mtu */ + +struct hfi1_vl2mtu_attr { + struct ib_port_attribute attr; + int vl; +}; + +static ssize_t vl2mtu_attr_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hfi1_vl2mtu_attr *vlattr = + container_of(attr, struct hfi1_vl2mtu_attr, attr); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + return sysfs_emit(buf, "%u\n", dd->vld[vlattr->vl].mtu); +} + +#define HFI1_VL2MTU_ATTR(N) \ + static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \ + .attr = __ATTR(N, 0444, vl2mtu_attr_show, NULL), \ + .vl = N, \ + } + +HFI1_VL2MTU_ATTR(0); +HFI1_VL2MTU_ATTR(1); +HFI1_VL2MTU_ATTR(2); +HFI1_VL2MTU_ATTR(3); +HFI1_VL2MTU_ATTR(4); +HFI1_VL2MTU_ATTR(5); +HFI1_VL2MTU_ATTR(6); +HFI1_VL2MTU_ATTR(7); +HFI1_VL2MTU_ATTR(8); +HFI1_VL2MTU_ATTR(9); +HFI1_VL2MTU_ATTR(10); +HFI1_VL2MTU_ATTR(11); +HFI1_VL2MTU_ATTR(12); +HFI1_VL2MTU_ATTR(13); +HFI1_VL2MTU_ATTR(14); +HFI1_VL2MTU_ATTR(15); + +static struct attribute *port_vl2mtu_attributes[] = { + &hfi1_vl2mtu_attr_0.attr.attr, + &hfi1_vl2mtu_attr_1.attr.attr, + &hfi1_vl2mtu_attr_2.attr.attr, + &hfi1_vl2mtu_attr_3.attr.attr, + &hfi1_vl2mtu_attr_4.attr.attr, + &hfi1_vl2mtu_attr_5.attr.attr, + &hfi1_vl2mtu_attr_6.attr.attr, + &hfi1_vl2mtu_attr_7.attr.attr, + &hfi1_vl2mtu_attr_8.attr.attr, + &hfi1_vl2mtu_attr_9.attr.attr, + &hfi1_vl2mtu_attr_10.attr.attr, + &hfi1_vl2mtu_attr_11.attr.attr, + &hfi1_vl2mtu_attr_12.attr.attr, + &hfi1_vl2mtu_attr_13.attr.attr, + &hfi1_vl2mtu_attr_14.attr.attr, + &hfi1_vl2mtu_attr_15.attr.attr, + NULL +}; + +static const struct attribute_group port_vl2mtu_group = { + .name = "vl2mtu", + .attrs = port_vl2mtu_attributes, +}; + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + + return sysfs_emit(buf, "%x\n", dd_from_dev(dev)->minrev); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + if (!dd->boardname) + return -EINVAL; + + return sysfs_emit(buf, "%s\n", dd->boardname); +} +static DEVICE_ATTR_RO(board_id); + +static ssize_t boardversion_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return sysfs_emit(buf, "%s", dd->boardversion); +} +static DEVICE_ATTR_RO(boardversion); + +static ssize_t nctxts_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* + * Return the smaller of send and receive contexts. + * Normally, user level applications would require both a send + * and a receive context, so returning the smaller of the two counts + * give a more accurate picture of total contexts available. + */ + return sysfs_emit(buf, "%u\n", + min(dd->num_user_contexts, + (u32)dd->sc_sizes[SC_USER].count)); +} +static DEVICE_ATTR_RO(nctxts); + +static ssize_t nfreectxts_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. */ + return sysfs_emit(buf, "%u\n", dd->freectxts); +} +static DEVICE_ATTR_RO(nfreectxts); + +static ssize_t serial_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* dd->serial is already newline terminated in chip.c */ + return sysfs_emit(buf, "%s", dd->serial); +} +static DEVICE_ATTR_RO(serial); + +static ssize_t chip_reset_store(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = hfi1_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} +static DEVICE_ATTR_WO(chip_reset); + +/* + * Convert the reported temperature from an integer (reported in + * units of 0.25C) to a floating point number. + */ +#define temp_d(t) ((t) >> 2) +#define temp_f(t) (((t)&0x3) * 25u) + +/* + * Dump tempsense values, in decimal, to ease shell-scripts. + */ +static ssize_t tempsense_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); + struct hfi1_devdata *dd = dd_from_dev(dev); + struct hfi1_temp temp; + int ret; + + ret = hfi1_tempsense_rd(dd, &temp); + if (ret) + return ret; + + return sysfs_emit(buf, "%u.%02u %u.%02u %u.%02u %u.%02u %u %u %u\n", + temp_d(temp.curr), temp_f(temp.curr), + temp_d(temp.lo_lim), temp_f(temp.lo_lim), + temp_d(temp.hi_lim), temp_f(temp.hi_lim), + temp_d(temp.crit_lim), temp_f(temp.crit_lim), + temp.triggers & 0x1, + temp.triggers & 0x2, + temp.triggers & 0x4); +} +static DEVICE_ATTR_RO(tempsense); + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static struct attribute *hfi1_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_board_id.attr, + &dev_attr_nctxts.attr, + &dev_attr_nfreectxts.attr, + &dev_attr_serial.attr, + &dev_attr_boardversion.attr, + &dev_attr_tempsense.attr, + &dev_attr_chip_reset.attr, + NULL, +}; + +const struct attribute_group ib_hfi1_attr_group = { + .attrs = hfi1_attributes, +}; + +const struct attribute_group *hfi1_attr_port_groups[] = { + &port_cc_group, + &port_sc2vl_group, + &port_sl2sc_group, + &port_vl2mtu_group, + NULL, +}; + +struct sde_attribute { + struct attribute attr; + ssize_t (*show)(struct sdma_engine *sde, char *buf); + ssize_t (*store)(struct sdma_engine *sde, const char *buf, size_t cnt); +}; + +static ssize_t sde_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct sde_attribute *sde_attr = + container_of(attr, struct sde_attribute, attr); + struct sdma_engine *sde = + container_of(kobj, struct sdma_engine, kobj); + + if (!sde_attr->show) + return -EINVAL; + + return sde_attr->show(sde, buf); +} + +static ssize_t sde_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct sde_attribute *sde_attr = + container_of(attr, struct sde_attribute, attr); + struct sdma_engine *sde = + container_of(kobj, struct sdma_engine, kobj); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!sde_attr->store) + return -EINVAL; + + return sde_attr->store(sde, buf, count); +} + +static const struct sysfs_ops sde_sysfs_ops = { + .show = sde_show, + .store = sde_store, +}; + +static struct kobj_type sde_ktype = { + .sysfs_ops = &sde_sysfs_ops, +}; + +#define SDE_ATTR(_name, _mode, _show, _store) \ + struct sde_attribute sde_attr_##_name = \ + __ATTR(_name, _mode, _show, _store) + +static ssize_t sde_show_cpu_to_sde_map(struct sdma_engine *sde, char *buf) +{ + return sdma_get_cpu_to_sde_map(sde, buf); +} + +static ssize_t sde_store_cpu_to_sde_map(struct sdma_engine *sde, + const char *buf, size_t count) +{ + return sdma_set_cpu_to_sde_map(sde, buf, count); +} + +static ssize_t sde_show_vl(struct sdma_engine *sde, char *buf) +{ + int vl; + + vl = sdma_engine_get_vl(sde); + if (vl < 0) + return vl; + + return sysfs_emit(buf, "%d\n", vl); +} + +static SDE_ATTR(cpu_list, S_IWUSR | S_IRUGO, + sde_show_cpu_to_sde_map, + sde_store_cpu_to_sde_map); +static SDE_ATTR(vl, S_IRUGO, sde_show_vl, NULL); + +static struct sde_attribute *sde_attribs[] = { + &sde_attr_cpu_list, + &sde_attr_vl +}; + +/* + * Register and create our files in /sys/class/infiniband. + */ +int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; + struct device *class_dev = &dev->dev; + int i, j, ret; + + for (i = 0; i < dd->num_sdma; i++) { + ret = kobject_init_and_add(&dd->per_sdma[i].kobj, + &sde_ktype, &class_dev->kobj, + "sdma%d", i); + if (ret) + goto bail; + + for (j = 0; j < ARRAY_SIZE(sde_attribs); j++) { + ret = sysfs_create_file(&dd->per_sdma[i].kobj, + &sde_attribs[j]->attr); + if (ret) + goto bail; + } + } + + return 0; +bail: + /* + * The function kobject_put() will call kobject_del() if the kobject + * has been added successfully. The sysfs files created under the + * kobject directory will also be removed during the process. + */ + for (; i >= 0; i--) + kobject_put(&dd->per_sdma[i].kobj); + + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. + */ +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd) +{ + int i; + + /* Unwind operations in hfi1_verbs_register_sysfs() */ + for (i = 0; i < dd->num_sdma; i++) + kobject_put(&dd->per_sdma[i].kobj); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c new file mode 100644 index 0000000000..18b05ffb41 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -0,0 +1,5532 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 - 2020 Intel Corporation. + * + */ + +#include "hfi.h" +#include "qp.h" +#include "rc.h" +#include "verbs.h" +#include "tid_rdma.h" +#include "exp_rcv.h" +#include "trace.h" + +/** + * DOC: TID RDMA READ protocol + * + * This is an end-to-end protocol at the hfi1 level between two nodes that + * improves performance by avoiding data copy on the requester side. It + * converts a qualified RDMA READ request into a TID RDMA READ request on + * the requester side and thereafter handles the request and response + * differently. To be qualified, the RDMA READ request should meet the + * following: + * -- The total data length should be greater than 256K; + * -- The total data length should be a multiple of 4K page size; + * -- Each local scatter-gather entry should be 4K page aligned; + * -- Each local scatter-gather entry should be a multiple of 4K page size; + */ + +#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) +#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) +#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) +#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35) +#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) +#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) + +/* Maximum number of packets within a flow generation. */ +#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) + +#define GENERATION_MASK 0xFFFFF + +static u32 mask_generation(u32 a) +{ + return a & GENERATION_MASK; +} + +/* Reserved generation value to set to unused flows for kernel contexts */ +#define KERN_GENERATION_RESERVED mask_generation(U32_MAX) + +/* + * J_KEY for kernel contexts when TID RDMA is used. + * See generate_jkey() in hfi.h for more information. + */ +#define TID_RDMA_JKEY 32 +#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE +#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) + +/* Maximum number of segments in flight per QP request. */ +#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 +#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 +#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ + TID_RDMA_MAX_WRITE_SEGS_PER_REQ) +#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) + +#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) + +#define TID_RDMA_DESTQP_FLOW_SHIFT 11 +#define TID_RDMA_DESTQP_FLOW_MASK 0x1f + +#define TID_OPFN_QP_CTXT_MASK 0xff +#define TID_OPFN_QP_CTXT_SHIFT 56 +#define TID_OPFN_QP_KDETH_MASK 0xff +#define TID_OPFN_QP_KDETH_SHIFT 48 +#define TID_OPFN_MAX_LEN_MASK 0x7ff +#define TID_OPFN_MAX_LEN_SHIFT 37 +#define TID_OPFN_TIMEOUT_MASK 0x1f +#define TID_OPFN_TIMEOUT_SHIFT 32 +#define TID_OPFN_RESERVED_MASK 0x3f +#define TID_OPFN_RESERVED_SHIFT 26 +#define TID_OPFN_URG_MASK 0x1 +#define TID_OPFN_URG_SHIFT 25 +#define TID_OPFN_VER_MASK 0x7 +#define TID_OPFN_VER_SHIFT 22 +#define TID_OPFN_JKEY_MASK 0x3f +#define TID_OPFN_JKEY_SHIFT 16 +#define TID_OPFN_MAX_READ_MASK 0x3f +#define TID_OPFN_MAX_READ_SHIFT 10 +#define TID_OPFN_MAX_WRITE_MASK 0x3f +#define TID_OPFN_MAX_WRITE_SHIFT 4 + +/* + * OPFN TID layout + * + * 63 47 31 15 + * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC + * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 + * N - the context Number + * K - the Kdeth_qp + * M - Max_len + * T - Timeout + * D - reserveD + * V - version + * U - Urg capable + * J - Jkey + * R - max_Read + * W - max_Write + * C - Capcode + */ + +static void tid_rdma_trigger_resume(struct work_struct *work); +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp); +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req); +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); +static void hfi1_tid_timeout(struct timer_list *t); +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); +static void hfi1_tid_retry_timeout(struct timer_list *t); +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps); +static void hfi1_do_tid_send(struct rvt_qp *qp); +static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx); +static void tid_rdma_rcv_err(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff, bool fecn); +static void update_r_next_psn_fecn(struct hfi1_packet *packet, + struct hfi1_qp_priv *priv, + struct hfi1_ctxtdata *rcd, + struct tid_rdma_flow *flow, + bool fecn); + +static void validate_r_tid_ack(struct hfi1_qp_priv *priv) +{ + if (priv->r_tid_ack == HFI1_QP_WQE_INVALID) + priv->r_tid_ack = priv->r_tid_tail; +} + +static void tid_rdma_schedule_ack(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + priv->s_flags |= RVT_S_ACK_PENDING; + hfi1_schedule_tid_send(qp); +} + +static void tid_rdma_trigger_ack(struct rvt_qp *qp) +{ + validate_r_tid_ack(qp->priv); + tid_rdma_schedule_ack(qp); +} + +static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) +{ + return + (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << + TID_OPFN_QP_CTXT_SHIFT) | + ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << + TID_OPFN_QP_KDETH_SHIFT) | + (((u64)((p->max_len >> PAGE_SHIFT) - 1) & + TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | + (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << + TID_OPFN_TIMEOUT_SHIFT) | + (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | + (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | + (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << + TID_OPFN_MAX_READ_SHIFT) | + (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << + TID_OPFN_MAX_WRITE_SHIFT); +} + +static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) +{ + p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & + TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; + p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; + p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & + TID_OPFN_MAX_WRITE_MASK; + p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & + TID_OPFN_MAX_READ_MASK; + p->qp = + ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) + << 16) | + ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); + p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; + p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; +} + +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) +{ + struct hfi1_qp_priv *priv = qp->priv; + + p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt; + p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; + p->jkey = priv->rcd->jkey; + p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; + p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; + p->timeout = qp->timeout; + p->urg = is_urg_masked(priv->rcd); +} + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) +{ + struct hfi1_qp_priv *priv = qp->priv; + + *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); + return true; +} + +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *remote, *old; + bool ret = true; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + data &= ~0xfULL; + /* + * If data passed in is zero, return true so as not to continue the + * negotiation process + */ + if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) + goto null; + /* + * If kzalloc fails, return false. This will result in: + * * at the requester a new OPFN request being generated to retry + * the negotiation + * * at the responder, 0 being returned to the requester so as to + * disable TID RDMA at both the requester and the responder + */ + remote = kzalloc(sizeof(*remote), GFP_ATOMIC); + if (!remote) { + ret = false; + goto null; + } + + tid_rdma_opfn_decode(remote, data); + priv->tid_timer_timeout_jiffies = + usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / + 1000UL) << 3) * 7); + trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); + trace_hfi1_opfn_param(qp, 1, remote); + rcu_assign_pointer(priv->tid_rdma.remote, remote); + /* + * A TID RDMA READ request's segment size is not equal to + * remote->max_len only when the request's data length is smaller + * than remote->max_len. In that case, there will be only one segment. + * Therefore, when priv->pkts_ps is used to calculate req->cur_seg + * during retry, it will lead to req->cur_seg = 0, which is exactly + * what is expected. + */ + priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); + priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; + goto free; +null: + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + priv->timeout_shift = 0; +free: + if (old) + kfree_rcu(old, rcu_head); + return ret; +} + +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) +{ + bool ret; + + ret = tid_rdma_conn_reply(qp, *data); + *data = 0; + /* + * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate + * TID RDMA could not be enabled. This will result in TID RDMA being + * disabled at the requester too. + */ + if (ret) + (void)tid_rdma_conn_req(qp, data); + return ret; +} + +void tid_rdma_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *old; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + if (old) + kfree_rcu(old, rcu_head); +} + +/* This is called at context initialization time */ +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) +{ + if (reinit) + return 0; + + BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); + BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); + rcd->jkey = TID_RDMA_JKEY; + hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); + return hfi1_alloc_ctxt_rcv_groups(rcd); +} + +/** + * qp_to_rcd - determine the receive context used by a qp + * @rdi: rvt dev struct + * @qp: the qp + * + * This routine returns the receive context associated + * with a a qp's qpn. + * + * Returns the context. + */ +static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi, + struct rvt_qp *qp) +{ + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + unsigned int ctxt; + + if (qp->ibqp.qp_num == 0) + ctxt = 0; + else + ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift); + return dd->rcd[ctxt]; +} + +int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_init_attr *init_attr) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + int i, ret; + + qpriv->rcd = qp_to_rcd(rdi, qp); + + spin_lock_init(&qpriv->opfn.lock); + INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); + INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); + qpriv->flow_state.psn = 0; + qpriv->flow_state.index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->s_state = TID_OP(WRITE_RESP); + qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; + qpriv->s_tid_head = HFI1_QP_WQE_INVALID; + qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qpriv->r_tid_head = HFI1_QP_WQE_INVALID; + qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; + qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + atomic_set(&qpriv->n_requests, 0); + atomic_set(&qpriv->n_tid_requests, 0); + timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); + timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); + INIT_LIST_HEAD(&qpriv->tid_wait); + + if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct hfi1_devdata *dd = qpriv->rcd->dd; + + qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES * + sizeof(*qpriv->pages), + GFP_KERNEL, dd->node); + if (!qpriv->pages) + return -ENOMEM; + for (i = 0; i < qp->s_size; i++) { + struct hfi1_swqe_priv *priv; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.swqe = wqe; + wqe->priv = priv; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.ack = &qp->s_ack_queue[i]; + + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, + GFP_KERNEL); + if (ret) { + kfree(priv); + return ret; + } + qp->s_ack_queue[i].priv = priv; + } + } + + return 0; +} + +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + u32 i; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + for (i = 0; i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->priv); + wqe->priv = NULL; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); + kfree(priv); + qp->s_ack_queue[i].priv = NULL; + } + cancel_work_sync(&qpriv->opfn.opfn_work); + kfree(qpriv->pages); + qpriv->pages = NULL; + } +} + +/* Flow and tid waiter functions */ +/** + * DOC: lock ordering + * + * There are two locks involved with the queuing + * routines: the qp s_lock and the exp_lock. + * + * Since the tid space allocation is called from + * the send engine, the qp s_lock is already held. + * + * The allocation routines will get the exp_lock. + * + * The first_qp() call is provided to allow the head of + * the rcd wait queue to be fetched under the exp_lock and + * followed by a drop of the exp_lock. + * + * Any qp in the wait list will have the qp reference count held + * to hold the qp in memory. + */ + +/* + * return head of rcd wait list + * + * Must hold the exp_lock. + * + * Get a reference to the QP to hold the QP in memory. + * + * The caller must release the reference when the local + * is no longer being used. + */ +static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue) + __must_hold(&rcd->exp_lock) +{ + struct hfi1_qp_priv *priv; + + lockdep_assert_held(&rcd->exp_lock); + priv = list_first_entry_or_null(&queue->queue_head, + struct hfi1_qp_priv, + tid_wait); + if (!priv) + return NULL; + rvt_get_qp(priv->owner); + return priv->owner; +} + +/** + * kernel_tid_waiters - determine rcd wait + * @rcd: the receive context + * @queue: the queue to operate on + * @qp: the head of the qp being processed + * + * This routine will return false IFF + * the list is NULL or the head of the + * list is the indicated qp. + * + * Must hold the qp s_lock and the exp_lock. + * + * Return: + * false if either of the conditions below are satisfied: + * 1. The list is empty or + * 2. The indicated qp is at the head of the list and the + * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. + * true is returned otherwise. + */ +static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct rvt_qp *fqp; + bool ret = true; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + fqp = first_qp(rcd, queue); + if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE))) + ret = false; + rvt_put_qp(fqp); + return ret; +} + +/** + * dequeue_tid_waiter - dequeue the qp from the list + * @rcd: the receive context + * @queue: the queue to operate on + * @qp: the qp to remove the wait list + * + * This routine removes the indicated qp from the + * wait list if it is there. + * + * This should be done after the hardware flow and + * tid array resources have been allocated. + * + * Must hold the qp s_lock and the rcd exp_lock. + * + * It assumes the s_lock to protect the s_flags + * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag. + */ +static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + if (list_empty(&priv->tid_wait)) + return; + list_del_init(&priv->tid_wait); + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + queue->dequeue++; + rvt_put_qp(qp); +} + +/** + * queue_qp_for_tid_wait - suspend QP on tid space + * @rcd: the receive context + * @queue: the queue to operate on + * @qp: the qp + * + * The qp is inserted at the tail of the rcd + * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set. + * + * Must hold the qp s_lock and the exp_lock. + */ +static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + if (list_empty(&priv->tid_wait)) { + qp->s_flags |= HFI1_S_WAIT_TID_SPACE; + list_add_tail(&priv->tid_wait, &queue->queue_head); + priv->tid_enqueue = ++queue->enqueue; + rcd->dd->verbs_dev.n_tidwait++; + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); + rvt_get_qp(qp); + } +} + +/** + * __trigger_tid_waiter - trigger tid waiter + * @qp: the qp + * + * This is a private entrance to schedule the qp + * assuming the caller is holding the qp->s_lock. + */ +static void __trigger_tid_waiter(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + lockdep_assert_held(&qp->s_lock); + if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE)) + return; + trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE); + hfi1_schedule_send(qp); +} + +/** + * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp + * @qp: the qp + * + * trigger a schedule or a waiting qp in a deadlock + * safe manner. The qp reference is held prior + * to this call via first_qp(). + * + * If the qp trigger was already scheduled (!rval) + * the reference is dropped, otherwise the resume + * or the destroy cancel will dispatch the reference. + */ +static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + bool rval; + + if (!qp) + return; + + priv = qp->priv; + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + dd = dd_from_ibdev(qp->ibqp.device); + + rval = queue_work_on(priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node)), + ppd->hfi1_wq, + &priv->tid_rdma.trigger_work); + if (!rval) + rvt_put_qp(qp); +} + +/** + * tid_rdma_trigger_resume - field a trigger work request + * @work: the work item + * + * Complete the off qp trigger processing by directly + * calling the progress routine. + */ +static void tid_rdma_trigger_resume(struct work_struct *work) +{ + struct tid_rdma_qp_params *tr; + struct hfi1_qp_priv *priv; + struct rvt_qp *qp; + + tr = container_of(work, struct tid_rdma_qp_params, trigger_work); + priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); + qp = priv->owner; + spin_lock_irq(&qp->s_lock); + if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { + spin_unlock_irq(&qp->s_lock); + hfi1_do_send(priv->owner, true); + } else { + spin_unlock_irq(&qp->s_lock); + } + rvt_put_qp(qp); +} + +/* + * tid_rdma_flush_wait - unwind any tid space wait + * + * This is called when resetting a qp to + * allow a destroy or reset to get rid + * of any tid space linkage and reference counts. + */ +static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv; + + if (!qp) + return; + lockdep_assert_held(&qp->s_lock); + priv = qp->priv; + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + spin_lock(&priv->rcd->exp_lock); + if (!list_empty(&priv->tid_wait)) { + list_del_init(&priv->tid_wait); + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + queue->dequeue++; + rvt_put_qp(qp); + } + spin_unlock(&priv->rcd->exp_lock); +} + +void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); + _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); +} + +/* Flow functions */ +/** + * kern_reserve_flow - allocate a hardware flow + * @rcd: the context to use for allocation + * @last: the index of the preferred flow. Use RXE_NUM_TID_FLOWS to + * signify "don't care". + * + * Use a bit mask based allocation to reserve a hardware + * flow for use in receiving KDETH data packets. If a preferred flow is + * specified the function will attempt to reserve that flow again, if + * available. + * + * The exp_lock must be held. + * + * Return: + * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 + * On failure: -EAGAIN + */ +static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) + __must_hold(&rcd->exp_lock) +{ + int nr; + + /* Attempt to reserve the preferred flow index */ + if (last >= 0 && last < RXE_NUM_TID_FLOWS && + !test_and_set_bit(last, &rcd->flow_mask)) + return last; + + nr = ffz(rcd->flow_mask); + BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= + (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); + if (nr > (RXE_NUM_TID_FLOWS - 1)) + return -EAGAIN; + set_bit(nr, &rcd->flow_mask); + return nr; +} + +static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, + u32 flow_idx) +{ + u64 reg; + + reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | + RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | + RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; + + if (generation != KERN_GENERATION_RESERVED) + reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; + + write_uctxt_csr(rcd->dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); +} + +static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + u32 generation = rcd->flows[flow_idx].generation; + + kern_set_hw_flow(rcd, generation, flow_idx); + return generation; +} + +static u32 kern_flow_generation_next(u32 gen) +{ + u32 generation = mask_generation(gen + 1); + + if (generation == KERN_GENERATION_RESERVED) + generation = mask_generation(generation + 1); + return generation; +} + +static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + rcd->flows[flow_idx].generation = + kern_flow_generation_next(rcd->flows[flow_idx].generation); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); +} + +int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct rvt_qp *fqp; + unsigned long flags; + int ret = 0; + + /* The QP already has an allocated flow */ + if (fs->index != RXE_NUM_TID_FLOWS) + return ret; + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp)) + goto queue; + + ret = kern_reserve_flow(rcd, fs->last_index); + if (ret < 0) + goto queue; + fs->index = ret; + fs->last_index = fs->index; + + /* Generation received in a RESYNC overrides default flow generation */ + if (fs->generation != KERN_GENERATION_RESERVED) + rcd->flows[fs->index].generation = fs->generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + fs->psn = 0; + dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->flow_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + tid_rdma_schedule_tid_wakeup(fqp); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct rvt_qp *fqp; + unsigned long flags; + + if (fs->index >= RXE_NUM_TID_FLOWS) + return; + spin_lock_irqsave(&rcd->exp_lock, flags); + kern_clear_hw_flow(rcd, fs->index); + clear_bit(fs->index, &rcd->flow_mask); + fs->index = RXE_NUM_TID_FLOWS; + fs->psn = 0; + fs->generation = KERN_GENERATION_RESERVED; + + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->flow_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + if (fqp == qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } +} + +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) +{ + int i; + + for (i = 0; i < RXE_NUM_TID_FLOWS; i++) { + rcd->flows[i].generation = mask_generation(get_random_u32()); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); + } +} + +/* TID allocation functions */ +static u8 trdma_pset_order(struct tid_rdma_pageset *s) +{ + u8 count = s->count; + + return ilog2(count) + 1; +} + +/** + * tid_rdma_find_phys_blocks_4k - get groups base on mr info + * @flow: overall info for a TID RDMA segment + * @pages: pointer to an array of page structs + * @npages: number of pages + * @list: page set array to return + * + * This routine returns the number of groups associated with + * the current sge information. This implementation is based + * on the expected receive find_phys_blocks() adjusted to + * use the MR information vs. the pfn. + * + * Return: + * the number of RcvArray entries + */ +static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 pagecount, pageidx, setcount = 0, i; + void *vaddr, *this_vaddr; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + vaddr = page_address(pages[0]); + trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_vaddr = i < npages ? page_address(pages[i]) : NULL; + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0, + this_vaddr); + /* + * If the vaddr's are not sequential, pages are not physically + * contiguous. + */ + if (this_vaddr != (vaddr + PAGE_SIZE)) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + trace_hfi1_tid_pageset(flow->req->qp, setcount, + list[setcount].idx, + list[setcount].count); + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + vaddr = this_vaddr; + } else { + vaddr += PAGE_SIZE; + pagecount++; + } + } + /* insure we always return an even number of sets */ + if (setcount & 1) + list[setcount++].count = 0; + return setcount; +} + +/** + * tid_flush_pages - dump out pages into pagesets + * @list: list of pagesets + * @idx: pointer to current page index + * @pages: number of pages to dump + * @sets: current number of pagesset + * + * This routine flushes out accumuated pages. + * + * To insure an even number of sets the + * code may add a filler. + * + * This can happen with when pages is not + * a power of 2 or pages is a power of 2 + * less than the maximum pages. + * + * Return: + * The new number of sets + */ + +static u32 tid_flush_pages(struct tid_rdma_pageset *list, + u32 *idx, u32 pages, u32 sets) +{ + while (pages) { + u32 maxpages = pages; + + if (maxpages > MAX_EXPECTED_PAGES) + maxpages = MAX_EXPECTED_PAGES; + else if (!is_power_of_2(maxpages)) + maxpages = rounddown_pow_of_two(maxpages); + list[sets].idx = *idx; + list[sets++].count = maxpages; + *idx += maxpages; + pages -= maxpages; + } + /* might need a filler */ + if (sets & 1) + list[sets++].count = 0; + return sets; +} + +/** + * tid_rdma_find_phys_blocks_8k - get groups base on mr info + * @flow: overall info for a TID RDMA segment + * @pages: pointer to an array of page structs + * @npages: number of pages + * @list: page set array to return + * + * This routine parses an array of pages to compute pagesets + * in an 8k compatible way. + * + * pages are tested two at a time, i, i + 1 for contiguous + * pages and i - 1 and i contiguous pages. + * + * If any condition is false, any accumlated pages are flushed and + * v0,v1 are emitted as separate PAGE_SIZE pagesets + * + * Otherwise, the current 8k is totaled for a future flush. + * + * Return: + * The number of pagesets + * list set with the returned number of pagesets + * + */ +static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 idx, sets = 0, i; + u32 pagecnt = 0; + void *v0, *v1, *vm1; + + if (!npages) + return 0; + for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { + /* get a new v0 */ + v0 = page_address(pages[i]); + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); + v1 = i + 1 < npages ? + page_address(pages[i + 1]) : NULL; + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1); + /* compare i, i + 1 vaddr */ + if (v1 != (v0 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + /* output v0,v1 as two pagesets */ + list[sets].idx = idx++; + list[sets++].count = 1; + if (v1) { + list[sets].count = 1; + list[sets++].idx = idx++; + } else { + list[sets++].count = 0; + } + vm1 = NULL; + pagecnt = 0; + continue; + } + /* i,i+1 consecutive, look at i-1,i */ + if (vm1 && v0 != (vm1 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + pagecnt = 0; + } + /* pages will always be a multiple of 8k */ + pagecnt += 2; + /* save i-1 */ + vm1 = v1; + /* move to next pair */ + } + /* dump residual pages at end */ + sets = tid_flush_pages(list, &idx, npages - idx, sets); + /* by design cannot be odd sets */ + WARN_ON(sets & 1); + return sets; +} + +/* + * Find pages for one segment of a sge array represented by @ss. The function + * does not check the sge, the sge must have been checked for alignment with a + * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of + * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge + * copy maintained in @ss->sge, the original sge is not modified. + * + * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not + * releasing the MR reference count at the same time. Otherwise, we'll "leak" + * references to the MR. This difference requires that we keep track of progress + * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request + * structure. + */ +static u32 kern_find_pages(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + struct tid_rdma_request *req = flow->req; + struct rvt_sge *sge = &ss->sge; + u32 length = flow->req->seg_len; + u32 len = PAGE_SIZE; + u32 i = 0; + + while (length && req->isge < ss->num_sge) { + pages[i++] = virt_to_page(sge->vaddr); + + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (!sge->sge_length) { + if (++req->isge < ss->num_sge) + *sge = ss->sg_list[req->isge - 1]; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + ++sge->m; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + + flow->length = flow->req->seg_len - length; + *last = req->isge != ss->num_sge; + return i; +} + +static void dma_unmap_flow(struct tid_rdma_flow *flow) +{ + struct hfi1_devdata *dd; + int i; + struct tid_rdma_pageset *pset; + + dd = flow->req->rcd->dd; + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count && pset->addr) { + dma_unmap_page(&dd->pcidev->dev, + pset->addr, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + pset->mapped = 0; + } + } +} + +static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) +{ + int i; + struct hfi1_devdata *dd = flow->req->rcd->dd; + struct tid_rdma_pageset *pset; + + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count) { + pset->addr = dma_map_page(&dd->pcidev->dev, + pages[pset->idx], + 0, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + + if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { + dma_unmap_flow(flow); + return -ENOMEM; + } + pset->mapped = 1; + } + } + return 0; +} + +static inline bool dma_mapped(struct tid_rdma_flow *flow) +{ + return !!flow->pagesets[0].mapped; +} + +/* + * Get pages pointers and identify contiguous physical memory chunks for a + * segment. All segments are of length flow->req->seg_len. + */ +static int kern_get_phys_blocks(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + u8 npages; + + /* Reuse previously computed pagesets, if any */ + if (flow->npagesets) { + trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, + flow); + if (!dma_mapped(flow)) + return dma_map_flow(flow, pages); + return 0; + } + + npages = kern_find_pages(flow, pages, ss, last); + + if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) + flow->npagesets = + tid_rdma_find_phys_blocks_4k(flow, pages, npages, + flow->pagesets); + else + flow->npagesets = + tid_rdma_find_phys_blocks_8k(flow, pages, npages, + flow->pagesets); + + return dma_map_flow(flow, pages); +} + +static inline void kern_add_tid_node(struct tid_rdma_flow *flow, + struct hfi1_ctxtdata *rcd, char *s, + struct tid_group *grp, u8 cnt) +{ + struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; + + WARN_ON_ONCE(flow->tnode_cnt >= + (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); + if (WARN_ON_ONCE(cnt & 1)) + dd_dev_err(rcd->dd, + "unexpected odd allocation cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + + node->grp = grp; + node->map = grp->map; + node->cnt = cnt; + trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1, + grp->base, grp->map, grp->used, cnt); +} + +/* + * Try to allocate pageset_count TID's from TID groups for a context + * + * This function allocates TID's without moving groups between lists or + * modifying grp->map. This is done as follows, being cogizant of the lists + * between which the TID groups will move: + * 1. First allocate complete groups of 8 TID's since this is more efficient, + * these groups will move from group->full without affecting used + * 2. If more TID's are needed allocate from used (will move from used->full or + * stay in used) + * 3. If we still don't have the required number of TID's go back and look again + * at a complete group (will move from group->used) + */ +static int kern_alloc_tids(struct tid_rdma_flow *flow) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + u32 ngroups, pageidx = 0; + struct tid_group *group = NULL, *used; + u8 use; + + flow->tnode_cnt = 0; + ngroups = flow->npagesets / dd->rcv_entries.group_size; + if (!ngroups) + goto used_list; + + /* First look at complete groups */ + list_for_each_entry(group, &rcd->tid_group_list.list, list) { + kern_add_tid_node(flow, rcd, "complete groups", group, + group->size); + + pageidx += group->size; + if (!--ngroups) + break; + } + + if (pageidx >= flow->npagesets) + goto ok; + +used_list: + /* Now look at partially used groups */ + list_for_each_entry(used, &rcd->tid_used_list.list, list) { + use = min_t(u32, flow->npagesets - pageidx, + used->size - used->used); + kern_add_tid_node(flow, rcd, "used groups", used, use); + + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; + } + + /* + * Look again at a complete group, continuing from where we left. + * However, if we are at the head, we have reached the end of the + * complete groups list from the first loop above + */ + if (group && &group->list == &rcd->tid_group_list.list) + goto bail_eagain; + group = list_prepare_entry(group, &rcd->tid_group_list.list, + list); + if (list_is_last(&group->list, &rcd->tid_group_list.list)) + goto bail_eagain; + group = list_next_entry(group, list); + use = min_t(u32, flow->npagesets - pageidx, group->size); + kern_add_tid_node(flow, rcd, "complete continue", group, use); + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; +bail_eagain: + trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ", + (u64)flow->npagesets); + return -EAGAIN; +ok: + return 0; +} + +static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, + u32 *pset_idx) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + struct tid_rdma_pageset *pset; + u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; + u32 rcventry, npages = 0, pair = 0, tidctrl; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + pset = &flow->pagesets[(*pset_idx)++]; + if (pset->count) { + hfi1_put_tid(dd, rcventry, PT_EXPECTED, + pset->addr, trdma_pset_order(pset)); + } else { + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + } + npages += pset->count; + + rcventry -= rcd->expected_base; + tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1; + /* + * A single TID entry will be used to use a rcvarr pair (with + * tidctrl 0x3), if ALL these are true (a) the bit pos is even + * (b) the group map shows current and the next bits as free + * indicating two consecutive rcvarry entries are available (c) + * we actually need 2 more entries + */ + pair = !(i & 0x1) && !((node->map >> i) & 0x3) && + node->cnt >= cnt + 2; + if (!pair) { + if (!pset->count) + tidctrl = 0x1; + flow->tid_entry[flow->tidcnt++] = + EXP_TID_SET(IDX, rcventry >> 1) | + EXP_TID_SET(CTRL, tidctrl) | + EXP_TID_SET(LEN, npages); + trace_hfi1_tid_entry_alloc(/* entry */ + flow->req->qp, flow->tidcnt - 1, + flow->tid_entry[flow->tidcnt - 1]); + + /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ + flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); + npages = 0; + } + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_full_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_group_list, + &rcd->tid_used_list); + + grp->used++; + grp->map |= BIT(i); + cnt++; + } +} + +static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + u32 rcventry; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + + grp->used--; + grp->map &= ~BIT(i); + cnt++; + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_full_list, + &rcd->tid_used_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_group_list); + } + if (WARN_ON_ONCE(cnt & 1)) { + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + + dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + } +} + +static void kern_program_rcvarray(struct tid_rdma_flow *flow) +{ + u32 pset_idx = 0; + int i; + + flow->npkts = 0; + flow->tidcnt = 0; + for (i = 0; i < flow->tnode_cnt; i++) + kern_program_rcv_group(flow, i, &pset_idx); + trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow); +} + +/** + * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a + * TID RDMA request + * + * @req: TID RDMA request for which the segment/flow is being set up + * @ss: sge state, maintains state across successive segments of a sge + * @last: set to true after the last sge segment has been processed + * + * This function + * (1) finds a free flow entry in the flow circular buffer + * (2) finds pages and continuous physical chunks constituing one segment + * of an sge + * (3) allocates TID group entries for those chunks + * (4) programs rcvarray entries in the hardware corresponding to those + * TID's + * (5) computes a tidarray with formatted TID entries which can be sent + * to the sender + * (6) Reserves and programs HW flows. + * (7) It also manages queing the QP when TID/flow resources are not + * available. + * + * @req points to struct tid_rdma_request of which the segments are a part. The + * function uses qp, rcd and seg_len members of @req. In the absence of errors, + * req->flow_idx is the index of the flow which has been prepared in this + * invocation of function call. With flow = &req->flows[req->flow_idx], + * flow->tid_entry contains the TID array which the sender can use for TID RDMA + * sends and flow->npkts contains number of packets required to send the + * segment. + * + * hfi1_check_sge_align should be called prior to calling this function and if + * it signals error TID RDMA cannot be used for this sge and this function + * should not be called. + * + * For the queuing, caller must hold the flow->req->qp s_lock from the send + * engine and the function will procure the exp_lock. + * + * Return: + * The function returns -EAGAIN if sufficient number of TID/flow resources to + * map the segment could not be allocated. In this case the function should be + * called again with previous arguments to retry the TID allocation. There are + * no other error returns. The function returns 0 on success. + */ +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->setup_head]; + struct hfi1_ctxtdata *rcd = req->rcd; + struct hfi1_qp_priv *qpriv = req->qp->priv; + unsigned long flags; + struct rvt_qp *fqp; + u16 clear_tail = req->clear_tail; + + lockdep_assert_held(&req->qp->s_lock); + /* + * We return error if either (a) we don't have space in the flow + * circular buffer, or (b) we already have max entries in the buffer. + * Max entries depend on the type of request we are processing and the + * negotiated TID RDMA parameters. + */ + if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || + CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= + req->n_flows) + return -EINVAL; + + /* + * Get pages, identify contiguous physical memory chunks for the segment + * If we can not determine a DMA address mapping we will treat it just + * like if we ran out of space above. + */ + if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { + hfi1_wait_kmem(flow->req->qp); + return -ENOMEM; + } + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) + goto queue; + + /* + * At this point we know the number of pagesets and hence the number of + * TID's to map the segment. Allocate the TID's from the TID groups. If + * we cannot allocate the required number we exit and try again later + */ + if (kern_alloc_tids(flow)) + goto queue; + /* + * Finally program the TID entries with the pagesets, compute the + * tidarray and enable the HW flow + */ + kern_program_rcvarray(flow); + + /* + * Setup the flow state with relevant information. + * This information is used for tracking the sequence of data packets + * for the segment. + * The flow is setup here as this is the most accurate time and place + * to do so. Doing at a later time runs the risk of the flow data in + * qpriv getting out of sync. + */ + memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); + flow->idx = qpriv->flow_state.index; + flow->flow_state.generation = qpriv->flow_state.generation; + flow->flow_state.spsn = qpriv->flow_state.psn; + flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, flow->flow_state.spsn); + qpriv->flow_state.psn += flow->npkts; + + dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + tid_rdma_schedule_tid_wakeup(fqp); + + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) +{ + flow->npagesets = 0; +} + +/* + * This function is called after one segment has been successfully sent to + * release the flow and TID HW/SW resources for that segment. The segments for a + * TID RDMA request are setup and cleared in FIFO order which is managed using a + * circular buffer. + */ +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct hfi1_ctxtdata *rcd = req->rcd; + unsigned long flags; + int i; + struct rvt_qp *fqp; + + lockdep_assert_held(&req->qp->s_lock); + /* Exit if we have nothing in the flow circular buffer */ + if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) + return -EINVAL; + + spin_lock_irqsave(&rcd->exp_lock, flags); + + for (i = 0; i < flow->tnode_cnt; i++) + kern_unprogram_rcv_group(flow, i); + /* To prevent double unprogramming */ + flow->tnode_cnt = 0; + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + dma_unmap_flow(flow); + + hfi1_tid_rdma_reset_flow(flow); + req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); + + if (fqp == req->qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } + + return 0; +} + +/* + * This function is called to release all the tid entries for + * a request. + */ +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + /* Use memory barrier for proper ordering */ + while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { + if (hfi1_kern_exp_rcv_clear(req)) + break; + } +} + +/** + * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information + * @req: the tid rdma request to be cleaned + */ +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) +{ + kfree(req->flows); + req->flows = NULL; +} + +/** + * __trdma_clean_swqe - clean up for large sized QPs + * @qp: the queue patch + * @wqe: the send wqe + */ +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_swqe_priv *p = wqe->priv; + + hfi1_kern_exp_rcv_free_flows(&p->tid_req); +} + +/* + * This can be called at QP create time or in the data path. + */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp) +{ + struct tid_rdma_flow *flows; + int i; + + if (likely(req->flows)) + return 0; + flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, + req->rcd->numa_id); + if (!flows) + return -ENOMEM; + /* mini init */ + for (i = 0; i < MAX_FLOWS; i++) { + flows[i].req = req; + flows[i].npagesets = 0; + flows[i].pagesets[0].mapped = 0; + flows[i].resync_npkts = 0; + } + req->flows = flows; + return 0; +} + +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + /* + * Initialize various TID RDMA request variables. + * These variables are "static", which is why they + * can be pre-initialized here before the WRs has + * even been submitted. + * However, non-NULL values for these variables do not + * imply that this WQE has been enabled for TID RDMA. + * Drivers should check the WQE's opcode to determine + * if a request is a TID RDMA one or not. + */ + req->qp = qp; + req->rcd = qpriv->rcd; +} + +u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_tidwait; +} + +static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, + u32 psn, u16 *fidx) +{ + u16 head, tail; + struct tid_rdma_flow *flow; + + head = req->setup_head; + tail = req->clear_tail; + for ( ; CIRC_CNT(head, tail, MAX_FLOWS); + tail = CIRC_NEXT(tail, MAX_FLOWS)) { + flow = &req->flows[tail]; + if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && + cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { + if (fidx) + *fidx = tail; + return flow; + } + } + return NULL; +} + +/* TID RDMA READ functions */ +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_swqe_priv *wpriv = wqe->priv; + struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; + struct tid_rdma_params *remote; + u32 req_len = 0; + void *req_addr = NULL; + + /* This is the IB psn used to send the request */ + *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); + + /* TID Entries for TID RDMA READ payload */ + req_addr = &flow->tid_entry[flow->tid_idx]; + req_len = sizeof(*flow->tid_entry) * + (flow->tidcnt - flow->tid_idx); + + memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); + wpriv->ss.sge.vaddr = req_addr; + wpriv->ss.sge.sge_length = req_len; + wpriv->ss.sge.length = wpriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. + */ + wpriv->ss.sge.mr = NULL; + wpriv->ss.sge.m = 0; + wpriv->ss.sge.n = 0; + + wpriv->ss.sg_list = NULL; + wpriv->ss.total_len = wpriv->ss.sge.sge_length; + wpriv->ss.num_sge = 1; + + /* Construct the TID RDMA READ REQ packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(rreq->kdeth0, KVER, 0x1); + KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); + rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + + req->cur_seg * req->seg_len + flow->sent); + rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); + rreq->reth.length = cpu_to_be32(*len); + rreq->tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + ((flow->flow_state.spsn + flow->pkt) & + HFI1_KDETH_BTH_SEQ_MASK)); + rreq->tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + *bth2 |= IB_BTH_REQ_ACK; + rcu_read_unlock(); + + /* We are done with this segment */ + flow->sent += *len; + req->cur_seg++; + qp->s_state = TID_OP(READ_REQ); + req->ack_pending++; + req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); + qpriv->pending_tid_r_segs++; + qp->s_num_rd_atomic++; + + /* Set the TID RDMA READ request payload size */ + *len = req_len; + + return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); +} + +/* + * @len: contains the data length to read upon entry and the read request + * payload length upon exit. + */ +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = NULL; + u32 hdwords = 0; + bool last; + bool retry = true; + u32 npkts = rvt_div_round_up_mtu(qp, *len); + + trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + /* + * Check sync conditions. Make sure that there are no pending + * segments before freeing the flow. + */ +sync_check: + if (req->state == TID_REQUEST_SYNC) { + if (qpriv->pending_tid_r_segs) + goto done; + + hfi1_kern_clear_hw_flow(req->rcd, qp); + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + req->state = TID_REQUEST_ACTIVE; + } + + /* + * If the request for this segment is resent, the tid resources should + * have been allocated before. In this case, req->flow_idx should + * fall behind req->setup_head. + */ + if (req->flow_idx == req->setup_head) { + retry = false; + if (req->state == TID_REQUEST_RESEND) { + /* + * This is the first new segment for a request whose + * earlier segments have been re-sent. We need to + * set up the sge pointer correctly. + */ + restart_sge(&qp->s_sge, wqe, req->s_next_psn, + qp->pmtu); + req->isge = 0; + req->state = TID_REQUEST_ACTIVE; + } + + /* + * Check sync. The last PSN of each generation is reserved for + * RESYNC. + */ + if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { + req->state = TID_REQUEST_SYNC; + goto sync_check; + } + + /* Allocate the flow if not yet */ + if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) + goto done; + + /* + * The following call will advance req->setup_head after + * allocating the tid entries. + */ + if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { + req->state = TID_REQUEST_QUEUED; + + /* + * We don't have resources for this segment. The QP has + * already been queued. + */ + goto done; + } + } + + /* req->flow_idx should only be one slot behind req->setup_head */ + flow = &req->flows[req->flow_idx]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->sent = 0; + if (!retry) { + /* Set the first and last IB PSN for the flow in use.*/ + flow->flow_state.ib_spsn = req->s_next_psn; + flow->flow_state.ib_lpsn = + flow->flow_state.ib_spsn + flow->npkts - 1; + } + + /* Calculate the next segment start psn.*/ + req->s_next_psn += flow->npkts; + + /* Build the packet header */ + hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); +done: + return hdwords; +} + +/* + * Validate and accept the TID RDMA READ request parameters. + * Return 0 if the request is accepted successfully; + * Return 1 otherwise. + */ +static int tid_rdma_rcv_read_request(struct rvt_qp *qp, + struct rvt_ack_entry *e, + struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + u32 bth0, u32 psn, u64 vaddr, u32 len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 flow_psn, i, tidlen = 0, pktlen, tlen; + + req = ack_to_tid_req(e); + + /* Validate the payload first */ + flow = &req->flows[req->setup_head]; + + /* payload length = packet length - (header length + ICRC length) */ + pktlen = packet->tlen - (packet->hlen + 4); + if (pktlen > sizeof(flow->tid_entry)) + return 1; + memcpy(flow->tid_entry, packet->ebuf, pktlen); + flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + + /* + * Walk the TID_ENTRY list to make sure we have enough space for a + * complete segment. Also calculate the number of required packets. + */ + flow->npkts = rvt_div_round_up_mtu(qp, len); + for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_read_req(qp, i, + flow->tid_entry[i]); + tlen = EXP_TID_GET(flow->tid_entry[i], LEN); + if (!tlen) + return 1; + + /* + * For tid pair (tidctr == 3), the buffer size of the pair + * should be the sum of the buffer size described by each + * tid entry. However, only the first entry needs to be + * specified in the request (see WFR HAS Section 8.5.7.1). + */ + tidlen += tlen; + } + if (tidlen * PAGE_SIZE < len) + return 1; + + /* Empty the flow array */ + req->clear_tail = req->setup_head; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + flow->sent = 0; + flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); + flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & + TID_RDMA_DESTQP_FLOW_MASK; + flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); + flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; + flow->length = len; + + flow->flow_state.lpsn = flow->flow_state.spsn + + flow->npkts - 1; + flow->flow_state.ib_spsn = psn; + flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; + + trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); + /* Set the initial flow index to the current flow. */ + req->flow_idx = req->setup_head; + + /* advance circular buffer head */ + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + + /* + * Compute last PSN for request. + */ + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = psn + flow->npkts - 1; + e->sent = 0; + + req->n_flows = qpriv->tid_rdma.local.max_read; + req->state = TID_REQUEST_ACTIVE; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = 1; + req->r_flow_psn = e->psn; + + trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); + return 0; +} + +static int tid_rdma_rcv_error(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + unsigned long flags; + u8 prev; + bool old_req; + + trace_hfi1_rsp_tid_rcv_error(qp, psn); + trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); + if (diff > 0) { + /* sequence error */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + qp->r_ack_psn = qp->r_psn; + rc_defered_ack(rcd, qp); + } + goto done; + } + + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + e = find_prev_entry(qp, psn, &prev, NULL, &old_req); + if (!e || (e->opcode != TID_OP(READ_REQ) && + e->opcode != TID_OP(WRITE_REQ))) + goto unlock; + + req = ack_to_tid_req(e); + req->r_flow_psn = psn; + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); + if (e->opcode == TID_OP(READ_REQ)) { + struct ib_reth *reth; + u32 len; + u32 rkey; + u64 vaddr; + int ok; + u32 bth0; + + reth = &ohdr->u.tid_rdma.r_req.reth; + /* + * The requester always restarts from the start of the original + * request. + */ + len = be32_to_cpu(reth->length); + if (psn != e->psn || len != req->total_len) + goto unlock; + + release_rdma_sge_mr(e); + + rkey = be32_to_cpu(reth->rkey); + vaddr = get_ib_reth_vaddr(reth); + + qp->r_len = len; + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock; + + /* + * If all the response packets for the current request have + * been sent out and this request is complete (old_request + * == false) and the TID flow may be unusable (the + * req->clear_tail is advanced). However, when an earlier + * request is received, this request will not be complete any + * more (qp->s_tail_ack_queue is moved back, see below). + * Consequently, we need to update the TID flow info everytime + * a duplicate request is received. + */ + bth0 = be32_to_cpu(ohdr->bth[0]); + if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, + vaddr, len)) + goto unlock; + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue); + */ + if (old_req) + goto unlock; + } else { + struct flow_state *fstate; + bool schedule = false; + u8 i; + + if (req->state == TID_REQUEST_RESEND) { + req->state = TID_REQUEST_RESEND_ACTIVE; + } else if (req->state == TID_REQUEST_INIT_RESEND) { + req->state = TID_REQUEST_INIT; + schedule = true; + } + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue). + * Also, don't change requests, which are at the SYNC + * point and haven't generated any responses yet. + * There is nothing to retransmit for them yet. + */ + if (old_req || req->state == TID_REQUEST_INIT || + (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { + for (i = prev + 1; ; i++) { + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + if (e->opcode == TID_OP(WRITE_REQ) && + req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + } + /* + * If the state of the request has been changed, + * the first leg needs to get scheduled in order to + * pick up the change. Otherwise, normal response + * processing should take care of it. + */ + if (!schedule) + goto unlock; + } + + /* + * If there is no more allocated segment, just schedule the qp + * without changing any state. + */ + if (req->clear_tail == req->setup_head) + goto schedule; + /* + * If this request has sent responses for segments, which have + * not received data yet (flow_idx != clear_tail), the flow_idx + * pointer needs to be adjusted so the same responses can be + * re-sent. + */ + if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { + fstate = &req->flows[req->clear_tail].flow_state; + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, req->clear_tail, + MAX_FLOWS); + req->flow_idx = + CIRC_ADD(req->clear_tail, + delta_psn(psn, fstate->resp_ib_psn), + MAX_FLOWS); + qpriv->pending_tid_w_segs += + delta_psn(psn, fstate->resp_ib_psn); + /* + * When flow_idx == setup_head, we've gotten a duplicate + * request for a segment, which has not been allocated + * yet. In that case, don't adjust this request. + * However, we still want to go through the loop below + * to adjust all subsequent requests. + */ + if (CIRC_CNT(req->setup_head, req->flow_idx, + MAX_FLOWS)) { + req->cur_seg = delta_psn(psn, e->psn); + req->state = TID_REQUEST_RESEND_ACTIVE; + } + } + + for (i = prev + 1; ; i++) { + /* + * Look at everything up to and including + * s_tail_ack_queue + */ + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (e->opcode != TID_OP(WRITE_REQ) || + req->cur_seg == req->comp_seg || + req->state == TID_REQUEST_INIT || + req->state == TID_REQUEST_INIT_RESEND) { + if (req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + continue; + } + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, + req->clear_tail, + MAX_FLOWS); + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + req->cur_seg = req->comp_seg; + } + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; + } + /* Re-process old requests.*/ + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; + qp->s_tail_ack_queue = prev; + /* + * Since the qp->s_tail_ack_queue is modified, the + * qp->s_ack_state must be changed to re-initialize + * qp->s_ack_rdma_sge; Otherwise, we will end up in + * wrong memory region. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); +schedule: + /* + * It's possible to receive a retry psn that is earlier than an RNRNAK + * psn. In this case, the rnrnak state should be cleared. + */ + if (qpriv->rnr_nak_state) { + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + hfi1_tid_write_alloc_resources(qp, true); + } + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; +} + +void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ + * (see hfi1_rc_rcv()) + * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Initialize struct tid_rdma_flow info; + * - Copy TID entries; + * 3. Set the qp->s_ack_state. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + u32 bth0, psn, len, rkey; + bool fecn; + u8 next; + u64 vaddr; + int diff; + u8 nack_state = IB_NAK_INVALID_REQUEST; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_read_req(qp, psn); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.r_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) + goto nack_inv; + + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); + return; + } + + /* We've verified the request, insert it into the ack queue. */ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) { + nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + goto nack_inv_unlock; + } + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + release_rdma_sge_mr(e); + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_READ))) + goto nack_acc; + + /* Accept the request parameters */ + if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, + len)) + goto nack_inv_unlock; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn += e->lpsn - e->psn + 1; + + qp->r_head_ack_queue = next; + + /* + * For all requests other than TID WRITE which are added to the ack + * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to + * do this because of interlocks between these and TID WRITE + * requests. The same change has also been made in hfi1_rc_rcv(). + */ + qpriv->r_tid_alloc = qp->r_head_ack_queue; + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + if (fecn) + qp->s_flags |= RVT_S_ECN; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = nack_state; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +} + +u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth0, + u32 *bth1, u32 *bth2, u32 *len, bool *last) +{ + struct hfi1_ack_priv *epriv = e->priv; + struct tid_rdma_request *req = &epriv->tid_req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + u32 tidentry = flow->tid_entry[flow->tid_idx]; + u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; + struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; + u32 next_offset, om = KDETH_OM_LARGE; + bool last_pkt; + u32 hdwords = 0; + struct tid_rdma_params *remote; + + *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); + flow->sent += *len; + next_offset = flow->tid_offset + *len; + last_pkt = (flow->sent >= flow->length); + + trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + if (!remote) { + rcu_read_unlock(); + goto done; + } + KDETH_RESET(resp->kdeth0, KVER, 0x1); + KDETH_SET(resp->kdeth0, SH, !last_pkt); + KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); + KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); + KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); + KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); + KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); + KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); + resp->verbs_qp = cpu_to_be32(qp->remote_qpn); + rcu_read_unlock(); + + resp->aeth = rvt_compute_aeth(qp); + resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + + flow->pkt)); + + *bth0 = TID_OP(READ_RESP) << 24; + *bth1 = flow->tid_qpn; + *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & + HFI1_KDETH_BTH_SEQ_MASK) | + (flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT)); + *last = last_pkt; + if (last_pkt) + /* Advance to next flow */ + req->clear_tail = (req->clear_tail + 1) & + (MAX_FLOWS - 1); + + if (next_offset >= tidlen) { + flow->tid_offset = 0; + flow->tid_idx++; + } else { + flow->tid_offset = next_offset; + } + + hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); + +done: + return hdwords; +} + +static inline struct tid_rdma_request * +find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) + __must_hold(&qp->s_lock) +{ + struct rvt_swqe *wqe; + struct tid_rdma_request *req = NULL; + u32 i, end; + + end = qp->s_cur + 1; + if (end == qp->s_size) + end = 0; + for (i = qp->s_acked; i != end;) { + wqe = rvt_get_swqe_ptr(qp, i); + if (cmp_psn(psn, wqe->psn) >= 0 && + cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == opcode) + req = wqe_to_tid_req(wqe); + break; + } + if (++i == qp->s_size) + i = 0; + } + + return req; +} + +void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ + + /* + * 1. Find matching SWQE + * 2. Check that the entire segment has been read. + * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. + * 4. Free the TID flow resources. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 opcode, aeth; + bool fecn; + unsigned long flags; + u32 kpsn, ipsn; + + trace_hfi1_sender_rcv_tid_read_resp(qp); + fecn = process_ecn(qp, packet); + kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + spin_lock_irqsave(&qp->s_lock, flags); + ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); + req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); + if (unlikely(!req)) + goto ack_op_err; + + flow = &req->flows[req->clear_tail]; + /* When header suppression is disabled */ + if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) { + update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); + + if (cmp_psn(kpsn, flow->flow_state.r_next_psn)) + goto ack_done; + flow->flow_state.r_next_psn = mask_psn(kpsn + 1); + /* + * Copy the payload to destination buffer if this packet is + * delivered as an eager packet due to RSM rule and FECN. + * The RSM rule selects FECN bit in BTH and SH bit in + * KDETH header and therefore will not match the last + * packet of each segment that has SH bit cleared. + */ + if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { + struct rvt_sge_state ss; + u32 len; + u32 tlen = packet->tlen; + u16 hdrsize = packet->hlen; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + + (SIZE_OF_CRC << 2); + u32 pmtu = qp->pmtu; + + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto ack_op_err; + len = restart_sge(&ss, req->e.swqe, ipsn, pmtu); + if (unlikely(len < pmtu)) + goto ack_op_err; + rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, + false); + /* Raise the sw sequence check flag for next packet */ + priv->s_flags |= HFI1_R_TID_SW_PSN; + } + + goto ack_done; + } + flow->flow_state.r_next_psn = mask_psn(kpsn + 1); + req->ack_pending--; + priv->pending_tid_r_segs--; + qp->s_num_rd_atomic--; + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + + trace_hfi1_ack(qp, ipsn); + trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, + req->e.swqe->psn, req->e.swqe->lpsn, + req); + trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); + + /* Release the tid resources */ + hfi1_kern_exp_rcv_clear(req); + + if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) + goto ack_done; + + /* If not done yet, build next read request */ + if (++req->comp_seg >= req->total_segs) { + priv->tid_r_comp++; + req->state = TID_REQUEST_COMPLETE; + } + + /* + * Clear the hw flow under two conditions: + * 1. This request is a sync point and it is complete; + * 2. Current request is completed and there are no more requests. + */ + if ((req->state == TID_REQUEST_SYNC && + req->comp_seg == req->cur_seg) || + priv->tid_r_comp == priv->tid_r_reqs) { + hfi1_kern_clear_hw_flow(priv->rcd, qp); + priv->s_flags &= ~HFI1_R_TID_SW_PSN; + if (req->state == TID_REQUEST_SYNC) + req->state = TID_REQUEST_ACTIVE; + } + + hfi1_schedule_send(qp); + goto ack_done; + +ack_op_err: + /* + * The test indicates that the send engine has finished its cleanup + * after sending the request and it's now safe to put the QP into error + * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail + * == qp->s_head), it would be unsafe to complete the wqe pointed by + * qp->s_acked here. Putting the qp into error state will safely flush + * all remaining requests. + */ + if (qp->s_last == qp->s_acked) + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + u32 n = qp->s_acked; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + /* Free any TID entries */ + while (n != qp->s_tail) { + wqe = rvt_get_swqe_ptr(qp, n); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + req = wqe_to_tid_req(wqe); + hfi1_kern_exp_rcv_clear_all(req); + } + + if (++n == qp->s_size) + n = 0; + } + /* Free flow */ + hfi1_kern_clear_hw_flow(priv->rcd, qp); +} + +static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type) +{ + struct rvt_qp *qp = packet->qp; + + if (rcv_type >= RHF_RCV_TYPE_IB) + goto done; + + spin_lock(&qp->s_lock); + + /* + * We've ran out of space in the eager buffer. + * Eagerly received KDETH packets which require space in the + * Eager buffer (packet that have payload) are TID RDMA WRITE + * response packets. In this case, we have to re-transmit the + * TID RDMA WRITE request. + */ + if (rcv_type == RHF_RCV_TYPE_EAGER) { + hfi1_restart_rc(qp, qp->s_last_psn + 1, 1); + hfi1_schedule_send(qp); + } + + /* Since no payload is delivered, just drop the packet */ + spin_unlock(&qp->s_lock); +done: + return true; +} + +static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + + /* Start from the right segment */ + qp->r_flags |= RVT_R_RDMAR_SEQ; + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->clear_tail]; + hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/* + * Handle the KDETH eflags for TID RDMA READ response. + * + * Return true if the last packet for a segment has been received and it is + * time to process the response normally; otherwise, return true. + * + * The caller must hold the packet->qp->r_lock and the rcu_read_lock. + */ +static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, u8 rcv_type, + u8 rte, u32 psn, u32 ibpsn) + __must_hold(&packet->qp->r_lock) __must_hold(RCU) +{ + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ibport *ibp; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 ack_psn; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + bool ret = true; + int diff = 0; + u32 fpsn; + + lockdep_assert_held(&qp->r_lock); + trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn); + trace_hfi1_sender_read_kdeth_eflags(qp); + trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0); + spin_lock(&qp->s_lock); + /* If the psn is out of valid range, drop the packet */ + if (cmp_psn(ibpsn, qp->s_last_psn) < 0 || + cmp_psn(ibpsn, qp->s_psn) > 0) + goto s_unlock; + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. + */ + ack_psn = ibpsn - 1; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* Complete WQEs that the PSN finishes. */ + while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) { + /* + * If this request is a RDMA read or atomic, and the NACK is + * for a later operation, this NACK NAKs the RDMA read or + * atomic. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + restart_tid_rdma_read_req(rcd, qp, + wqe); + } else { + hfi1_restart_rc(qp, qp->s_last_psn + 1, + 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(/* wait */ + &qp->rspwait, + &rcd->qp_wait_list); + } + } + } + /* + * No need to process the NAK since we are + * restarting an earlier request. + */ + break; + } + + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + goto s_unlock; + } + + if (qp->s_acked == qp->s_tail) + goto s_unlock; + + /* Handle the eflags for the request */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + goto s_unlock; + + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + switch (rcv_type) { + case RHF_RCV_TYPE_EXPECTED: + switch (rte) { + case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: + /* + * On the first occurrence of a Flow Sequence error, + * the flag TID_FLOW_SW_PSN is set. + * + * After that, the flow is *not* reprogrammed and the + * protocol falls back to SW PSN checking. This is done + * to prevent continuous Flow Sequence errors for any + * packets that could be still in the fabric. + */ + flow = &req->flows[req->clear_tail]; + trace_hfi1_tid_flow_read_kdeth_eflags(qp, + req->clear_tail, + flow); + if (priv->s_flags & HFI1_R_TID_SW_PSN) { + diff = cmp_psn(psn, + flow->flow_state.r_next_psn); + if (diff > 0) { + /* Drop the packet.*/ + goto s_unlock; + } else if (diff < 0) { + /* + * If a response packet for a restarted + * request has come back, reset the + * restart flag. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) + qp->r_flags &= + ~RVT_R_RDMAR_SEQ; + + /* Drop the packet.*/ + goto s_unlock; + } + + /* + * If SW PSN verification is successful and + * this is the last packet in the segment, tell + * the caller to process it as a normal packet. + */ + fpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + if (cmp_psn(fpsn, psn) == 0) { + ret = false; + if (qp->r_flags & RVT_R_RDMAR_SEQ) + qp->r_flags &= + ~RVT_R_RDMAR_SEQ; + } + flow->flow_state.r_next_psn = + mask_psn(psn + 1); + } else { + u32 last_psn; + + last_psn = read_r_next_psn(dd, rcd->ctxt, + flow->idx); + flow->flow_state.r_next_psn = last_psn; + priv->s_flags |= HFI1_R_TID_SW_PSN; + /* + * If no request has been restarted yet, + * restart the current one. + */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) + restart_tid_rdma_read_req(rcd, qp, + wqe); + } + + break; + + case RHF_RTE_EXPECTED_FLOW_GEN_ERR: + /* + * Since the TID flow is able to ride through + * generation mismatch, drop this stale packet. + */ + break; + + default: + break; + } + break; + + case RHF_RCV_TYPE_ERROR: + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: + case RHF_RTE_ERROR_KHDR_HCRC_ERR: + case RHF_RTE_ERROR_KHDR_KVER_ERR: + case RHF_RTE_ERROR_CONTEXT_ERR: + case RHF_RTE_ERROR_KHDR_TID_ERR: + default: + break; + } + break; + default: + break; + } +s_unlock: + spin_unlock(&qp->s_lock); + return ret; +} + +bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, + struct hfi1_pportdata *ppd, + struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct hfi1_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u8 rcv_type = rhf_rcv_type(packet->rhf); + u8 rte = rhf_rcv_type_err(packet->rhf); + struct ib_header *hdr = packet->hdr; + struct ib_other_headers *ohdr = NULL; + int lnh = be16_to_cpu(hdr->lrh[0]) & 3; + u16 lid = be16_to_cpu(hdr->lrh[1]); + u8 opcode; + u32 qp_num, psn, ibpsn; + struct rvt_qp *qp; + struct hfi1_qp_priv *qpriv; + unsigned long flags; + bool ret = true; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + int diff = 0; + + trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ", + packet->rhf); + if (packet->rhf & RHF_ICRC_ERR) + return ret; + + packet->ohdr = &hdr->u.oth; + ohdr = packet->ohdr; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) & + RVT_QPN_MASK; + if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto drop; + + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) + goto rcu_unlock; + + packet->qp = qp; + + /* Check for valid receive state. */ + spin_lock_irqsave(&qp->r_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto r_unlock; + } + + if (packet->rhf & RHF_TID_ERR) { + /* For TIDERR and RC QPs preemptively schedule a NAK */ + u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + + /* Sanity check packet */ + if (tlen < 24) + goto r_unlock; + + /* + * Check for GRH. We should never get packets with GRH in this + * path. + */ + if (lnh == HFI1_LRH_GRH) + goto r_unlock; + + if (tid_rdma_tid_err(packet, rcv_type)) + goto r_unlock; + } + + /* handle TID RDMA READ */ + if (opcode == TID_OP(READ_RESP)) { + ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn); + ibpsn = mask_psn(ibpsn); + ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn, + ibpsn); + goto r_unlock; + } + + /* + * qp->s_tail_ack_queue points to the rvt_ack_entry currently being + * processed. These a completed sequentially so we can be sure that + * the pointer will not change until the entire request has completed. + */ + spin_lock(&qp->s_lock); + qpriv = qp->priv; + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID || + qpriv->r_tid_tail == qpriv->r_tid_head) + goto unlock; + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + if (e->opcode != TID_OP(WRITE_REQ)) + goto unlock; + req = ack_to_tid_req(e); + if (req->comp_seg == req->cur_seg) + goto unlock; + flow = &req->flows[req->clear_tail]; + trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn); + trace_hfi1_rsp_handle_kdeth_eflags(qp, psn); + trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp); + trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn, + e->lpsn, req); + trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow); + + switch (rcv_type) { + case RHF_RCV_TYPE_EXPECTED: + switch (rte) { + case RHF_RTE_EXPECTED_FLOW_SEQ_ERR: + if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) { + qpriv->s_flags |= HFI1_R_TID_SW_PSN; + flow->flow_state.r_next_psn = + read_r_next_psn(dd, rcd->ctxt, + flow->idx); + qpriv->r_next_psn_kdeth = + flow->flow_state.r_next_psn; + goto nak_psn; + } else { + /* + * If the received PSN does not match the next + * expected PSN, NAK the packet. + * However, only do that if we know that the a + * NAK has already been sent. Otherwise, this + * mismatch could be due to packets that were + * already in flight. + */ + diff = cmp_psn(psn, + flow->flow_state.r_next_psn); + if (diff > 0) + goto nak_psn; + else if (diff < 0) + break; + + qpriv->s_nak_state = 0; + /* + * If SW PSN verification is successful and this + * is the last packet in the segment, tell the + * caller to process it as a normal packet. + */ + if (psn == full_flow_psn(flow, + flow->flow_state.lpsn)) + ret = false; + flow->flow_state.r_next_psn = + mask_psn(psn + 1); + qpriv->r_next_psn_kdeth = + flow->flow_state.r_next_psn; + } + break; + + case RHF_RTE_EXPECTED_FLOW_GEN_ERR: + goto nak_psn; + + default: + break; + } + break; + + case RHF_RCV_TYPE_ERROR: + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR: + case RHF_RTE_ERROR_KHDR_HCRC_ERR: + case RHF_RTE_ERROR_KHDR_KVER_ERR: + case RHF_RTE_ERROR_CONTEXT_ERR: + case RHF_RTE_ERROR_KHDR_TID_ERR: + default: + break; + } + break; + default: + break; + } + +unlock: + spin_unlock(&qp->s_lock); +r_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +rcu_unlock: + rcu_read_unlock(); +drop: + return ret; +nak_psn: + ibp->rvp.n_rc_seqnak++; + if (!qpriv->s_nak_state) { + qpriv->s_nak_state = IB_NAK_PSN_ERROR; + /* We are NAK'ing the next expected PSN */ + qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn); + tid_rdma_trigger_ack(qp); + } + goto unlock; +} + +/* + * "Rewind" the TID request information. + * This means that we reset the state back to ACTIVE, + * find the proper flow, set the flow index to that flow, + * and reset the flow information. + */ +void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + u32 *bth2) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow; + struct hfi1_qp_priv *qpriv = qp->priv; + int diff, delta_pkts; + u32 tididx = 0, i; + u16 fidx; + + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + *bth2 = mask_psn(qp->s_psn); + flow = find_flow_ib(req, *bth2, &fidx); + if (!flow) { + trace_hfi1_msg_tid_restart_req(/* msg */ + qp, "!!!!!! Could not find flow to restart: bth2 ", + (u64)*bth2); + trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); + return; + } + } else { + fidx = req->acked_tail; + flow = &req->flows[fidx]; + *bth2 = mask_psn(req->r_ack_psn); + } + + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn); + else + delta_pkts = delta_psn(*bth2, + full_flow_psn(flow, + flow->flow_state.spsn)); + + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); + diff = delta_pkts + flow->resync_npkts; + + flow->sent = 0; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + if (diff) { + for (tididx = 0; tididx < flow->tidcnt; tididx++) { + u32 tidentry = flow->tid_entry[tididx], tidlen, + tidnpkts, npkts; + + flow->tid_offset = 0; + tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE; + tidnpkts = rvt_div_round_up_mtu(qp, tidlen); + npkts = min_t(u32, diff, tidnpkts); + flow->pkt += npkts; + flow->sent += (npkts == tidnpkts ? tidlen : + npkts * qp->pmtu); + flow->tid_offset += npkts * qp->pmtu; + diff -= npkts; + if (!diff) + break; + } + } + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) + + flow->sent, 0); + /* + * Packet PSN is based on flow_state.spsn + flow->pkt. However, + * during a RESYNC, the generation is incremented and the + * sequence is reset to 0. Since we've adjusted the npkts in the + * flow and the SGE has been sufficiently advanced, we have to + * adjust flow->pkt in order to calculate the correct PSN. + */ + flow->pkt -= flow->resync_npkts; + } + + if (flow->tid_offset == + EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) { + tididx++; + flow->tid_offset = 0; + } + flow->tid_idx = tididx; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + /* Move flow_idx to correct index */ + req->flow_idx = fidx; + else + req->clear_tail = fidx; + + trace_hfi1_tid_flow_restart_req(qp, fidx, flow); + trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + req->state = TID_REQUEST_ACTIVE; + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + /* Reset all the flows that we are going to resend */ + fidx = CIRC_NEXT(fidx, MAX_FLOWS); + i = qpriv->s_tid_tail; + do { + for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS); + fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { + req->flows[fidx].sent = 0; + req->flows[fidx].pkt = 0; + req->flows[fidx].tid_idx = 0; + req->flows[fidx].tid_offset = 0; + req->flows[fidx].resync_npkts = 0; + } + if (i == qpriv->s_tid_cur) + break; + do { + i = (++i == qp->s_size ? 0 : i); + wqe = rvt_get_swqe_ptr(qp, i); + } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE); + req = wqe_to_tid_req(wqe); + req->cur_seg = req->ack_seg; + fidx = req->acked_tail; + /* Pull req->clear_tail back */ + req->clear_tail = fidx; + } while (1); + } +} + +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) +{ + int i, ret; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_flow_state *fs; + + if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA)) + return; + + /* + * First, clear the flow to help prevent any delayed packets from + * being delivered. + */ + fs = &qpriv->flow_state; + if (fs->index != RXE_NUM_TID_FLOWS) + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + + for (i = qp->s_acked; i != qp->s_head;) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + if (++i == qp->s_size) + i = 0; + /* Free only locally allocated TID entries */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + continue; + do { + struct hfi1_swqe_priv *priv = wqe->priv; + + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } + for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) + i = 0; + /* Free only locally allocated TID entries */ + if (e->opcode != TID_OP(WRITE_REQ)) + continue; + do { + struct hfi1_ack_priv *priv = e->priv; + + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } +} + +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct rvt_swqe *prev; + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + struct tid_rdma_request *req; + + s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1; + prev = rvt_get_swqe_ptr(qp, s_prev); + + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + switch (prev->wr.opcode) { + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + break; + default: + break; + } + break; + case IB_WR_RDMA_READ: + if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + fallthrough; + case IB_WR_TID_RDMA_READ: + switch (prev->wr.opcode) { + case IB_WR_RDMA_READ: + if (qp->s_acked != qp->s_cur) + goto interlock; + break; + case IB_WR_TID_RDMA_WRITE: + req = wqe_to_tid_req(prev); + if (req->ack_seg != req->total_segs) + goto interlock; + break; + default: + break; + } + break; + default: + break; + } + return false; + +interlock: + priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK; + return true; +} + +/* Does @sge meet the alignment requirements for tid rdma? */ +static inline bool hfi1_check_sge_align(struct rvt_qp *qp, + struct rvt_sge *sge, int num_sge) +{ + int i; + + for (i = 0; i < num_sge; i++, sge++) { + trace_hfi1_sge_check_align(qp, i, sge); + if ((u64)sge->vaddr & ~PAGE_MASK || + sge->sge_length & ~PAGE_MASK) + return false; + } + return true; +} + +void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct hfi1_swqe_priv *priv = wqe->priv; + struct tid_rdma_params *remote; + enum ib_wr_opcode new_opcode; + bool do_tid_rdma = false; + struct hfi1_pportdata *ppd = qpriv->rcd->ppd; + + if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == + ppd->lid) + return; + if (qpriv->hdr_type != HFI1_PKT_TYPE_9B) + return; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * If TID RDMA is disabled by the negotiation, don't + * use it. + */ + if (!remote) + goto exit; + + if (wqe->wr.opcode == IB_WR_RDMA_READ) { + if (hfi1_check_sge_align(qp, &wqe->sg_list[0], + wqe->wr.num_sge)) { + new_opcode = IB_WR_TID_RDMA_READ; + do_tid_rdma = true; + } + } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + /* + * TID RDMA is enabled for this RDMA WRITE request iff: + * 1. The remote address is page-aligned, + * 2. The length is larger than the minimum segment size, + * 3. The length is page-multiple. + */ + if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) && + !(wqe->length & ~PAGE_MASK)) { + new_opcode = IB_WR_TID_RDMA_WRITE; + do_tid_rdma = true; + } + } + + if (do_tid_rdma) { + if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC)) + goto exit; + wqe->wr.opcode = new_opcode; + priv->tid_req.seg_len = + min_t(u32, remote->max_len, wqe->length); + priv->tid_req.total_segs = + DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len); + /* Compute the last PSN of the request */ + wqe->lpsn = wqe->psn; + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + priv->tid_req.n_flows = remote->max_read; + qpriv->tid_r_reqs++; + wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1; + } else { + wqe->lpsn += priv->tid_req.total_segs - 1; + atomic_inc(&qpriv->n_requests); + } + + priv->tid_req.cur_seg = 0; + priv->tid_req.comp_seg = 0; + priv->tid_req.ack_seg = 0; + priv->tid_req.state = TID_REQUEST_INACTIVE; + /* + * Reset acked_tail. + * TID RDMA READ does not have ACKs so it does not + * update the pointer. We have to reset it so TID RDMA + * WRITE does not get confused. + */ + priv->tid_req.acked_tail = priv->tid_req.setup_head; + trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode, + wqe->psn, wqe->lpsn, + &priv->tid_req); + } +exit: + rcu_read_unlock(); +} + +/* TID RDMA WRITE functions */ + +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * Set the number of flow to be used based on negotiated + * parameters. + */ + req->n_flows = remote->max_write; + req->state = TID_REQUEST_ACTIVE; + + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_req.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); + ohdr->u.tid_rdma.w_req.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); + ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + qp->s_state = TID_OP(WRITE_REQ); + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + *bth2 |= IB_BTH_REQ_ACK; + *len = 0; + + rcu_read_unlock(); + return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); +} + +static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp) +{ + /* + * Heuristic for computing the RNR timeout when waiting on the flow + * queue. Rather than a computationaly expensive exact estimate of when + * a flow will be available, we assume that if a QP is at position N in + * the flow queue it has to wait approximately (N + 1) * (number of + * segments between two sync points). The rationale for this is that + * flows are released and recycled at each sync point. + */ + return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT; +} + +static u32 position_in_queue(struct hfi1_qp_priv *qpriv, + struct tid_queue *queue) +{ + return qpriv->tid_enqueue - queue->dequeue; +} + +/* + * @qp: points to rvt_qp context. + * @to_seg: desired RNR timeout in segments. + * Return: index of the next highest timeout in the ib_hfi1_rnr_table[] + */ +static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + u64 timeout; + u32 bytes_per_us; + u8 i; + + bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8; + timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us; + /* + * Find the next highest value in the RNR table to the required + * timeout. This gives the responder some padding. + */ + for (i = 1; i <= IB_AETH_CREDIT_MASK; i++) + if (rvt_rnr_tbl_to_usec(i) >= timeout) + return i; + return 0; +} + +/* + * Central place for resource allocation at TID write responder, + * is called from write_req and write_data interrupt handlers as + * well as the send thread when a queued QP is scheduled for + * resource allocation. + * + * Iterates over (a) segments of a request and then (b) queued requests + * themselves to allocate resources for up to local->max_write + * segments across multiple requests. Stop allocating when we + * hit a sync point, resume allocating after data packets at + * sync point have been received. + * + * Resource allocation and sending of responses is decoupled. The + * request/segment which are being allocated and sent are as follows. + * Resources are allocated for: + * [request: qpriv->r_tid_alloc, segment: req->alloc_seg] + * The send thread sends: + * [request: qp->s_tail_ack_queue, segment:req->cur_seg] + */ +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx) +{ + struct tid_rdma_request *req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct tid_rdma_params *local = &qpriv->tid_rdma.local; + struct rvt_ack_entry *e; + u32 npkts, to_seg; + bool last; + int ret = 0; + + lockdep_assert_held(&qp->s_lock); + + while (1) { + trace_hfi1_rsp_tid_write_alloc_res(qp, 0); + trace_hfi1_tid_write_rsp_alloc_res(qp); + /* + * Don't allocate more segments if a RNR NAK has already been + * scheduled to avoid messing up qp->r_psn: the RNR NAK will + * be sent only when all allocated segments have been sent. + * However, if more segments are allocated before that, TID RDMA + * WRITE RESP packets will be sent out for these new segments + * before the RNR NAK packet. When the requester receives the + * RNR NAK packet, it will restart with qp->s_last_psn + 1, + * which does not match qp->r_psn and will be dropped. + * Consequently, the requester will exhaust its retries and + * put the qp into error state. + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND) + break; + + /* No requests left to process */ + if (qpriv->r_tid_alloc == qpriv->r_tid_head) { + /* If all data has been received, clear the flow */ + if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS && + !qpriv->alloc_w_segs) { + hfi1_kern_clear_hw_flow(rcd, qp); + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + } + break; + } + + e = &qp->s_ack_queue[qpriv->r_tid_alloc]; + if (e->opcode != TID_OP(WRITE_REQ)) + goto next_req; + req = ack_to_tid_req(e); + trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn, + e->lpsn, req); + /* Finished allocating for all segments of this request */ + if (req->alloc_seg >= req->total_segs) + goto next_req; + + /* Can allocate only a maximum of local->max_write for a QP */ + if (qpriv->alloc_w_segs >= local->max_write) + break; + + /* Don't allocate at a sync point with data packets pending */ + if (qpriv->sync_pt && qpriv->alloc_w_segs) + break; + + /* All data received at the sync point, continue */ + if (qpriv->sync_pt && !qpriv->alloc_w_segs) { + hfi1_kern_clear_hw_flow(rcd, qp); + qpriv->sync_pt = false; + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + } + + /* Allocate flow if we don't have one */ + if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) { + ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp); + if (ret) { + to_seg = hfi1_compute_tid_rdma_flow_wt(qp) * + position_in_queue(qpriv, + &rcd->flow_queue); + break; + } + } + + npkts = rvt_div_round_up_mtu(qp, req->seg_len); + + /* + * We are at a sync point if we run out of KDETH PSN space. + * Last PSN of every generation is reserved for RESYNC. + */ + if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) { + qpriv->sync_pt = true; + break; + } + + /* + * If overtaking req->acked_tail, send an RNR NAK. Because the + * QP is not queued in this case, and the issue can only be + * caused by a delay in scheduling the second leg which we + * cannot estimate, we use a rather arbitrary RNR timeout of + * (MAX_FLOWS / 2) segments + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, + MAX_FLOWS)) { + ret = -EAGAIN; + to_seg = MAX_FLOWS >> 1; + tid_rdma_trigger_ack(qp); + break; + } + + /* Try to allocate rcv array / TID entries */ + ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last); + if (ret == -EAGAIN) + to_seg = position_in_queue(qpriv, &rcd->rarr_queue); + if (ret) + break; + + qpriv->alloc_w_segs++; + req->alloc_seg++; + continue; +next_req: + /* Begin processing the next request */ + if (++qpriv->r_tid_alloc > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qpriv->r_tid_alloc = 0; + } + + /* + * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation + * has failed (b) we are called from the rcv handler interrupt context + * (c) an RNR NAK has not already been scheduled + */ + if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state) + goto send_rnr_nak; + + return; + +send_rnr_nak: + lockdep_assert_held(&qp->r_lock); + + /* Set r_nak_state to prevent unrelated events from generating NAK's */ + qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK; + + /* Pull back r_psn to the segment being RNR NAK'd */ + qp->r_psn = e->psn + req->alloc_seg; + qp->r_ack_psn = qp->r_psn; + /* + * Pull back r_head_ack_queue to the ack entry following the request + * being RNR NAK'd. This allows resources to be allocated to the request + * if the queued QP is scheduled. + */ + qp->r_head_ack_queue = qpriv->r_tid_alloc + 1; + if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + qp->r_head_ack_queue = 0; + qpriv->r_tid_head = qp->r_head_ack_queue; + /* + * These send side fields are used in make_rc_ack(). They are set in + * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock + * for consistency + */ + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + /* + * Clear the ACK PENDING flag to prevent unwanted ACK because we + * have modified qp->s_ack_psn here. + */ + qp->s_flags &= ~(RVT_S_ACK_PENDING); + + trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn); + /* + * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK + * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be + * used for this because qp->s_lock is dropped before calling + * hfi1_send_rc_ack() leading to inconsistency between the receive + * interrupt handlers and the send thread in make_rc_ack() + */ + qpriv->rnr_nak_state = TID_RNR_NAK_SEND; + + /* + * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive + * interrupt handlers but will be sent from the send engine behind any + * previous responses that may have been scheduled + */ + rc_defered_ack(rcd, qp); +} + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST + * (see hfi1_rc_rcv()) + * - Don't allow 0-length requests. + * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Prepare struct tid_rdma_flow array? + * 3. Set the qp->s_ack_state as state diagram in design doc. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + u32 bth0, psn, len, rkey, num_segs; + bool fecn; + u8 next; + u64 vaddr; + int diff; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_write_req(qp, psn); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.w_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + + num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len); + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn); + return; + } + + /* + * The resent request which was previously RNR NAK'd is inserted at the + * location of the original request, which is one entry behind + * r_head_ack_queue + */ + if (qpriv->rnr_nak_state) + qp->r_head_ack_queue = qp->r_head_ack_queue ? + qp->r_head_ack_queue - 1 : + rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + + /* We've verified the request, insert it into the ack queue. */ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_acked_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlock; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + req = ack_to_tid_req(e); + + /* Bring previously RNR NAK'd request back to life */ + if (qpriv->rnr_nak_state) { + qp->r_nak_state = 0; + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + req->state = TID_REQUEST_INIT; + goto update_head; + } + + release_rdma_sge_mr(e); + + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK) + goto nack_inv_unlock; + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (e->opcode == TID_OP(WRITE_REQ) && + (req->setup_head != req->clear_tail || + req->clear_tail != req->acked_tail)) + goto nack_inv_unlock; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE))) + goto nack_acc; + + qp->r_psn += num_segs - 1; + + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = qp->r_psn; + e->sent = 0; + + req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write); + req->state = TID_REQUEST_INIT; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->alloc_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = num_segs; + req->r_flow_psn = e->psn; + req->ss.sge = e->rdma_sge; + req->ss.num_sge = 1; + + req->flow_idx = req->setup_head; + req->clear_tail = req->setup_head; + req->acked_tail = req->setup_head; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + + trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); + + if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) { + qpriv->r_tid_tail = qp->r_head_ack_queue; + } else if (qpriv->r_tid_tail == qpriv->r_tid_head) { + struct tid_rdma_request *ptr; + + e = &qp->s_ack_queue[qpriv->r_tid_tail]; + ptr = ack_to_tid_req(e); + + if (e->opcode != TID_OP(WRITE_REQ) || + ptr->comp_seg == ptr->total_segs) { + if (qpriv->r_tid_tail == qpriv->r_tid_ack) + qpriv->r_tid_ack = qp->r_head_ack_queue; + qpriv->r_tid_tail = qp->r_head_ack_queue; + } + } +update_head: + qp->r_head_ack_queue = next; + qpriv->r_tid_head = qp->r_head_ack_queue; + + hfi1_tid_write_alloc_resources(qp, true); + trace_hfi1_tid_write_rsp_rcv_req(qp); + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + if (fecn) + qp->s_flags |= RVT_S_ECN; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +} + +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss) +{ + struct hfi1_ack_priv *epriv = e->priv; + struct tid_rdma_request *req = &epriv->tid_req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_flow *flow = NULL; + u32 resp_len = 0, hdwords = 0; + void *resp_addr = NULL; + struct tid_rdma_params *remote; + + trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_build_resp(qp); + trace_hfi1_rsp_build_tid_write_resp(qp, bth2); + flow = &req->flows[req->flow_idx]; + switch (req->state) { + default: + /* + * Try to allocate resources here in case QP was queued and was + * later scheduled when resources became available + */ + hfi1_tid_write_alloc_resources(qp, false); + + /* We've already sent everything which is ready */ + if (req->cur_seg >= req->alloc_seg) + goto done; + + /* + * Resources can be assigned but responses cannot be sent in + * rnr_nak state, till the resent request is received + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT) + goto done; + + req->state = TID_REQUEST_ACTIVE; + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); + req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + hfi1_add_tid_reap_timer(qp); + break; + + case TID_REQUEST_RESEND_ACTIVE: + case TID_REQUEST_RESEND: + trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow); + req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS); + if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS)) + req->state = TID_REQUEST_ACTIVE; + + hfi1_mod_tid_reap_timer(qp); + break; + } + flow->flow_state.resp_ib_psn = bth2; + resp_addr = (void *)flow->tid_entry; + resp_len = sizeof(*flow->tid_entry) * flow->tidcnt; + req->cur_seg++; + + memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp)); + epriv->ss.sge.vaddr = resp_addr; + epriv->ss.sge.sge_length = resp_len; + epriv->ss.sge.length = epriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. + */ + epriv->ss.sge.mr = NULL; + epriv->ss.sge.m = 0; + epriv->ss.sge.n = 0; + + epriv->ss.sg_list = NULL; + epriv->ss.total_len = epriv->ss.sge.sge_length; + epriv->ss.num_sge = 1; + + *ss = &epriv->ss; + *len = epriv->ss.total_len; + + /* Construct the TID RDMA WRITE RESP packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp); + ohdr->u.tid_rdma.w_rsp.tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + (flow->flow_state.spsn & + HFI1_KDETH_BTH_SEQ_MASK)); + ohdr->u.tid_rdma.w_rsp.tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32); + qpriv->pending_tid_w_segs++; +done: + return hdwords; +} + +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) { + qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; + qpriv->s_tid_timer.expires = jiffies + + qpriv->tid_timer_timeout_jiffies; + add_timer(&qpriv->s_tid_timer); + } +} + +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + qpriv->s_flags |= HFI1_R_TID_RSC_TIMER; + mod_timer(&qpriv->s_tid_timer, jiffies + + qpriv->tid_timer_timeout_jiffies); +} + +static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + int rval = 0; + + lockdep_assert_held(&qp->s_lock); + if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { + rval = del_timer(&qpriv->s_tid_timer); + qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; + } + return rval; +} + +void hfi1_del_tid_reap_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + del_timer_sync(&qpriv->s_tid_timer); + qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER; +} + +static void hfi1_tid_timeout(struct timer_list *t) +{ + struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer); + struct rvt_qp *qp = qpriv->owner; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + unsigned long flags; + u32 i; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) { + dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n", + qp->ibqp.qp_num, __func__, __LINE__); + trace_hfi1_msg_tid_timeout(/* msg */ + qp, "resource timeout = ", + (u64)qpriv->tid_timer_timeout_jiffies); + hfi1_stop_tid_reap_timer(qp); + /* + * Go though the entire ack queue and clear any outstanding + * HW flow and RcvArray resources. + */ + hfi1_kern_clear_hw_flow(qpriv->rcd, qp); + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct tid_rdma_request *req = + ack_to_tid_req(&qp->s_ack_queue[i]); + + hfi1_kern_exp_rcv_clear_all(req); + } + spin_unlock(&qp->s_lock); + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_FATAL; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR); + goto unlock_r_lock; + } + spin_unlock(&qp->s_lock); +unlock_r_lock: + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ + + /* + * 1. Find matching SWQE + * 2. Check that TIDENTRY array has enough space for a complete + * segment. If not, put QP in error state. + * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow + * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags. + * 5. Set qp->s_state + * 6. Kick the send engine (hfi1_schedule_send()) + */ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + enum ib_wc_status status; + u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen; + bool fecn; + unsigned long flags; + + fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Ignore invalid responses */ + if (cmp_psn(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0)) + goto ack_done; + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + + /* + * If we are waiting for a particular packet sequence number + * due to a request being resent, check for it. Otherwise, + * ensure that we haven't missed anything. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)) + goto ack_op_err; + + req = wqe_to_tid_req(wqe); + /* + * If we've lost ACKs and our acked_tail pointer is too far + * behind, don't overwrite segments. Just drop the packet and + * let the reliability protocol take care of it. + */ + if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS)) + goto ack_done; + + /* + * The call to do_rc_ack() should be last in the chain of + * packet checks because it will end up updating the QP state. + * Therefore, anything that would prevent the packet from + * being accepted as a successful response should be prior + * to it. + */ + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + + trace_hfi1_ack(qp, psn); + + flow = &req->flows[req->setup_head]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + flow->sent = 0; + flow->resync_npkts = 0; + flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp); + flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & + TID_RDMA_DESTQP_FLOW_MASK; + flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn)); + flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; + flow->flow_state.resp_ib_psn = psn; + flow->length = min_t(u32, req->seg_len, + (wqe->length - (req->comp_seg * req->seg_len))); + + flow->npkts = rvt_div_round_up_mtu(qp, flow->length); + flow->flow_state.lpsn = flow->flow_state.spsn + + flow->npkts - 1; + /* payload length = packet length - (header length + ICRC length) */ + pktlen = packet->tlen - (packet->hlen + 4); + if (pktlen > sizeof(flow->tid_entry)) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + memcpy(flow->tid_entry, packet->ebuf, pktlen); + flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow); + + req->comp_seg++; + trace_hfi1_tid_write_sender_rcv_resp(qp, 0); + /* + * Walk the TID_ENTRY list to make sure we have enough space for a + * complete segment. + */ + for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_write_resp(/* entry */ + qp, i, flow->tid_entry[i]); + if (!EXP_TID_GET(flow->tid_entry[i], LEN)) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + tidlen += EXP_TID_GET(flow->tid_entry[i], LEN); + } + if (tidlen * PAGE_SIZE < flow->length) { + status = IB_WC_LOC_LEN_ERR; + goto ack_err; + } + + trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + /* + * If this is the first response for this request, set the initial + * flow index to the current flow. + */ + if (!cmp_psn(psn, wqe->psn)) { + req->r_last_acked = mask_psn(wqe->psn - 1); + /* Set acked flow index to head index */ + req->acked_tail = req->setup_head; + } + + /* advance circular buffer head */ + req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS); + req->state = TID_REQUEST_ACTIVE; + + /* + * If all responses for this TID RDMA WRITE request have been received + * advance the pointer to the next one. + * Since TID RDMA requests could be mixed in with regular IB requests, + * they might not appear sequentially in the queue. Therefore, the + * next request needs to be "found". + */ + if (qpriv->s_tid_cur != qpriv->s_tid_head && + req->comp_seg == req->total_segs) { + for (i = qpriv->s_tid_cur + 1; ; i++) { + if (i == qp->s_size) + i = 0; + wqe = rvt_get_swqe_ptr(qp, i); + if (i == qpriv->s_tid_head) + break; + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + } + qpriv->s_tid_cur = i; + } + qp->s_flags &= ~HFI1_S_WAIT_TID_RESP; + hfi1_schedule_tid_send(qp); + goto ack_done; + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; +ack_err: + rvt_error_qp(qp, status); +ack_done: + if (fecn) + qp->s_flags |= RVT_S_ECN; + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct tid_rdma_params *remote; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + u32 tidentry = flow->tid_entry[flow->tid_idx]; + u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; + struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data; + u32 next_offset, om = KDETH_OM_LARGE; + bool last_pkt; + + if (!tidlen) { + hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR); + rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR); + } + + *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); + flow->sent += *len; + next_offset = flow->tid_offset + *len; + last_pkt = (flow->tid_idx == (flow->tidcnt - 1) && + next_offset >= tidlen) || (flow->sent >= flow->length); + trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow); + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(wd->kdeth0, KVER, 0x1); + KDETH_SET(wd->kdeth0, SH, !last_pkt); + KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg)); + KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); + KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); + KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE); + KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om); + KDETH_RESET(wd->kdeth1, JKEY, remote->jkey); + wd->verbs_qp = cpu_to_be32(qp->remote_qpn); + rcu_read_unlock(); + + *bth1 = flow->tid_qpn; + *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & + HFI1_KDETH_BTH_SEQ_MASK) | + (flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT)); + if (last_pkt) { + /* PSNs are zero-based, so +1 to count number of packets */ + if (flow->flow_state.lpsn + 1 + + rvt_div_round_up_mtu(qp, req->seg_len) > + MAX_TID_FLOW_PSN) + req->state = TID_REQUEST_SYNC; + *bth2 |= IB_BTH_REQ_ACK; + } + + if (next_offset >= tidlen) { + flow->tid_offset = 0; + flow->tid_idx++; + } else { + flow->tid_offset = next_offset; + } + return last_pkt; +} + +void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ctxtdata *rcd = priv->rcd; + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + u32 psn, next; + u8 opcode; + bool fecn; + + fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + /* + * All error handling should be done by now. If we are here, the packet + * is either good or been accepted by the error handler. + */ + spin_lock_irqsave(&qp->s_lock, flags); + e = &qp->s_ack_queue[priv->r_tid_tail]; + req = ack_to_tid_req(e); + flow = &req->flows[req->clear_tail]; + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) { + update_r_next_psn_fecn(packet, priv, rcd, flow, fecn); + + if (cmp_psn(psn, flow->flow_state.r_next_psn)) + goto send_nak; + + flow->flow_state.r_next_psn = mask_psn(psn + 1); + /* + * Copy the payload to destination buffer if this packet is + * delivered as an eager packet due to RSM rule and FECN. + * The RSM rule selects FECN bit in BTH and SH bit in + * KDETH header and therefore will not match the last + * packet of each segment that has SH bit cleared. + */ + if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) { + struct rvt_sge_state ss; + u32 len; + u32 tlen = packet->tlen; + u16 hdrsize = packet->hlen; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + + (SIZE_OF_CRC << 2); + u32 pmtu = qp->pmtu; + + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto send_nak; + len = req->comp_seg * req->seg_len; + len += delta_psn(psn, + full_flow_psn(flow, flow->flow_state.spsn)) * + pmtu; + if (unlikely(req->total_len - len < pmtu)) + goto send_nak; + + /* + * The e->rdma_sge field is set when TID RDMA WRITE REQ + * is first received and is never modified thereafter. + */ + ss.sge = e->rdma_sge; + ss.sg_list = NULL; + ss.num_sge = 1; + ss.total_len = req->total_len; + rvt_skip_sge(&ss, len, false); + rvt_copy_sge(qp, &ss, packet->payload, pmtu, false, + false); + /* Raise the sw sequence check flag for next packet */ + priv->r_next_psn_kdeth = mask_psn(psn + 1); + priv->s_flags |= HFI1_R_TID_SW_PSN; + } + goto exit; + } + flow->flow_state.r_next_psn = mask_psn(psn + 1); + hfi1_kern_exp_rcv_clear(req); + priv->alloc_w_segs--; + rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK; + req->comp_seg++; + priv->s_nak_state = 0; + + /* + * Release the flow if one of the following conditions has been met: + * - The request has reached a sync point AND all outstanding + * segments have been completed, or + * - The entire request is complete and there are no more requests + * (of any kind) in the queue. + */ + trace_hfi1_rsp_rcv_tid_write_data(qp, psn); + trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn, + req); + trace_hfi1_tid_write_rsp_rcv_data(qp); + validate_r_tid_ack(priv); + + if (opcode == TID_OP(WRITE_DATA_LAST)) { + release_rdma_sge_mr(e); + for (next = priv->r_tid_tail + 1; ; next++) { + if (next > rvt_size_atomic(&dev->rdi)) + next = 0; + if (next == priv->r_tid_head) + break; + e = &qp->s_ack_queue[next]; + if (e->opcode == TID_OP(WRITE_REQ)) + break; + } + priv->r_tid_tail = next; + if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi)) + qp->s_acked_ack_queue = 0; + } + + hfi1_tid_write_alloc_resources(qp, true); + + /* + * If we need to generate more responses, schedule the + * send engine. + */ + if (req->cur_seg < req->total_segs || + qp->s_tail_ack_queue != qp->r_head_ack_queue) { + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + } + + priv->pending_tid_w_segs--; + if (priv->s_flags & HFI1_R_TID_RSC_TIMER) { + if (priv->pending_tid_w_segs) + hfi1_mod_tid_reap_timer(req->qp); + else + hfi1_stop_tid_reap_timer(req->qp); + } + +done: + tid_rdma_schedule_ack(qp); +exit: + priv->r_next_psn_kdeth = flow->flow_state.r_next_psn; + if (fecn) + qp->s_flags |= RVT_S_ECN; + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + +send_nak: + if (!priv->s_nak_state) { + priv->s_nak_state = IB_NAK_PSN_ERROR; + priv->s_nak_psn = flow->flow_state.r_next_psn; + tid_rdma_trigger_ack(qp); + } + goto done; +} + +static bool hfi1_tid_rdma_is_resync_psn(u32 psn) +{ + return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) == + HFI1_KDETH_BTH_SEQ_MASK); +} + +u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u16 iflow, + u32 *bth1, u32 *bth2) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct tid_rdma_request *req = ack_to_tid_req(e); + struct tid_rdma_flow *flow = &req->flows[iflow]; + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + + if (qpriv->resync) { + *bth2 = mask_psn((fs->generation << + HFI1_KDETH_BTH_SEQ_SHIFT) - 1); + ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); + } else if (qpriv->s_nak_state) { + *bth2 = mask_psn(qpriv->s_nak_psn); + ohdr->u.tid_rdma.ack.aeth = + cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qpriv->s_nak_state << + IB_AETH_CREDIT_SHIFT)); + } else { + *bth2 = full_flow_psn(flow, flow->flow_state.lpsn); + ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp); + } + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); + ohdr->u.tid_rdma.ack.tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + + ohdr->u.tid_rdma.ack.tid_flow_psn = 0; + ohdr->u.tid_rdma.ack.verbs_psn = + cpu_to_be32(flow->flow_state.resp_ib_psn); + + if (qpriv->resync) { + /* + * If the PSN before the current expect KDETH PSN is the + * RESYNC PSN, then we never received a good TID RDMA WRITE + * DATA packet after a previous RESYNC. + * In this case, the next expected KDETH PSN stays the same. + */ + if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) { + ohdr->u.tid_rdma.ack.tid_flow_psn = + cpu_to_be32(qpriv->r_next_psn_kdeth_save); + } else { + /* + * Because the KDETH PSNs jump during a RESYNC, it's + * not possible to infer (or compute) the previous value + * of r_next_psn_kdeth in the case of back-to-back + * RESYNC packets. Therefore, we save it. + */ + qpriv->r_next_psn_kdeth_save = + qpriv->r_next_psn_kdeth - 1; + ohdr->u.tid_rdma.ack.tid_flow_psn = + cpu_to_be32(qpriv->r_next_psn_kdeth_save); + qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1); + } + qpriv->resync = false; + } + + return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32); +} + +void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn; + unsigned long flags; + u16 fidx; + + trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0); + process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth); + req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn)); + resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn)); + + spin_lock_irqsave(&qp->s_lock, flags); + trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn); + + /* If we are waiting for an ACK to RESYNC, drop any other packets */ + if ((qp->s_flags & HFI1_S_WAIT_HALT) && + cmp_psn(psn, qpriv->s_resync_psn)) + goto ack_op_err; + + ack_psn = req_psn; + if (hfi1_tid_rdma_is_resync_psn(psn)) + ack_kpsn = resync_psn; + else + ack_kpsn = psn; + if (aeth >> 29) { + ack_psn--; + ack_kpsn--; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_op_err; + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + goto ack_op_err; + + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); + + /* Drop stale ACK/NAK */ + if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 || + cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0) + goto ack_op_err; + + while (cmp_psn(ack_kpsn, + full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 && + req->ack_seg < req->cur_seg) { + req->ack_seg++; + /* advance acked segment pointer */ + req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS); + req->r_last_acked = flow->flow_state.resp_ib_psn; + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + if (req->ack_seg == req->total_segs) { + req->state = TID_REQUEST_COMPLETE; + wqe = do_rc_completion(qp, wqe, + to_iport(qp->ibqp.device, + qp->port_num)); + trace_hfi1_sender_rcv_tid_ack(qp); + atomic_dec(&qpriv->n_tid_requests); + if (qp->s_acked == qp->s_tail) + break; + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + break; + req = wqe_to_tid_req(wqe); + } + flow = &req->flows[req->acked_tail]; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow); + } + + trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + switch (aeth >> 29) { + case 0: /* ACK */ + if (qpriv->s_flags & RVT_S_WAIT_ACK) + qpriv->s_flags &= ~RVT_S_WAIT_ACK; + if (!hfi1_tid_rdma_is_resync_psn(psn)) { + /* Check if there is any pending TID ACK */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_mod_tid_retry_timer(qp); + else + hfi1_stop_tid_retry_timer(qp); + hfi1_schedule_send(qp); + } else { + u32 spsn, fpsn, last_acked, generation; + struct tid_rdma_request *rptr; + + /* ACK(RESYNC) */ + hfi1_stop_tid_retry_timer(qp); + /* Allow new requests (see hfi1_make_tid_rdma_pkt) */ + qp->s_flags &= ~HFI1_S_WAIT_HALT; + /* + * Clear RVT_S_SEND_ONE flag in case that the TID RDMA + * ACK is received after the TID retry timer is fired + * again. In this case, do not send any more TID + * RESYNC request or wait for any more TID ACK packet. + */ + qpriv->s_flags &= ~RVT_S_SEND_ONE; + hfi1_schedule_send(qp); + + if ((qp->s_acked == qpriv->s_tid_tail && + req->ack_seg == req->total_segs) || + qp->s_acked == qp->s_tail) { + qpriv->s_state = TID_OP(WRITE_DATA_LAST); + goto done; + } + + if (req->ack_seg == req->comp_seg) { + qpriv->s_state = TID_OP(WRITE_DATA); + goto done; + } + + /* + * The PSN to start with is the next PSN after the + * RESYNC PSN. + */ + psn = mask_psn(psn + 1); + generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + spsn = 0; + + /* + * Update to the correct WQE when we get an ACK(RESYNC) + * in the middle of a request. + */ + if (delta_psn(ack_psn, wqe->lpsn)) + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + flow = &req->flows[req->acked_tail]; + /* + * RESYNC re-numbers the PSN ranges of all remaining + * segments. Also, PSN's start from 0 in the middle of a + * segment and the first segment size is less than the + * default number of packets. flow->resync_npkts is used + * to track the number of packets from the start of the + * real segment to the point of 0 PSN after the RESYNC + * in order to later correctly rewind the SGE. + */ + fpsn = full_flow_psn(flow, flow->flow_state.spsn); + req->r_ack_psn = psn; + /* + * If resync_psn points to the last flow PSN for a + * segment and the new segment (likely from a new + * request) starts with a new generation number, we + * need to adjust resync_psn accordingly. + */ + if (flow->flow_state.generation != + (resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT)) + resync_psn = mask_psn(fpsn - 1); + flow->resync_npkts += + delta_psn(mask_psn(resync_psn + 1), fpsn); + /* + * Renumber all packet sequence number ranges + * based on the new generation. + */ + last_acked = qp->s_acked; + rptr = req; + while (1) { + /* start from last acked segment */ + for (fidx = rptr->acked_tail; + CIRC_CNT(rptr->setup_head, fidx, + MAX_FLOWS); + fidx = CIRC_NEXT(fidx, MAX_FLOWS)) { + u32 lpsn; + u32 gen; + + flow = &rptr->flows[fidx]; + gen = flow->flow_state.generation; + if (WARN_ON(gen == generation && + flow->flow_state.spsn != + spsn)) + continue; + lpsn = flow->flow_state.lpsn; + lpsn = full_flow_psn(flow, lpsn); + flow->npkts = + delta_psn(lpsn, + mask_psn(resync_psn) + ); + flow->flow_state.generation = + generation; + flow->flow_state.spsn = spsn; + flow->flow_state.lpsn = + flow->flow_state.spsn + + flow->npkts - 1; + flow->pkt = 0; + spsn += flow->npkts; + resync_psn += flow->npkts; + trace_hfi1_tid_flow_rcv_tid_ack(qp, + fidx, + flow); + } + if (++last_acked == qpriv->s_tid_cur + 1) + break; + if (last_acked == qp->s_size) + last_acked = 0; + wqe = rvt_get_swqe_ptr(qp, last_acked); + rptr = wqe_to_tid_req(wqe); + } + req->cur_seg = req->ack_seg; + qpriv->s_tid_tail = qp->s_acked; + qpriv->s_state = TID_OP(WRITE_REQ); + hfi1_schedule_tid_send(qp); + } +done: + qpriv->s_retry = qp->s_retry_cnt; + break; + + case 3: /* NAK */ + hfi1_stop_tid_retry_timer(qp); + switch ((aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + if (!req->flows) + break; + flow = &req->flows[req->acked_tail]; + flpsn = full_flow_psn(flow, flow->flow_state.lpsn); + if (cmp_psn(psn, flpsn) > 0) + break; + trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, + flow); + req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + req->cur_seg = req->ack_seg; + qpriv->s_tid_tail = qp->s_acked; + qpriv->s_state = TID_OP(WRITE_REQ); + qpriv->s_retry = qp->s_retry_cnt; + hfi1_schedule_tid_send(qp); + break; + + default: + break; + } + break; + + default: + break; + } + +ack_op_err: + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +void hfi1_add_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) { + priv->s_flags |= HFI1_S_TID_RETRY_TIMER; + priv->s_tid_retry_timer.expires = jiffies + + priv->tid_retry_timeout_jiffies + rdi->busy_jiffies; + add_timer(&priv->s_tid_retry_timer); + } +} + +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + priv->s_flags |= HFI1_S_TID_RETRY_TIMER; + mod_timer(&priv->s_tid_retry_timer, jiffies + + priv->tid_retry_timeout_jiffies + rdi->busy_jiffies); +} + +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + int rval = 0; + + lockdep_assert_held(&qp->s_lock); + if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { + rval = del_timer(&priv->s_tid_retry_timer); + priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; + } + return rval; +} + +void hfi1_del_tid_retry_timer(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + del_timer_sync(&priv->s_tid_retry_timer); + priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER; +} + +static void hfi1_tid_retry_timeout(struct timer_list *t) +{ + struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer); + struct rvt_qp *qp = priv->owner; + struct rvt_swqe *wqe; + unsigned long flags; + struct tid_rdma_request *req; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + trace_hfi1_tid_write_sender_retry_timeout(qp, 0); + if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) { + hfi1_stop_tid_retry_timer(qp); + if (!priv->s_retry) { + trace_hfi1_msg_tid_retry_timeout(/* msg */ + qp, + "Exhausted retries. Tid retry timeout = ", + (u64)priv->tid_retry_timeout_jiffies); + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } else { + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_tid_retry_timeout(/* req */ + qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req); + + priv->s_flags &= ~RVT_S_WAIT_ACK; + /* Only send one packet (the RESYNC) */ + priv->s_flags |= RVT_S_SEND_ONE; + /* + * No additional request shall be made by this QP until + * the RESYNC has been complete. + */ + qp->s_flags |= HFI1_S_WAIT_HALT; + priv->s_state = TID_OP(RESYNC); + priv->s_retry--; + hfi1_schedule_tid_send(qp); + } + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u16 fidx) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_params *remote; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[fidx]; + u32 generation; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 = remote->qp; + rcu_read_unlock(); + + generation = kern_flow_generation_next(flow->flow_state.generation); + *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1); + qpriv->s_resync_psn = *bth2; + *bth2 |= IB_BTH_REQ_ACK; + KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1); + + return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32); +} + +void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ctxtdata *rcd = qpriv->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + struct tid_flow_state *fs = &qpriv->flow_state; + u32 psn, generation, idx, gen_next; + bool fecn; + unsigned long flags; + + fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + + generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT; + spin_lock_irqsave(&qp->s_lock, flags); + + gen_next = (fs->generation == KERN_GENERATION_RESERVED) ? + generation : kern_flow_generation_next(fs->generation); + /* + * RESYNC packet contains the "next" generation and can only be + * from the current or previous generations + */ + if (generation != mask_generation(gen_next - 1) && + generation != gen_next) + goto bail; + /* Already processing a resync */ + if (qpriv->resync) + goto bail; + + spin_lock(&rcd->exp_lock); + if (fs->index >= RXE_NUM_TID_FLOWS) { + /* + * If we don't have a flow, save the generation so it can be + * applied when a new flow is allocated + */ + fs->generation = generation; + } else { + /* Reprogram the QP flow with new generation */ + rcd->flows[fs->index].generation = generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + } + fs->psn = 0; + /* + * Disable SW PSN checking since a RESYNC is equivalent to a + * sync point and the flow has/will be reprogrammed + */ + qpriv->s_flags &= ~HFI1_R_TID_SW_PSN; + trace_hfi1_tid_write_rsp_rcv_resync(qp); + + /* + * Reset all TID flow information with the new generation. + * This is done for all requests and segments after the + * last received segment + */ + for (idx = qpriv->r_tid_tail; ; idx++) { + u16 flow_idx; + + if (idx > rvt_size_atomic(&dev->rdi)) + idx = 0; + e = &qp->s_ack_queue[idx]; + if (e->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn, + e->lpsn, req); + + /* start from last unacked segment */ + for (flow_idx = req->clear_tail; + CIRC_CNT(req->setup_head, flow_idx, + MAX_FLOWS); + flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) { + u32 lpsn; + u32 next; + + flow = &req->flows[flow_idx]; + lpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + next = flow->flow_state.r_next_psn; + flow->npkts = delta_psn(lpsn, next - 1); + flow->flow_state.generation = fs->generation; + flow->flow_state.spsn = fs->psn; + flow->flow_state.lpsn = + flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, + flow->flow_state.spsn); + fs->psn += flow->npkts; + trace_hfi1_tid_flow_rcv_resync(qp, flow_idx, + flow); + } + } + if (idx == qp->s_tail_ack_queue) + break; + } + + spin_unlock(&rcd->exp_lock); + qpriv->resync = true; + /* RESYNC request always gets a TID RDMA ACK. */ + qpriv->s_nak_state = 0; + tid_rdma_trigger_ack(qp); +bail: + if (fecn) + qp->s_flags |= RVT_S_ECN; + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/* + * Call this function when the last TID RDMA WRITE DATA packet for a request + * is built. + */ +static void update_tid_tail(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + u32 i; + struct rvt_swqe *wqe; + + lockdep_assert_held(&qp->s_lock); + /* Can't move beyond s_tid_cur */ + if (priv->s_tid_tail == priv->s_tid_cur) + return; + for (i = priv->s_tid_tail + 1; ; i++) { + if (i == qp->s_size) + i = 0; + + if (i == priv->s_tid_cur) + break; + wqe = rvt_get_swqe_ptr(qp, i); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + } + priv->s_tid_tail = i; + priv->s_state = TID_OP(WRITE_RESP); +} + +int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct rvt_swqe *wqe; + u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0; + struct ib_other_headers *ohdr; + struct rvt_sge_state *ss = &qp->s_sge; + struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + struct tid_rdma_request *req = ack_to_tid_req(e); + bool last = false; + u8 opcode = TID_OP(WRITE_DATA); + + lockdep_assert_held(&qp->s_lock); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); + /* + * Prioritize the sending of the requests and responses over the + * sending of the TID RDMA data packets. + */ + if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) && + atomic_read(&priv->n_requests) && + !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK | + HFI1_S_ANY_WAIT_IO))) || + (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg && + !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) { + struct iowait_work *iowork; + + iowork = iowait_get_ib_work(&priv->s_iowait); + ps->s_txreq = get_waiting_verbs_txreq(iowork); + if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) { + priv->s_flags |= HFI1_S_TID_BUSY_SET; + return 1; + } + } + + ps->s_txreq = get_txreq(ps->dev, qp); + if (!ps->s_txreq) + goto bail_no_tx; + + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + + if ((priv->s_flags & RVT_S_ACK_PENDING) && + make_tid_rdma_ack(qp, ohdr, ps)) + return 1; + + /* + * Bail out if we can't send data. + * Be reminded that this check must been done after the call to + * make_tid_rdma_ack() because the responding QP could be in + * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. + */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) + goto bail; + + if (priv->s_flags & RVT_S_WAIT_ACK) + goto bail; + + /* Check whether there is anything to do. */ + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) + goto bail; + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + switch (priv->s_state) { + case TID_OP(WRITE_REQ): + case TID_OP(WRITE_RESP): + priv->tid_ss.sge = wqe->sg_list[0]; + priv->tid_ss.sg_list = wqe->sg_list + 1; + priv->tid_ss.num_sge = wqe->wr.num_sge; + priv->tid_ss.total_len = wqe->length; + + if (priv->s_state == TID_OP(WRITE_REQ)) + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + priv->s_state = TID_OP(WRITE_DATA); + fallthrough; + + case TID_OP(WRITE_DATA): + /* + * 1. Check whether TID RDMA WRITE RESP available. + * 2. If no: + * 2.1 If have more segments and no TID RDMA WRITE RESP, + * set HFI1_S_WAIT_TID_RESP + * 2.2 Return indicating no progress made. + * 3. If yes: + * 3.1 Build TID RDMA WRITE DATA packet. + * 3.2 If last packet in segment: + * 3.2.1 Change KDETH header bits + * 3.2.2 Advance RESP pointers. + * 3.3 Return indicating progress made. + */ + trace_hfi1_sender_make_tid_pkt(qp); + trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0); + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail); + req = wqe_to_tid_req(wqe); + len = wqe->length; + + if (!req->comp_seg || req->cur_seg == req->comp_seg) + goto bail; + + trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2, + &len); + + if (last) { + /* move pointer to next flow */ + req->clear_tail = CIRC_NEXT(req->clear_tail, + MAX_FLOWS); + if (++req->cur_seg < req->total_segs) { + if (!CIRC_CNT(req->setup_head, req->clear_tail, + MAX_FLOWS)) + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + } else { + priv->s_state = TID_OP(WRITE_DATA_LAST); + opcode = TID_OP(WRITE_DATA_LAST); + + /* Advance the s_tid_tail now */ + update_tid_tail(qp); + } + } + hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32); + ss = &priv->tid_ss; + break; + + case TID_OP(RESYNC): + trace_hfi1_sender_make_tid_pkt(qp); + /* Use generation from the most recently received response */ + wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + req = wqe_to_tid_req(wqe); + /* If no responses for this WQE look at the previous one */ + if (!req->comp_seg) { + wqe = rvt_get_swqe_ptr(qp, + (!priv->s_tid_cur ? qp->s_size : + priv->s_tid_cur) - 1); + req = wqe_to_tid_req(wqe); + } + hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1, + &bth2, + CIRC_PREV(req->setup_head, + MAX_FLOWS)); + ss = NULL; + len = 0; + opcode = TID_OP(RESYNC); + break; + + default: + goto bail; + } + if (priv->s_flags & RVT_S_SEND_ONE) { + priv->s_flags &= ~RVT_S_SEND_ONE; + priv->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = ss; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, + middle, ps); + return 1; +bail: + hfi1_put_txreq(ps->s_txreq); +bail_no_tx: + ps->s_txreq = NULL; + priv->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again, set the flags to the wake up which work item to wake + * up. + * (A better algorithm should be found to do this and generalize the + * sleep/wakeup flags.) + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + return 0; +} + +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps) +{ + struct rvt_ack_entry *e; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + u32 hwords, next; + u32 len = 0; + u32 bth1 = 0, bth2 = 0; + int middle = 0; + u16 flow; + struct tid_rdma_request *req, *nreq; + + trace_hfi1_tid_write_rsp_make_tid_ack(qp); + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + /* + * In the RESYNC case, we are exactly one segment past the + * previously sent ack or at the previously sent NAK. So to send + * the resync ack, we go back one segment (which might be part of + * the previous request) and let the do-while loop execute again. + * The advantage of executing the do-while loop is that any data + * received after the previous ack is automatically acked in the + * RESYNC ack. It turns out that for the do-while loop we only need + * to pull back qpriv->r_tid_ack, not the segment + * indices/counters. The scheme works even if the previous request + * was not a TID WRITE request. + */ + if (qpriv->resync) { + if (!req->ack_seg || req->ack_seg == req->total_segs) + qpriv->r_tid_ack = !qpriv->r_tid_ack ? + rvt_size_atomic(&dev->rdi) : + qpriv->r_tid_ack - 1; + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + } + + trace_hfi1_rsp_make_tid_ack(qp, e->psn); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); + /* + * If we've sent all the ACKs that we can, we are done + * until we get more segments... + */ + if (!qpriv->s_nak_state && !qpriv->resync && + req->ack_seg == req->comp_seg) + goto bail; + + do { + /* + * To deal with coalesced ACKs, the acked_tail pointer + * into the flow array is used. The distance between it + * and the clear_tail is the number of flows that are + * being ACK'ed. + */ + req->ack_seg += + /* Get up-to-date value */ + CIRC_CNT(req->clear_tail, req->acked_tail, + MAX_FLOWS); + /* Advance acked index */ + req->acked_tail = req->clear_tail; + + /* + * req->clear_tail points to the segment currently being + * received. So, when sending an ACK, the previous + * segment is being ACK'ed. + */ + flow = CIRC_PREV(req->acked_tail, MAX_FLOWS); + if (req->ack_seg != req->total_segs) + break; + req->state = TID_REQUEST_COMPLETE; + + next = qpriv->r_tid_ack + 1; + if (next > rvt_size_atomic(&dev->rdi)) + next = 0; + qpriv->r_tid_ack = next; + if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ)) + break; + nreq = ack_to_tid_req(&qp->s_ack_queue[next]); + if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg) + break; + + /* Move to the next ack entry now */ + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + } while (1); + + /* + * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and + * req could be pointing at the previous ack queue entry + */ + if (qpriv->s_nak_state || + (qpriv->resync && + !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) && + (cmp_psn(qpriv->r_next_psn_kdeth - 1, + full_flow_psn(&req->flows[flow], + req->flows[flow].flow_state.lpsn)) > 0))) { + /* + * A NAK will implicitly acknowledge all previous TID RDMA + * requests. Therefore, we NAK with the req->acked_tail + * segment for the request at qpriv->r_tid_ack (same at + * this point as the req->clear_tail segment for the + * qpriv->r_tid_tail request) + */ + e = &qp->s_ack_queue[qpriv->r_tid_ack]; + req = ack_to_tid_req(e); + flow = req->acked_tail; + } else if (req->ack_seg == req->total_segs && + qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; + + trace_hfi1_tid_write_rsp_make_tid_ack(qp); + trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn, + req); + hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1, + &bth2); + len = 0; + qpriv->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = qpriv->s_sde; + ps->s_txreq->s_cur_size = len; + ps->s_txreq->ss = NULL; + hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle, + ps); + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; + return 1; +bail: + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * RVT_S_RESP_PENDING + */ + smp_wmb(); + qpriv->s_flags &= ~RVT_S_ACK_PENDING; + return 0; +} + +static int hfi1_send_tid_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(priv->s_flags & RVT_S_BUSY || + qp->s_flags & HFI1_S_ANY_WAIT_IO) && + (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) || + (priv->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND)); +} + +void _hfi1_do_tid_send(struct work_struct *work) +{ + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); + + hfi1_do_tid_send(qp); +} + +static void hfi1_do_tid_send(struct rvt_qp *qp) +{ + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; + + ps.dev = to_idev(qp->ibqp.device); + ps.ibp = to_iport(qp->ibqp.device, qp->port_num); + ps.ppd = ppd_from_ibp(ps.ibp); + ps.wait = iowait_get_tid_work(&priv->s_iowait); + ps.in_thread = false; + ps.timeout_int = qp->timeout_jiffies / 8; + + trace_hfi1_rc_do_tid_send(qp, false); + spin_lock_irqsave(&qp->s_lock, ps.flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_tid_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + return; + } + + priv->s_flags |= RVT_S_BUSY; + + ps.timeout = jiffies + ps.timeout_int; + ps.cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + ps.pkts_sent = false; + + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); + do { + /* Check for a constructed packet to be sent. */ + if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + qp->s_flags |= RVT_S_BUSY; + ps.wait = iowait_get_ib_work(&priv->s_iowait); + } + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (hfi1_verbs_send(qp, &ps)) + return; + + /* allow other tasks to run */ + if (hfi1_schedule_send_yield(qp, &ps, true)) + return; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= ~HFI1_S_TID_BUSY_SET; + ps.wait = iowait_get_tid_work(&priv->s_iowait); + if (iowait_flag_set(&priv->s_iowait, + IOWAIT_PENDING_IB)) + hfi1_schedule_send(qp); + } + } + } while (hfi1_make_tid_rdma_pkt(qp, &ps)); + iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); +} + +static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = ppd->dd; + + if ((dd->flags & HFI1_SHUTDOWN)) + return true; + + return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); +} + +/** + * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine + * @qp: the QP + * + * This schedules qp progress on the TID RDMA state machine. Caller + * should hold the s_lock. + * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because + * the two state machines can step on each other with respect to the + * RVT_S_BUSY flag. + * Therefore, a modified test is used. + * @return true if the second leg is scheduled; + * false if the second leg is not scheduled. + */ +bool hfi1_schedule_tid_send(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + if (hfi1_send_tid_ok(qp)) { + /* + * The following call returns true if the qp is not on the + * queue and false if the qp is already on the queue before + * this call. Either way, the qp will be on the queue when the + * call returns. + */ + _hfi1_schedule_tid_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_TID); + return false; +} + +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e) +{ + struct rvt_ack_entry *prev; + struct tid_rdma_request *req; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + u32 s_prev; + + s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) : + (qp->s_tail_ack_queue - 1); + prev = &qp->s_ack_queue[s_prev]; + + if ((e->opcode == TID_OP(READ_REQ) || + e->opcode == OP(RDMA_READ_REQUEST)) && + prev->opcode == TID_OP(WRITE_REQ)) { + req = ack_to_tid_req(prev); + if (req->ack_seg != req->total_segs) { + priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK; + return true; + } + } + return false; +} + +static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx) +{ + u64 reg; + + /* + * The only sane way to get the amount of + * progress is to read the HW flow state. + */ + reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx)); + return mask_psn(reg); +} + +static void tid_rdma_rcv_err(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff, bool fecn) +{ + unsigned long flags; + + tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); + if (fecn) { + spin_lock_irqsave(&qp->s_lock, flags); + qp->s_flags |= RVT_S_ECN; + spin_unlock_irqrestore(&qp->s_lock, flags); + } +} + +static void update_r_next_psn_fecn(struct hfi1_packet *packet, + struct hfi1_qp_priv *priv, + struct hfi1_ctxtdata *rcd, + struct tid_rdma_flow *flow, + bool fecn) +{ + /* + * If a start/middle packet is delivered here due to + * RSM rule and FECN, we need to update the r_next_psn. + */ + if (fecn && packet->etype == RHF_RCV_TYPE_EAGER && + !(priv->s_flags & HFI1_R_TID_SW_PSN)) { + struct hfi1_devdata *dd = rcd->dd; + + flow->flow_state.r_next_psn = + read_r_next_psn(dd, rcd->ctxt, flow->idx); + } +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h new file mode 100644 index 0000000000..6e82df2190 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#ifndef HFI1_TID_RDMA_H +#define HFI1_TID_RDMA_H + +#include <linux/circ_buf.h> +#include "common.h" + +/* Add a convenience helper */ +#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1)) +#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) +#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) + +#define TID_RDMA_MIN_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ +#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ +#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) +#define TID_RDMA_SEGMENT_SHIFT 18 + +/* + * Bit definitions for priv->s_flags. + * These bit flags overload the bit flags defined for the QP's s_flags. + * Due to the fact that these bit fields are used only for the QP priv + * s_flags, there are no collisions. + * + * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock + * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock + */ +#define HFI1_S_TID_BUSY_SET BIT(0) +/* BIT(1) reserved for RVT_S_BUSY. */ +#define HFI1_R_TID_RSC_TIMER BIT(2) +/* BIT(3) reserved for RVT_S_RESP_PENDING. */ +/* BIT(4) reserved for RVT_S_ACK_PENDING. */ +#define HFI1_S_TID_WAIT_INTERLCK BIT(5) +#define HFI1_R_TID_WAIT_INTERLCK BIT(6) +/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */ +/* BIT(16) reserved for RVT_S_SEND_ONE */ +#define HFI1_S_TID_RETRY_TIMER BIT(17) +/* BIT(18) reserved for RVT_S_ECN. */ +#define HFI1_R_TID_SW_PSN BIT(19) +/* BIT(26) reserved for HFI1_S_WAIT_HALT */ +/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */ +/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */ + +/* + * Unlike regular IB RDMA VERBS, which do not require an entry + * in the s_ack_queue, TID RDMA WRITE requests do because they + * generate responses. + * Therefore, the s_ack_queue needs to be extended by a certain + * amount. The key point is that the queue needs to be extended + * without letting the "user" know so they user doesn't end up + * using these extra entries. + */ +#define HFI1_TID_RDMA_WRITE_CNT 8 + +struct tid_rdma_params { + struct rcu_head rcu_head; + u32 qp; + u32 max_len; + u16 jkey; + u8 max_read; + u8 max_write; + u8 timeout; + u8 urg; + u8 version; +}; + +struct tid_rdma_qp_params { + struct work_struct trigger_work; + struct tid_rdma_params local; + struct tid_rdma_params __rcu *remote; +}; + +/* Track state for each hardware flow */ +struct tid_flow_state { + u32 generation; + u32 psn; + u8 index; + u8 last_index; +}; + +enum tid_rdma_req_state { + TID_REQUEST_INACTIVE = 0, + TID_REQUEST_INIT, + TID_REQUEST_INIT_RESEND, + TID_REQUEST_ACTIVE, + TID_REQUEST_RESEND, + TID_REQUEST_RESEND_ACTIVE, + TID_REQUEST_QUEUED, + TID_REQUEST_SYNC, + TID_REQUEST_RNR_NAK, + TID_REQUEST_COMPLETE, +}; + +struct tid_rdma_request { + struct rvt_qp *qp; + struct hfi1_ctxtdata *rcd; + union { + struct rvt_swqe *swqe; + struct rvt_ack_entry *ack; + } e; + + struct tid_rdma_flow *flows; /* array of tid flows */ + struct rvt_sge_state ss; /* SGE state for TID RDMA requests */ + u16 n_flows; /* size of the flow buffer window */ + u16 setup_head; /* flow index we are setting up */ + u16 clear_tail; /* flow index we are clearing */ + u16 flow_idx; /* flow index most recently set up */ + u16 acked_tail; + + u32 seg_len; + u32 total_len; + u32 r_ack_psn; /* next expected ack PSN */ + u32 r_flow_psn; /* IB PSN of next segment start */ + u32 r_last_acked; /* IB PSN of last ACK'ed packet */ + u32 s_next_psn; /* IB PSN of next segment start for read */ + + u32 total_segs; /* segments required to complete a request */ + u32 cur_seg; /* index of current segment */ + u32 comp_seg; /* index of last completed segment */ + u32 ack_seg; /* index of last ack'ed segment */ + u32 alloc_seg; /* index of next segment to be allocated */ + u32 isge; /* index of "current" sge */ + u32 ack_pending; /* num acks pending for this request */ + + enum tid_rdma_req_state state; +}; + +/* + * When header suppression is used, PSNs associated with a "flow" are + * relevant (and not the PSNs maintained by verbs). Track per-flow + * PSNs here for a TID RDMA segment. + * + */ +struct flow_state { + u32 flags; + u32 resp_ib_psn; /* The IB PSN of the response for this flow */ + u32 generation; /* generation of flow */ + u32 spsn; /* starting PSN in TID space */ + u32 lpsn; /* last PSN in TID space */ + u32 r_next_psn; /* next PSN to be received (in TID space) */ + + /* For tid rdma read */ + u32 ib_spsn; /* starting PSN in Verbs space */ + u32 ib_lpsn; /* last PSn in Verbs space */ +}; + +struct tid_rdma_pageset { + dma_addr_t addr : 48; /* Only needed for the first page */ + u8 idx: 8; + u8 count : 7; + u8 mapped: 1; +}; + +/** + * kern_tid_node - used for managing TID's in TID groups + * + * @grp_idx: rcd relative index to tid_group + * @map: grp->map captured prior to programming this TID group in HW + * @cnt: Only @cnt of available group entries are actually programmed + */ +struct kern_tid_node { + struct tid_group *grp; + u8 map; + u8 cnt; +}; + +/* Overall info for a TID RDMA segment */ +struct tid_rdma_flow { + /* + * While a TID RDMA segment is being transferred, it uses a QP number + * from the "KDETH section of QP numbers" (which is different from the + * QP number that originated the request). Bits 11-15 of these QP + * numbers identify the "TID flow" for the segment. + */ + struct flow_state flow_state; + struct tid_rdma_request *req; + u32 tid_qpn; + u32 tid_offset; + u32 length; + u32 sent; + u8 tnode_cnt; + u8 tidcnt; + u8 tid_idx; + u8 idx; + u8 npagesets; + u8 npkts; + u8 pkt; + u8 resync_npkts; + struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; + struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; + u32 tid_entry[TID_RDMA_MAX_PAGES]; +}; + +enum tid_rnr_nak_state { + TID_RNR_NAK_INIT = 0, + TID_RNR_NAK_SEND, + TID_RNR_NAK_SENT, +}; + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); +void tid_rdma_conn_error(struct rvt_qp *qp); +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); + +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last); +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req); +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req); +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +/** + * trdma_clean_swqe - clean flows for swqe if large send queue + * @qp: the qp + * @wqe: the send wqe + */ +static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + if (!wqe->priv) + return; + __trdma_clean_swqe(qp, wqe); +} + +void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp); + +int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_init_attr *init_attr); +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); + +void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp); + +int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp); +void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp); +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd); + +struct cntr_entry; +u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data); + +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len); +void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet); +u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth0, + u32 *bth1, u32 *bth2, u32 *len, bool *last); +void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet); +bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd, + struct hfi1_pportdata *ppd, + struct hfi1_packet *packet); +void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + u32 *bth2); +void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp); +bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe); + +void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (wqe->priv && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_RDMA_WRITE) && + wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE) + setup_tid_rdma_wqe(qp, wqe); +} + +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); + +void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet); + +u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth1, + u32 bth2, u32 *len, + struct rvt_sge_state **ss); + +void hfi1_del_tid_reap_timer(struct rvt_qp *qp); + +void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet); + +bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); + +void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet); + +u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u16 iflow, + u32 *bth1, u32 *bth2); + +void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet); + +void hfi1_add_tid_retry_timer(struct rvt_qp *qp); +void hfi1_del_tid_retry_timer(struct rvt_qp *qp); + +u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u16 fidx); + +void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet); + +struct hfi1_pkt_state; +int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +void _hfi1_do_tid_send(struct work_struct *work); + +bool hfi1_schedule_tid_send(struct rvt_qp *qp); + +bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e); + +#endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c new file mode 100644 index 0000000000..8302469582 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -0,0 +1,532 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ +#define CREATE_TRACE_POINTS +#include "trace.h" +#include "exp_rcv.h" +#include "ipoib.h" + +static u8 __get_ib_hdr_len(struct ib_header *hdr) +{ + struct ib_other_headers *ohdr; + u8 opcode; + + if (ib_get_lnh(hdr) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + opcode = ib_bth_get_opcode(ohdr); + return hdr_len_by_opcode[opcode] == 0 ? + 0 : hdr_len_by_opcode[opcode] - (12 + 8); +} + +static u8 __get_16b_hdr_len(struct hfi1_16b_header *hdr) +{ + struct ib_other_headers *ohdr = NULL; + u8 opcode; + u8 l4 = hfi1_16B_get_l4(hdr); + + if (l4 == OPA_16B_L4_FM) { + opcode = IB_OPCODE_UD_SEND_ONLY; + return (8 + 8); /* No BTH */ + } + + if (l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + + opcode = ib_bth_get_opcode(ohdr); + return hdr_len_by_opcode[opcode] == 0 ? + 0 : hdr_len_by_opcode[opcode] - (12 + 8 + 8); +} + +u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet) +{ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return __get_ib_hdr_len(packet->hdr); + else + return __get_16b_hdr_len(packet->hdr); +} + +u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opa_hdr) +{ + if (!opa_hdr->hdr_type) + return __get_ib_hdr_len(&opa_hdr->ibh); + else + return __get_16b_hdr_len(&opa_hdr->opah); +} + +const char *hfi1_trace_get_packet_l4_str(u8 l4) +{ + if (l4) + return "16B"; + else + return "9B"; +} + +const char *hfi1_trace_get_packet_l2_str(u8 l2) +{ + switch (l2) { + case 0: + return "0"; + case 1: + return "1"; + case 2: + return "16B"; + case 3: + return "9B"; + } + return ""; +} + +#define IMM_PRN "imm:%d" +#define RETH_PRN "reth vaddr:0x%.16llx rkey:0x%.8x dlen:0x%.8x" +#define AETH_PRN "aeth syn:0x%.2x %s msn:0x%.8x" +#define DETH_PRN "deth qkey:0x%.8x sqpn:0x%.6x" +#define DETH_ENTROPY_PRN "deth qkey:0x%.8x sqpn:0x%.6x entropy:0x%.2x" +#define IETH_PRN "ieth rkey:0x%.8x" +#define ATOMICACKETH_PRN "origdata:%llx" +#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx" +#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x" +#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x" +#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_READ_RSP_PRN "verbs_qp 0x%x" +#define TID_WRITE_REQ_PRN "original_qp 0x%x" +#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_WRITE_DATA_PRN "verbs_qp 0x%x" +#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x" +#define TID_RESYNC_PRN "verbs_qp 0x%x" + +#define OP(transport, op) IB_OPCODE_## transport ## _ ## op + +static const char *parse_syndrome(u8 syndrome) +{ + switch (syndrome >> 5) { + case 0: + return "ACK"; + case 1: + return "RNRNAK"; + case 3: + return "NAK"; + } + return ""; +} + +void hfi1_trace_parse_9b_bth(struct ib_other_headers *ohdr, + u8 *ack, bool *becn, bool *fecn, u8 *mig, + u8 *se, u8 *pad, u8 *opcode, u8 *tver, + u16 *pkey, u32 *psn, u32 *qpn) +{ + *ack = ib_bth_get_ackreq(ohdr); + *becn = ib_bth_get_becn(ohdr); + *fecn = ib_bth_get_fecn(ohdr); + *mig = ib_bth_get_migreq(ohdr); + *se = ib_bth_get_se(ohdr); + *pad = ib_bth_get_pad(ohdr); + *opcode = ib_bth_get_opcode(ohdr); + *tver = ib_bth_get_tver(ohdr); + *pkey = ib_bth_get_pkey(ohdr); + *psn = mask_psn(ib_bth_get_psn(ohdr)); + *qpn = ib_bth_get_qpn(ohdr); +} + +void hfi1_trace_parse_16b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *mig, u8 *opcode, + u8 *pad, u8 *se, u8 *tver, + u32 *psn, u32 *qpn) +{ + *ack = ib_bth_get_ackreq(ohdr); + *mig = ib_bth_get_migreq(ohdr); + *opcode = ib_bth_get_opcode(ohdr); + *pad = ib_bth_get_pad(ohdr); + *se = ib_bth_get_se(ohdr); + *tver = ib_bth_get_tver(ohdr); + *psn = mask_psn(ib_bth_get_psn(ohdr)); + *qpn = ib_bth_get_qpn(ohdr); +} + +static u16 ib_get_len(const struct ib_header *hdr) +{ + return be16_to_cpu(hdr->lrh[2]); +} + +void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5, + u8 *lnh, u8 *lver, u8 *sl, u8 *sc, + u16 *len, u32 *dlid, u32 *slid) +{ + *lnh = ib_get_lnh(hdr); + *lver = ib_get_lver(hdr); + *sl = ib_get_sl(hdr); + *sc = ib_get_sc(hdr) | (sc5 << 4); + *len = ib_get_len(hdr); + *dlid = ib_get_dlid(hdr); + *slid = ib_get_slid(hdr); +} + +void hfi1_trace_parse_16b_hdr(struct hfi1_16b_header *hdr, + u8 *age, bool *becn, bool *fecn, + u8 *l4, u8 *rc, u8 *sc, + u16 *entropy, u16 *len, u16 *pkey, + u32 *dlid, u32 *slid) +{ + *age = hfi1_16B_get_age(hdr); + *becn = hfi1_16B_get_becn(hdr); + *fecn = hfi1_16B_get_fecn(hdr); + *l4 = hfi1_16B_get_l4(hdr); + *rc = hfi1_16B_get_rc(hdr); + *sc = hfi1_16B_get_sc(hdr); + *entropy = hfi1_16B_get_entropy(hdr); + *len = hfi1_16B_get_len(hdr); + *pkey = hfi1_16B_get_pkey(hdr); + *dlid = hfi1_16B_get_dlid(hdr); + *slid = hfi1_16B_get_slid(hdr); +} + +#define LRH_PRN "len:%d sc:%d dlid:0x%.4x slid:0x%.4x " +#define LRH_9B_PRN "lnh:%d,%s lver:%d sl:%d" +#define LRH_16B_PRN "age:%d becn:%d fecn:%d l4:%d " \ + "rc:%d sc:%d pkey:0x%.4x entropy:0x%.4x" +const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, + u8 age, bool becn, bool fecn, u8 l4, + u8 lnh, const char *lnh_name, u8 lver, + u8 rc, u8 sc, u8 sl, u16 entropy, + u16 len, u16 pkey, u32 dlid, u32 slid) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, LRH_PRN, len, sc, dlid, slid); + + if (bypass) + trace_seq_printf(p, LRH_16B_PRN, + age, becn, fecn, l4, rc, sc, pkey, entropy); + + else + trace_seq_printf(p, LRH_9B_PRN, + lnh, lnh_name, lver, sl); + trace_seq_putc(p, 0); + + return ret; +} + +#define BTH_9B_PRN \ + "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d pkey:0x%.4x " \ + "f:%d b:%d qpn:0x%.6x a:%d psn:0x%.8x" +#define BTH_16B_PRN \ + "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d " \ + "qpn:0x%.6x a:%d psn:0x%.8x" +#define L4_FM_16B_PRN \ + "op:0x%.2x,%s dest_qpn:0x%.6x src_qpn:0x%.6x" +const char *hfi1_trace_fmt_rest(struct trace_seq *p, bool bypass, u8 l4, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn, + u32 dest_qpn, u32 src_qpn) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (bypass) + if (l4 == OPA_16B_L4_FM) + trace_seq_printf(p, L4_FM_16B_PRN, + opcode, opname, dest_qpn, src_qpn); + else + trace_seq_printf(p, BTH_16B_PRN, + opcode, opname, + se, mig, pad, tver, qpn, ack, psn); + + else + trace_seq_printf(p, BTH_9B_PRN, + opcode, opname, + se, mig, pad, tver, pkey, fecn, becn, + qpn, ack, psn); + trace_seq_putc(p, 0); + + return ret; +} + +const char *parse_everbs_hdrs( + struct trace_seq *p, + u8 opcode, u8 l4, u32 dest_qpn, u32 src_qpn, + void *ehdrs) +{ + union ib_ehdrs *eh = ehdrs; + const char *ret = trace_seq_buffer_ptr(p); + + if (l4 == OPA_16B_L4_FM) { + trace_seq_printf(p, "mgmt pkt"); + goto out; + } + + switch (opcode) { + /* imm */ + case OP(RC, SEND_LAST_WITH_IMMEDIATE): + case OP(UC, SEND_LAST_WITH_IMMEDIATE): + case OP(RC, SEND_ONLY_WITH_IMMEDIATE): + case OP(UC, SEND_ONLY_WITH_IMMEDIATE): + case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + trace_seq_printf(p, IMM_PRN, + be32_to_cpu(eh->imm_data)); + break; + /* reth + imm */ + case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, RETH_PRN " " IMM_PRN, + get_ib_reth_vaddr(&eh->rc.reth), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length), + be32_to_cpu(eh->rc.imm_data)); + break; + /* reth */ + case OP(RC, RDMA_READ_REQUEST): + case OP(RC, RDMA_WRITE_FIRST): + case OP(UC, RDMA_WRITE_FIRST): + case OP(RC, RDMA_WRITE_ONLY): + case OP(UC, RDMA_WRITE_ONLY): + trace_seq_printf(p, RETH_PRN, + get_ib_reth_vaddr(&eh->rc.reth), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length)); + break; + case OP(RC, RDMA_READ_RESPONSE_FIRST): + case OP(RC, RDMA_READ_RESPONSE_LAST): + case OP(RC, RDMA_READ_RESPONSE_ONLY): + case OP(RC, ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24, + parse_syndrome(be32_to_cpu(eh->aeth) >> 24), + be32_to_cpu(eh->aeth) & IB_MSN_MASK); + break; + case OP(TID_RDMA, WRITE_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_WRITE_REQ_PRN, + le32_to_cpu(eh->tid_rdma.w_req.kdeth0), + le32_to_cpu(eh->tid_rdma.w_req.kdeth1), + ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.w_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.w_req.reth.length), + be32_to_cpu(eh->tid_rdma.w_req.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_RESP): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_WRITE_RSP_PRN, + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0), + le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1), + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.w_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp)); + break; + case OP(TID_RDMA, WRITE_DATA_LAST): + case OP(TID_RDMA, WRITE_DATA): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN, + le32_to_cpu(eh->tid_rdma.w_data.kdeth0), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID), + KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.w_data.kdeth1), + KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.w_data.verbs_qp)); + break; + case OP(TID_RDMA, READ_REQ): + trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " " + TID_READ_REQ_PRN, + le32_to_cpu(eh->tid_rdma.r_req.kdeth0), + le32_to_cpu(eh->tid_rdma.r_req.kdeth1), + ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr), + be32_to_cpu(eh->tid_rdma.r_req.reth.rkey), + be32_to_cpu(eh->tid_rdma.r_req.reth.length), + be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.r_req.verbs_qp)); + break; + case OP(TID_RDMA, READ_RESP): + trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " " + TID_READ_RSP_PRN, + le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET), + le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1), + KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY), + be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.r_rsp.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp)); + break; + case OP(TID_RDMA, ACK): + trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " " + TID_ACK_PRN, + le32_to_cpu(eh->tid_rdma.ack.kdeth0), + le32_to_cpu(eh->tid_rdma.ack.kdeth1), + be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24, + parse_syndrome(/* aeth */ + be32_to_cpu(eh->tid_rdma.ack.aeth) + >> 24), + (be32_to_cpu(eh->tid_rdma.ack.aeth) & + IB_MSN_MASK), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn), + be32_to_cpu(eh->tid_rdma.ack.verbs_psn), + be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp), + be32_to_cpu(eh->tid_rdma.ack.verbs_qp)); + break; + case OP(TID_RDMA, RESYNC): + trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN, + le32_to_cpu(eh->tid_rdma.resync.kdeth0), + le32_to_cpu(eh->tid_rdma.resync.kdeth1), + be32_to_cpu(eh->tid_rdma.resync.verbs_qp)); + break; + /* aeth + atomicacketh */ + case OP(RC, ATOMIC_ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, + be32_to_cpu(eh->at.aeth) >> 24, + parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24), + be32_to_cpu(eh->at.aeth) & IB_MSN_MASK, + ib_u64_get(&eh->at.atomic_ack_eth)); + break; + /* atomiceth */ + case OP(RC, COMPARE_SWAP): + case OP(RC, FETCH_ADD): + trace_seq_printf(p, ATOMICETH_PRN, + get_ib_ateth_vaddr(&eh->atomic_eth), + eh->atomic_eth.rkey, + get_ib_ateth_swap(&eh->atomic_eth), + get_ib_ateth_compare(&eh->atomic_eth)); + break; + /* deth */ + case OP(UD, SEND_ONLY): + trace_seq_printf(p, DETH_ENTROPY_PRN, + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK, + be32_to_cpu(eh->ud.deth[1]) >> + HFI1_IPOIB_ENTROPY_SHIFT); + break; + case OP(UD, SEND_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, DETH_PRN, + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK); + break; + /* ieth */ + case OP(RC, SEND_LAST_WITH_INVALIDATE): + case OP(RC, SEND_ONLY_WITH_INVALIDATE): + trace_seq_printf(p, IETH_PRN, + be32_to_cpu(eh->ieth)); + break; + } +out: + trace_seq_putc(p, 0); + return ret; +} + +const char *parse_sdma_flags( + struct trace_seq *p, + u64 desc0, u64 desc1) +{ + const char *ret = trace_seq_buffer_ptr(p); + char flags[5] = { 'x', 'x', 'x', 'x', 0 }; + + flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-'; + flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + trace_seq_printf(p, "%s", flags); + if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) + trace_seq_printf(p, " amode:%u aidx:%u alen:%u", + (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) & + SDMA_DESC1_HEADER_MODE_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) & + SDMA_DESC1_HEADER_INDEX_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) & + SDMA_DESC1_HEADER_DWS_MASK)); + return ret; +} + +const char *print_u32_array( + struct trace_seq *p, + u32 *arr, int len) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; i < len ; i++) + trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]); + trace_seq_putc(p, 0); + return ret; +} + +u8 hfi1_trace_get_tid_ctrl(u32 ent) +{ + return EXP_TID_GET(ent, CTRL); +} + +u16 hfi1_trace_get_tid_len(u32 ent) +{ + return EXP_TID_GET(ent, LEN); +} + +u16 hfi1_trace_get_tid_idx(u32 ent) +{ + return EXP_TID_GET(ent, IDX); +} + +struct hfi1_ctxt_hist { + atomic_t count; + atomic_t data[255]; +}; + +static struct hfi1_ctxt_hist hist = { + .count = ATOMIC_INIT(0) +}; + +const char *hfi1_trace_print_rsm_hist(struct trace_seq *p, unsigned int ctxt) +{ + int i, len = ARRAY_SIZE(hist.data); + const char *ret = trace_seq_buffer_ptr(p); + unsigned long packet_count = atomic_fetch_inc(&hist.count); + + trace_seq_printf(p, "packet[%lu]", packet_count); + for (i = 0; i < len; ++i) { + unsigned long val; + atomic_t *count = &hist.data[i]; + + if (ctxt == i) + val = atomic_fetch_inc(count); + else + val = atomic_read(count); + + if (val) + trace_seq_printf(p, "(%d:%lu)", i, val); + } + trace_seq_putc(p, 0); + return ret; +} + +__hfi1_trace_fn(AFFINITY); +__hfi1_trace_fn(PKT); +__hfi1_trace_fn(PROC); +__hfi1_trace_fn(SDMA); +__hfi1_trace_fn(LINKVERB); +__hfi1_trace_fn(DEBUG); +__hfi1_trace_fn(SNOOP); +__hfi1_trace_fn(CNTR); +__hfi1_trace_fn(PIO); +__hfi1_trace_fn(DC8051); +__hfi1_trace_fn(FIRMWARE); +__hfi1_trace_fn(RCVCTRL); +__hfi1_trace_fn(TID); +__hfi1_trace_fn(MMU); +__hfi1_trace_fn(IOCTL); diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h new file mode 100644 index 0000000000..31e027c5a0 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#include "trace_dbg.h" +#include "trace_misc.h" +#include "trace_ctxts.h" +#include "trace_ibhdrs.h" +#include "trace_rc.h" +#include "trace_rx.h" +#include "trace_tx.h" +#include "trace_mmu.h" +#include "trace_iowait.h" +#include "trace_tid.h" diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h new file mode 100644 index 0000000000..1858eaf33b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* +* Copyright(c) 2015 - 2020 Intel Corporation. +*/ + +#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_CTXTS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ctxts + +#define UCTXT_FMT \ + "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, " \ + "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx, subctxt_cnt:%u" +TRACE_EVENT(hfi1_uctxtdata, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt, + unsigned int subctxt), + TP_ARGS(dd, uctxt, subctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(unsigned int, subctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(void __iomem *, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_dma) + __field(u32, eager_cnt) + __field(u64, rcvegr_dma) + __field(unsigned int, subctxt_cnt) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->subctxt = subctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free); + __entry->piobase = uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = get_hdrq_cnt(uctxt); + __entry->rcvhdrq_dma = uctxt->rcvhdrq_dma; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_dma = uctxt->egrbufs.rcvtids[0].dma; + __entry->subctxt_cnt = uctxt->subctxt_cnt; + ), + TP_printk("[%s] ctxt %u:%u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_dma, + __entry->eager_cnt, + __entry->rcvegr_dma, + __entry->subctxt_cnt + ) +); + +#define CINFO_FMT \ + "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" +TRACE_EVENT(hfi1_ctxt_info, + TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt, + unsigned int subctxt, + struct hfi1_ctxt_info *cinfo), + TP_ARGS(dd, ctxt, subctxt, cinfo), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(unsigned int, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->egrtids = cinfo->egrtids; + __entry->rcvhdrq_cnt = cinfo->rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo->rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo->sdma_ring_size; + __entry->rcvegr_size = cinfo->rcvegr_size; + ), + TP_printk("[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) +); + +const char *hfi1_trace_print_rsm_hist(struct trace_seq *p, unsigned int ctxt); +TRACE_EVENT(ctxt_rsm_hist, + TP_PROTO(unsigned int ctxt), + TP_ARGS(ctxt), + TP_STRUCT__entry(__field(unsigned int, ctxt)), + TP_fast_assign(__entry->ctxt = ctxt;), + TP_printk("%s", hfi1_trace_print_rsm_hist(p, __entry->ctxt)) +); + +#endif /* __HFI1_TRACE_CTXTS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ctxts +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h new file mode 100644 index 0000000000..489395bfb5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* +* Copyright(c) 2015 - 2018 Intel Corporation. +*/ + +#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_EXTRA_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +/* + * Note: + * This produces a REALLY ugly trace in the console output when the string is + * too long. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_dbg + +#define MAX_MSG_LEN 512 + +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif + +DECLARE_EVENT_CLASS(hfi1_trace_template, + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry(__string(function, function) + __vstring(msg, vaf->fmt, vaf->va) + ), + TP_fast_assign(__assign_str(function, function); + __assign_vstr(msg, vaf->fmt, vaf->va); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) +); + +#pragma GCC diagnostic pop + +/* + * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an + * actual function to work and can not be in a macro. + */ +#define __hfi1_trace_def(lvl) \ +void __printf(2, 3) __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ + \ +DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ + TP_PROTO(const char *function, struct va_format *vaf), \ + TP_ARGS(function, vaf)) + +#define __hfi1_trace_fn(lvl) \ +void __printf(2, 3) __hfi1_trace_##lvl(const char *func, char *fmt, ...)\ +{ \ + struct va_format vaf = { \ + .fmt = fmt, \ + }; \ + va_list args; \ + \ + va_start(args, fmt); \ + vaf.va = &args; \ + trace_hfi1_ ##lvl(func, &vaf); \ + va_end(args); \ + return; \ +} + +/* + * To create a new trace level simply define it below and as a __hfi1_trace_fn + * in trace.c. This will create all the hooks for calling + * hfi1_cdbg(LVL, fmt, ...); as well as take care of all + * the debugfs stuff. + */ +__hfi1_trace_def(AFFINITY); +__hfi1_trace_def(PKT); +__hfi1_trace_def(PROC); +__hfi1_trace_def(SDMA); +__hfi1_trace_def(LINKVERB); +__hfi1_trace_def(DEBUG); +__hfi1_trace_def(SNOOP); +__hfi1_trace_def(CNTR); +__hfi1_trace_def(PIO); +__hfi1_trace_def(DC8051); +__hfi1_trace_def(FIRMWARE); +__hfi1_trace_def(RCVCTRL); +__hfi1_trace_def(TID); +__hfi1_trace_def(MMU); +__hfi1_trace_def(IOCTL); + +#define hfi1_cdbg(which, fmt, ...) \ + __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) + +#define hfi1_dbg(fmt, ...) \ + hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) + +/* + * Define HFI1_EARLY_DBG at compile time or here to enable early trace + * messages. Do not check in an enablement for this. + */ + +#ifdef HFI1_EARLY_DBG +#define hfi1_dbg_early(fmt, ...) \ + trace_printk(fmt, ##__VA_ARGS__) +#else +#define hfi1_dbg_early(fmt, ...) +#endif + +#endif /* __HFI1_TRACE_EXTRA_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_dbg +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h new file mode 100644 index 0000000000..b33f8f575f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -0,0 +1,455 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + */ + +#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IBHDRS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ibhdrs + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \ + ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \ + ib_opcode_name(TID_RDMA_WRITE_REQ), \ + ib_opcode_name(TID_RDMA_WRITE_RESP), \ + ib_opcode_name(TID_RDMA_WRITE_DATA), \ + ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \ + ib_opcode_name(TID_RDMA_READ_REQ), \ + ib_opcode_name(TID_RDMA_READ_RESP), \ + ib_opcode_name(TID_RDMA_RESYNC), \ + ib_opcode_name(TID_RDMA_ACK), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) + +u8 ibhdr_exhdr_len(struct ib_header *hdr); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, + u8 l4, u32 dest_qpn, u32 src_qpn, + void *ehdrs); +u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opah); +u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet); +const char *hfi1_trace_get_packet_l4_str(u8 l4); +void hfi1_trace_parse_9b_bth(struct ib_other_headers *ohdr, + u8 *ack, bool *becn, bool *fecn, u8 *mig, + u8 *se, u8 *pad, u8 *opcode, u8 *tver, + u16 *pkey, u32 *psn, u32 *qpn); +void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5, + u8 *lnh, u8 *lver, u8 *sl, u8 *sc, + u16 *len, u32 *dlid, u32 *slid); +void hfi1_trace_parse_16b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *mig, u8 *opcode, + u8 *pad, u8 *se, u8 *tver, + u32 *psn, u32 *qpn); +void hfi1_trace_parse_16b_hdr(struct hfi1_16b_header *hdr, + u8 *age, bool *becn, bool *fecn, + u8 *l4, u8 *rc, u8 *sc, + u16 *entropy, u16 *len, u16 *pkey, + u32 *dlid, u32 *slid); + +const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, + u8 age, bool becn, bool fecn, u8 l4, + u8 lnh, const char *lnh_name, u8 lver, + u8 rc, u8 sc, u8 sl, u16 entropy, + u16 len, u16 pkey, u32 dlid, u32 slid); + +const char *hfi1_trace_fmt_rest(struct trace_seq *p, bool bypass, u8 l4, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn, + u32 dest_qpn, u32 src_qpn); + +const char *hfi1_trace_get_packet_l2_str(u8 l2); + +#define __parse_ib_ehdrs(op, l4, dest_qpn, src_qpn, ehdrs) \ + parse_everbs_hdrs(p, op, l4, dest_qpn, src_qpn, ehdrs) + +#define lrh_name(lrh) { HFI1_##lrh, #lrh } +#define show_lnh(lrh) \ +__print_symbolic(lrh, \ + lrh_name(LRH_BTH), \ + lrh_name(LRH_GRH)) + +DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_packet *packet, + bool sc5), + TP_ARGS(dd, packet, sc5), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u8, etype) + __field(u8, ack) + __field(u8, age) + __field(bool, becn) + __field(bool, fecn) + __field(u8, l2) + __field(u8, l4) + __field(u8, lnh) + __field(u8, lver) + __field(u8, mig) + __field(u8, opcode) + __field(u8, pad) + __field(u8, rc) + __field(u8, sc) + __field(u8, se) + __field(u8, sl) + __field(u8, tver) + __field(u16, entropy) + __field(u16, len) + __field(u16, pkey) + __field(u32, dlid) + __field(u32, psn) + __field(u32, qpn) + __field(u32, slid) + __field(u32, dest_qpn) + __field(u32, src_qpn) + /* extended headers */ + __dynamic_array(u8, ehdrs, + hfi1_trace_packet_hdr_len(packet)) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + + __entry->etype = packet->etype; + __entry->l2 = hfi1_16B_get_l2(packet->hdr); + __entry->dest_qpn = 0; + __entry->src_qpn = 0; + if (__entry->etype == RHF_RCV_TYPE_BYPASS) { + hfi1_trace_parse_16b_hdr(packet->hdr, + &__entry->age, + &__entry->becn, + &__entry->fecn, + &__entry->l4, + &__entry->rc, + &__entry->sc, + &__entry->entropy, + &__entry->len, + &__entry->pkey, + &__entry->dlid, + &__entry->slid); + + if (__entry->l4 == OPA_16B_L4_FM) { + __entry->opcode = IB_OPCODE_UD_SEND_ONLY; + __entry->dest_qpn = hfi1_16B_get_dest_qpn(packet->mgmt); + __entry->src_qpn = hfi1_16B_get_src_qpn(packet->mgmt); + } else { + hfi1_trace_parse_16b_bth(packet->ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } + } else { + __entry->l4 = OPA_16B_L4_9B; + hfi1_trace_parse_9b_hdr(packet->hdr, sc5, + &__entry->lnh, + &__entry->lver, + &__entry->sl, + &__entry->sc, + &__entry->len, + &__entry->dlid, + &__entry->slid); + + hfi1_trace_parse_9b_bth(packet->ohdr, + &__entry->ack, + &__entry->becn, + &__entry->fecn, + &__entry->mig, + &__entry->se, + &__entry->pad, + &__entry->opcode, + &__entry->tver, + &__entry->pkey, + &__entry->psn, + &__entry->qpn); + } + /* extended headers */ + if (__entry->l4 != OPA_16B_L4_FM) + memcpy(__get_dynamic_array(ehdrs), + &packet->ohdr->u, + __get_dynamic_array_len(ehdrs)); + ), + TP_printk("[%s] (%s) %s %s hlen:%d %s", + __get_str(dev), + __entry->etype != RHF_RCV_TYPE_BYPASS ? + show_packettype(__entry->etype) : + hfi1_trace_get_packet_l2_str( + __entry->l2), + hfi1_trace_fmt_lrh(p, + __entry->etype == + RHF_RCV_TYPE_BYPASS, + __entry->age, + __entry->becn, + __entry->fecn, + __entry->l4, + __entry->lnh, + show_lnh(__entry->lnh), + __entry->lver, + __entry->rc, + __entry->sc, + __entry->sl, + __entry->entropy, + __entry->len, + __entry->pkey, + __entry->dlid, + __entry->slid), + hfi1_trace_fmt_rest(p, + __entry->etype == + RHF_RCV_TYPE_BYPASS, + __entry->l4, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn, + __entry->dest_qpn, + __entry->src_qpn), + /* extended headers */ + __get_dynamic_array_len(ehdrs), + __parse_ib_ehdrs( + __entry->opcode, + __entry->l4, + __entry->dest_qpn, + __entry->src_qpn, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_input_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_packet *packet, bool sc5), + TP_ARGS(dd, packet, sc5)); + +DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u8, hdr_type) + __field(u8, ack) + __field(u8, age) + __field(bool, becn) + __field(bool, fecn) + __field(u8, l4) + __field(u8, lnh) + __field(u8, lver) + __field(u8, mig) + __field(u8, opcode) + __field(u8, pad) + __field(u8, rc) + __field(u8, sc) + __field(u8, se) + __field(u8, sl) + __field(u8, tver) + __field(u16, entropy) + __field(u16, len) + __field(u16, pkey) + __field(u32, dlid) + __field(u32, psn) + __field(u32, qpn) + __field(u32, slid) + __field(u32, dest_qpn) + __field(u32, src_qpn) + /* extended headers */ + __dynamic_array(u8, ehdrs, + hfi1_trace_opa_hdr_len(opah)) + ), + TP_fast_assign( + struct ib_other_headers *ohdr; + + DD_DEV_ASSIGN(dd); + + __entry->hdr_type = opah->hdr_type; + __entry->dest_qpn = 0; + __entry->src_qpn = 0; + if (__entry->hdr_type) { + hfi1_trace_parse_16b_hdr(&opah->opah, + &__entry->age, + &__entry->becn, + &__entry->fecn, + &__entry->l4, + &__entry->rc, + &__entry->sc, + &__entry->entropy, + &__entry->len, + &__entry->pkey, + &__entry->dlid, + &__entry->slid); + + if (__entry->l4 == OPA_16B_L4_FM) { + ohdr = NULL; + __entry->opcode = IB_OPCODE_UD_SEND_ONLY; + __entry->dest_qpn = hfi1_16B_get_dest_qpn(&opah->opah.u.mgmt); + __entry->src_qpn = hfi1_16B_get_src_qpn(&opah->opah.u.mgmt); + } else { + if (__entry->l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &opah->opah.u.oth; + else + ohdr = &opah->opah.u.l.oth; + hfi1_trace_parse_16b_bth(ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } + } else { + __entry->l4 = OPA_16B_L4_9B; + hfi1_trace_parse_9b_hdr(&opah->ibh, sc5, + &__entry->lnh, + &__entry->lver, + &__entry->sl, + &__entry->sc, + &__entry->len, + &__entry->dlid, + &__entry->slid); + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &opah->ibh.u.oth; + else + ohdr = &opah->ibh.u.l.oth; + hfi1_trace_parse_9b_bth(ohdr, + &__entry->ack, + &__entry->becn, + &__entry->fecn, + &__entry->mig, + &__entry->se, + &__entry->pad, + &__entry->opcode, + &__entry->tver, + &__entry->pkey, + &__entry->psn, + &__entry->qpn); + } + + /* extended headers */ + if (__entry->l4 != OPA_16B_L4_FM) + memcpy(__get_dynamic_array(ehdrs), + &ohdr->u, __get_dynamic_array_len(ehdrs)); + ), + TP_printk("[%s] (%s) %s %s hlen:%d %s", + __get_str(dev), + hfi1_trace_get_packet_l4_str(__entry->l4), + hfi1_trace_fmt_lrh(p, + !!__entry->hdr_type, + __entry->age, + __entry->becn, + __entry->fecn, + __entry->l4, + __entry->lnh, + show_lnh(__entry->lnh), + __entry->lver, + __entry->rc, + __entry->sc, + __entry->sl, + __entry->entropy, + __entry->len, + __entry->pkey, + __entry->dlid, + __entry->slid), + hfi1_trace_fmt_rest(p, + !!__entry->hdr_type, + __entry->l4, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn, + __entry->dest_qpn, + __entry->src_qpn), + /* extended headers */ + __get_dynamic_array_len(ehdrs), + __parse_ib_ehdrs( + __entry->opcode, + __entry->l4, + __entry->dest_qpn, + __entry->src_qpn, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_output_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); + +DEFINE_EVENT(hfi1_output_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); + +DEFINE_EVENT(hfi1_output_ibhdr_template, sdma_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); + + +#endif /* __HFI1_TRACE_IBHDRS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ibhdrs +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_iowait.h b/drivers/infiniband/hw/hfi1/trace_iowait.h new file mode 100644 index 0000000000..27f4334ece --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_iowait.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IOWAIT_H + +#include <linux/tracepoint.h> +#include "iowait.h" +#include "verbs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_iowait + +DECLARE_EVENT_CLASS(hfi1_iowait_template, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag), + TP_STRUCT__entry(/* entry */ + __field(unsigned long, addr) + __field(unsigned long, flags) + __field(u32, flag) + __field(u32, qpn) + ), + TP_fast_assign(/* assign */ + __entry->addr = (unsigned long)wait; + __entry->flags = wait->flags; + __entry->flag = (1 << flag); + __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num; + ), + TP_printk(/* print */ + "iowait 0x%lx qp %u flags 0x%lx flag 0x%x", + __entry->addr, + __entry->qpn, + __entry->flags, + __entry->flag + ) + ); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +#endif /* __HFI1_TRACE_IOWAIT_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_iowait +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h new file mode 100644 index 0000000000..742675fa75 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_misc.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +*/ + +#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_MISC_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_misc + +TRACE_EVENT(hfi1_interrupt, + TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, + int src), + TP_ARGS(dd, is_entry, src), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __array(char, buf, 64) + __field(int, src) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + is_entry->is_name(__entry->buf, 64, + src - is_entry->start); + __entry->src = src; + ), + TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, + __entry->src) +); + +DECLARE_EVENT_CLASS( + hfi1_csr_template, + TP_PROTO(void __iomem *addr, u64 value), + TP_ARGS(addr, value), + TP_STRUCT__entry( + __field(void __iomem *, addr) + __field(u64, value) + ), + TP_fast_assign( + __entry->addr = addr; + __entry->value = value; + ), + TP_printk("addr %p value %llx", __entry->addr, __entry->value) +); + +DEFINE_EVENT( + hfi1_csr_template, hfi1_write_rcvarray, + TP_PROTO(void __iomem *addr, u64 value), + TP_ARGS(addr, value)); + +#ifdef CONFIG_FAULT_INJECTION +TRACE_EVENT(hfi1_fault_opcode, + TP_PROTO(struct rvt_qp *qp, u8 opcode), + TP_ARGS(qp, opcode), + TP_STRUCT__entry(DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, opcode) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->opcode = opcode; + ), + TP_printk("[%s] qpn 0x%x opcode 0x%x", + __get_str(dev), __entry->qpn, __entry->opcode) +); + +TRACE_EVENT(hfi1_fault_packet, + TP_PROTO(struct hfi1_packet *packet), + TP_ARGS(packet), + TP_STRUCT__entry(DD_DEV_ENTRY(packet->rcd->ppd->dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(packet->rcd->ppd->dd); + __entry->eflags = rhf_err_flags(packet->rhf); + __entry->ctxt = packet->rcd->ctxt; + __entry->hlen = packet->hlen; + __entry->tlen = packet->tlen; + __entry->updegr = packet->updegr; + __entry->etail = rhf_egr_index(packet->rhf); + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); +#endif + +#endif /* __HFI1_TRACE_MISC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_misc +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_mmu.h b/drivers/infiniband/hw/hfi1/trace_mmu.h new file mode 100644 index 0000000000..82cc12aa3f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_mmu.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2017 Intel Corporation. + */ + +#if !defined(__HFI1_TRACE_MMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_MMU_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_mmu + +DECLARE_EVENT_CLASS(hfi1_mmu_rb_template, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node), + TP_STRUCT__entry(__field(unsigned long, addr) + __field(unsigned long, len) + __field(unsigned int, refcount) + ), + TP_fast_assign(__entry->addr = node->addr; + __entry->len = node->len; + __entry->refcount = kref_read(&node->refcount); + ), + TP_printk("MMU node addr 0x%lx, len %lu, refcount %u", + __entry->addr, + __entry->len, + __entry->refcount + ) +); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_insert, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +TRACE_EVENT(hfi1_mmu_rb_search, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len), + TP_STRUCT__entry(__field(unsigned long, addr) + __field(unsigned long, len) + ), + TP_fast_assign(__entry->addr = addr; + __entry->len = len; + ), + TP_printk("MMU node addr 0x%lx, len %lu", + __entry->addr, + __entry->len + ) +); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_evict, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_release_node, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +#endif /* __HFI1_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_mmu +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h new file mode 100644 index 0000000000..7c3a1c7753 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* +* Copyright(c) 2015, 2016, 2017 Intel Corporation. +*/ + +#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_RC_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rc + +DECLARE_EVENT_CLASS(hfi1_rc_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u32, psn) + __field(u32, s_psn) + __field(u32, s_next_psn) + __field(u32, s_sending_psn) + __field(u32, s_sending_hpsn) + __field(u32, r_psn) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->psn = psn; + __entry->s_psn = qp->s_psn; + __entry->s_next_psn = qp->s_next_psn; + __entry->s_sending_psn = qp->s_sending_psn; + __entry->s_sending_hpsn = qp->s_sending_hpsn; + __entry->r_psn = qp->r_psn; + ), + TP_printk( + "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->psn, + __entry->s_psn, + __entry->s_next_psn, + __entry->s_sending_psn, + __entry->s_sending_hpsn, + __entry->r_psn + ) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_rc_template, hfi1_rc_completion, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DECLARE_EVENT_CLASS(/* rc_ack */ + hfi1_rc_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + struct rvt_swqe *wqe), + TP_ARGS(qp, aeth, psn, wqe), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u8, opcode) + __field(u32, spsn) + __field(u32, lpsn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->opcode = wqe->wr.opcode; + __entry->spsn = wqe->psn; + __entry->lpsn = wqe->lpsn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->opcode, + __entry->spsn, + __entry->lpsn + ) +); + +DEFINE_EVENT(/* do_rc_ack */ + hfi1_rc_ack_template, hfi1_rc_ack_do, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + struct rvt_swqe *wqe), + TP_ARGS(qp, aeth, psn, wqe) +); + +#endif /* __HFI1_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rc +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h new file mode 100644 index 0000000000..0da22f9bc7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_RX_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#define tidtype_name(type) { PT_##type, #type } +#define show_tidtype(type) \ +__print_symbolic(type, \ + tidtype_name(EXPECTED), \ + tidtype_name(EAGER), \ + tidtype_name(INVALID)) \ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rx + +TRACE_EVENT(hfi1_rcvhdr, + TP_PROTO(struct hfi1_packet *packet), + TP_ARGS(packet), + TP_STRUCT__entry(DD_DEV_ENTRY(packet->rcd->dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, etype) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(packet->rcd->dd); + __entry->eflags = rhf_err_flags(packet->rhf); + __entry->ctxt = packet->rcd->ctxt; + __entry->etype = packet->etype; + __entry->hlen = packet->hlen; + __entry->tlen = packet->tlen; + __entry->updegr = packet->updegr; + __entry->etail = rhf_egr_index(packet->rhf); + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->etype, show_packettype(__entry->etype), + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); + +TRACE_EVENT(hfi1_receive_interrupt, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd), + TP_ARGS(dd, rcd), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, ctxt) + __field(u8, slow_path) + __field(u8, dma_rtail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = rcd->ctxt; + __entry->slow_path = hfi1_is_slowpath(rcd); + __entry->dma_rtail = get_dma_rtail_setting(rcd); + ), + TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", + __get_str(dev), + __entry->ctxt, + __entry->slow_path, + __entry->dma_rtail + ) +); + +TRACE_EVENT(hfi1_mmu_invalidate, + TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type, + unsigned long start, unsigned long end), + TP_ARGS(ctxt, subctxt, type, start, end), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __string(type, type) + __field(unsigned long, start) + __field(unsigned long, end) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __assign_str(type, type); + __entry->start = start; + __entry->end = end; + ), + TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", + __entry->ctxt, + __entry->subctxt, + __get_str(type), + __entry->start, + __entry->end + ) + ); + +#endif /* __HFI1_TRACE_RX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rx +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h new file mode 100644 index 0000000000..d129b81959 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -0,0 +1,1642 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TID_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" + +#define tidtype_name(type) { PT_##type, #type } +#define show_tidtype(type) \ +__print_symbolic(type, \ + tidtype_name(EXPECTED), \ + tidtype_name(EAGER), \ + tidtype_name(INVALID)) \ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tid + +u8 hfi1_trace_get_tid_ctrl(u32 ent); +u16 hfi1_trace_get_tid_len(u32 ent); +u16 hfi1_trace_get_tid_idx(u32 ent); + +#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \ + "max write %u, max length %u, jkey 0x%x timeout %u " \ + "urg %u" + +#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \ + "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \ + "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \ + "tidcnt %u tid_idx %u tid_offset %u length %u sent %u" + +#define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \ + "used %u cnt %u" + +#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ + "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ + "r_head_ack_queue %u s_tail_ack_queue %u " \ + "s_acked_ack_queue %u s_ack_state 0x%x " \ + "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ + "iow_flags 0x%lx" + +#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \ + "s_head %u s_acked %u s_last %u s_psn 0x%x " \ + "s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \ + "iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u" + +#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \ + "tid_r_comp %u pending_tid_r_segs %u " \ + "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \ + "s_state 0x%x hw_flow_index %u generation 0x%x " \ + "fpsn 0x%x" + +#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \ + "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \ + "total_segs %u setup_head %u clear_tail %u flow_idx %u " \ + "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \ + "r_last_ackd 0x%x s_next_psn 0x%x" + +#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ + "s_acked_ack_queue %u s_tail_ack_queue %u " \ + "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ + " diff %d" + +#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \ + "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \ + "pending_tid_w_segs %u sync_pt %s " \ + "ps_nak_psn 0x%x ps_nak_state 0x%x " \ + "prnr_nak_state 0x%x hw_flow_index %u generation "\ + "0x%x fpsn 0x%x resync %s" \ + "r_next_psn_kdeth 0x%x" + +#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \ + "s_tid_tail %u s_tid_head %u " \ + "pending_tid_w_resp %u n_requests %u " \ + "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\ + "iow_flags 0x%lx s_state 0x%x s_retry %u" + +#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \ + "RcvTypeError 0x%x PSN 0x%x" + +DECLARE_EVENT_CLASS(/* class */ + hfi1_exp_tid_reg_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry(/* entry */ + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign(/* assign */ + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) +); + +DEFINE_EVENT(/* exp_tid_unreg */ + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma) +); + +DEFINE_EVENT(/* exp_tid_reg */ + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma) +); + +TRACE_EVENT(/* put_tid */ + hfi1_put_tid, + TP_PROTO(struct hfi1_devdata *dd, + u32 index, u32 type, unsigned long pa, u16 order), + TP_ARGS(dd, index, type, pa, order), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd) + __field(unsigned long, pa) + __field(u32, index) + __field(u32, type) + __field(u16, order) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd); + __entry->pa = pa; + __entry->index = index; + __entry->type = type; + __entry->order = order; + ), + TP_printk("[%s] type %s pa %lx index %u order %u", + __get_str(dev), + show_tidtype(__entry->type), + __entry->pa, + __entry->index, + __entry->order + ) +); + +TRACE_EVENT(/* exp_tid_inval */ + hfi1_exp_tid_inval, + TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), + TP_STRUCT__entry(/* entry */ + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) + ), + TP_fast_assign(/* assign */ + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma + ) +); + +DECLARE_EVENT_CLASS(/* opfn_state */ + hfi1_opfn_state_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u16, requested) + __field(u16, completed) + __field(u8, curr) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->requested = priv->opfn.requested; + __entry->completed = priv->opfn.completed; + __entry->curr = priv->opfn.curr; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x", + __get_str(dev), + __entry->qpn, + __entry->requested, + __entry->completed, + __entry->curr + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_request, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_sched_conn_request, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_response, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_reply, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_error, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* opfn_data */ + hfi1_opfn_data_template, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, state) + __field(u8, capcode) + __field(u64, data) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->capcode = capcode; + __entry->data = data; + ), + TP_printk(/* printk */ + "[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx", + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->capcode, + __entry->data + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_request, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_response, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_reply, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DECLARE_EVENT_CLASS(/* opfn_param */ + hfi1_opfn_param_template, + TP_PROTO(struct rvt_qp *qp, char remote, + struct tid_rdma_params *param), + TP_ARGS(qp, remote, param), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, remote) + __field(u32, param_qp) + __field(u32, max_len) + __field(u16, jkey) + __field(u8, max_read) + __field(u8, max_write) + __field(u8, timeout) + __field(u8, urg) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->remote = remote; + __entry->param_qp = param->qp; + __entry->max_len = param->max_len; + __entry->jkey = param->jkey; + __entry->max_read = param->max_read; + __entry->max_write = param->max_write; + __entry->timeout = param->timeout; + __entry->urg = param->urg; + ), + TP_printk(/* print */ + OPFN_PARAM_PRN, + __get_str(dev), + __entry->qpn, + __entry->remote ? "remote" : "local", + __entry->param_qp, + __entry->max_read, + __entry->max_write, + __entry->max_len, + __entry->jkey, + __entry->timeout, + __entry->urg + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_param_template, hfi1_opfn_param, + TP_PROTO(struct rvt_qp *qp, char remote, + struct tid_rdma_params *param), + TP_ARGS(qp, remote, param) +); + +DECLARE_EVENT_CLASS(/* msg */ + hfi1_msg_template, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more), + TP_STRUCT__entry(/* entry */ + __field(u32, qpn) + __string(msg, msg) + __field(u64, more) + ), + TP_fast_assign(/* assign */ + __entry->qpn = qp ? qp->ibqp.qp_num : 0; + __assign_str(msg, msg); + __entry->more = more; + ), + TP_printk(/* print */ + "qpn 0x%x %s 0x%llx", + __entry->qpn, + __get_str(msg), + __entry->more + ) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_opfn_conn_request, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_opfn_conn_error, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_alloc_tids, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_restart_req, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DECLARE_EVENT_CLASS(/* tid_flow_page */ + hfi1_tid_flow_page_template, + TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, + char mtu8k, char v1, void *vaddr), + TP_ARGS(qp, flow, index, mtu8k, v1, vaddr), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, mtu8k) + __field(char, v1) + __field(u32, index) + __field(u64, page) + __field(u64, vaddr) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->mtu8k = mtu8k; + __entry->v1 = v1; + __entry->index = index; + __entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL; + __entry->vaddr = (u64)vaddr; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->page, + __entry->mtu8k ? (__entry->v1 ? "v1" : "v0") : "vaddr", + __entry->vaddr + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_page_template, hfi1_tid_flow_page, + TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index, + char mtu8k, char v1, void *vaddr), + TP_ARGS(qp, flow, index, mtu8k, v1, vaddr) +); + +DECLARE_EVENT_CLASS(/* tid_pageset */ + hfi1_tid_pageset_template, + TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count), + TP_ARGS(qp, index, idx, count), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, index) + __field(u16, idx) + __field(u16, count) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->idx = idx; + __entry->count = count; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x list[%u]: idx %u count %u", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->idx, + __entry->count + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_pageset_template, hfi1_tid_pageset, + TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count), + TP_ARGS(qp, index, idx, count) +); + +DECLARE_EVENT_CLASS(/* tid_fow */ + hfi1_tid_flow_template, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(int, idx) + __field(u32, resp_ib_psn) + __field(u32, generation) + __field(u32, fspsn) + __field(u32, flpsn) + __field(u32, r_next_psn) + __field(u32, ib_spsn) + __field(u32, ib_lpsn) + __field(u32, npagesets) + __field(u32, tnode_cnt) + __field(u32, tidcnt) + __field(u32, tid_idx) + __field(u32, tid_offset) + __field(u32, length) + __field(u32, sent) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->idx = flow->idx; + __entry->resp_ib_psn = flow->flow_state.resp_ib_psn; + __entry->generation = flow->flow_state.generation; + __entry->fspsn = full_flow_psn(flow, + flow->flow_state.spsn); + __entry->flpsn = full_flow_psn(flow, + flow->flow_state.lpsn); + __entry->r_next_psn = flow->flow_state.r_next_psn; + __entry->ib_spsn = flow->flow_state.ib_spsn; + __entry->ib_lpsn = flow->flow_state.ib_lpsn; + __entry->npagesets = flow->npagesets; + __entry->tnode_cnt = flow->tnode_cnt; + __entry->tidcnt = flow->tidcnt; + __entry->tid_idx = flow->tid_idx; + __entry->tid_offset = flow->tid_offset; + __entry->length = flow->length; + __entry->sent = flow->sent; + ), + TP_printk(/* print */ + TID_FLOW_PRN, + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->idx, + __entry->resp_ib_psn, + __entry->generation, + __entry->fspsn, + __entry->flpsn, + __entry->r_next_psn, + __entry->ib_spsn, + __entry->ib_lpsn, + __entry->npagesets, + __entry->tnode_cnt, + __entry->tidcnt, + __entry->tid_idx, + __entry->tid_offset, + __entry->length, + __entry->sent + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_alloc, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_restart_req, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_flow_template, hfi1_tid_flow_read_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow), + TP_ARGS(qp, index, flow) +); + +DECLARE_EVENT_CLASS(/* tid_node */ + hfi1_tid_node_template, + TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, + u8 map, u8 used, u8 cnt), + TP_ARGS(qp, msg, index, base, map, used, cnt), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __string(msg, msg) + __field(u32, index) + __field(u32, base) + __field(u8, map) + __field(u8, used) + __field(u8, cnt) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __assign_str(msg, msg); + __entry->index = index; + __entry->base = base; + __entry->map = map; + __entry->used = used; + __entry->cnt = cnt; + ), + TP_printk(/* print */ + TID_NODE_PRN, + __get_str(dev), + __entry->qpn, + __get_str(msg), + __entry->index, + __entry->base, + __entry->map, + __entry->used, + __entry->cnt + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_node_template, hfi1_tid_node_add, + TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base, + u8 map, u8 used, u8 cnt), + TP_ARGS(qp, msg, index, base, map, used, cnt) +); + +DECLARE_EVENT_CLASS(/* tid_entry */ + hfi1_tid_entry_template, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(u8, ctrl) + __field(u16, idx) + __field(u16, len) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->ctrl = hfi1_trace_get_tid_ctrl(ent); + __entry->idx = hfi1_trace_get_tid_idx(ent); + __entry->len = hfi1_trace_get_tid_len(ent); + ), + TP_printk(/* print */ + "[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->idx, + __entry->len, + __entry->ctrl + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_alloc, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, int index, u32 ent), + TP_ARGS(qp, index, ent) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_entry_template, hfi1_tid_entry_build_write_data, + TP_PROTO(struct rvt_qp *qp, int index, u32 entry), + TP_ARGS(qp, index, entry) +); + +DECLARE_EVENT_CLASS(/* rsp_info */ + hfi1_responder_info_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, state) + __field(u8, s_state) + __field(u32, psn) + __field(u32, r_psn) + __field(u8, r_state) + __field(u8, r_flags) + __field(u8, r_head_ack_queue) + __field(u8, s_tail_ack_queue) + __field(u8, s_acked_ack_queue) + __field(u8, s_ack_state) + __field(u8, s_nak_state) + __field(u8, r_nak_state) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->s_state = qp->s_state; + __entry->psn = psn; + __entry->r_psn = qp->r_psn; + __entry->r_state = qp->r_state; + __entry->r_flags = qp->r_flags; + __entry->r_head_ack_queue = qp->r_head_ack_queue; + __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; + __entry->s_ack_state = qp->s_ack_state; + __entry->s_nak_state = qp->s_nak_state; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + ), + TP_printk(/* print */ + RSP_INFO_PRN, + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->s_state, + __entry->psn, + __entry->r_psn, + __entry->r_state, + __entry->r_flags, + __entry->r_head_ack_queue, + __entry->s_tail_ack_queue, + __entry->s_acked_ack_queue, + __entry->s_ack_state, + __entry->s_nak_state, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags + ) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(/* event */ + hfi1_responder_info_template, hfi1_rsp_read_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DECLARE_EVENT_CLASS(/* sender_info */ + hfi1_sender_info_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, state) + __field(u32, s_cur) + __field(u32, s_tail) + __field(u32, s_head) + __field(u32, s_acked) + __field(u32, s_last) + __field(u32, s_psn) + __field(u32, s_last_psn) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_num_rd) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->s_cur = qp->s_cur; + __entry->s_tail = qp->s_tail; + __entry->s_head = qp->s_head; + __entry->s_acked = qp->s_acked; + __entry->s_last = qp->s_last; + __entry->s_psn = qp->s_psn; + __entry->s_last_psn = qp->s_last_psn; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags; + __entry->iow_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags; + __entry->s_state = qp->s_state; + __entry->s_num_rd = qp->s_num_rd_atomic; + __entry->s_retry = qp->s_retry; + ), + TP_printk(/* print */ + SENDER_INFO_PRN, + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->s_cur, + __entry->s_tail, + __entry->s_head, + __entry->s_acked, + __entry->s_last, + __entry->s_psn, + __entry->s_last_psn, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->s_num_rd, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_rc_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_reset_psn, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_do_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_sender_info_template, hfi1_sender_read_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_read_sender */ + hfi1_tid_read_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, tid_r_reqs) + __field(u32, tid_r_comp) + __field(u32, pending_tid_r_segs) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->tid_r_reqs = priv->tid_r_reqs; + __entry->tid_r_comp = priv->tid_r_comp; + __entry->pending_tid_r_segs = priv->pending_tid_r_segs; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + ), + TP_printk(/* print */ + TID_READ_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->tid_r_reqs, + __entry->tid_r_comp, + __entry->pending_tid_r_segs, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_read_sender_template, hfi1_tid_read_sender_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_rdma_request */ + hfi1_tid_rdma_request_template, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u8, opcode) + __field(u32, psn) + __field(u32, lpsn) + __field(u32, cur_seg) + __field(u32, comp_seg) + __field(u32, ack_seg) + __field(u32, alloc_seg) + __field(u32, total_segs) + __field(u16, setup_head) + __field(u16, clear_tail) + __field(u16, flow_idx) + __field(u16, acked_tail) + __field(u32, state) + __field(u32, r_ack_psn) + __field(u32, r_flow_psn) + __field(u32, r_last_acked) + __field(u32, s_next_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->opcode = opcode; + __entry->psn = psn; + __entry->lpsn = lpsn; + __entry->cur_seg = req->cur_seg; + __entry->comp_seg = req->comp_seg; + __entry->ack_seg = req->ack_seg; + __entry->alloc_seg = req->alloc_seg; + __entry->total_segs = req->total_segs; + __entry->setup_head = req->setup_head; + __entry->clear_tail = req->clear_tail; + __entry->flow_idx = req->flow_idx; + __entry->acked_tail = req->acked_tail; + __entry->state = req->state; + __entry->r_ack_psn = req->r_ack_psn; + __entry->r_flow_psn = req->r_flow_psn; + __entry->r_last_acked = req->r_last_acked; + __entry->s_next_psn = req->s_next_psn; + ), + TP_printk(/* print */ + TID_REQ_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->opcode, + __entry->psn, + __entry->lpsn, + __entry->cur_seg, + __entry->comp_seg, + __entry->ack_seg, + __entry->alloc_seg, + __entry->total_segs, + __entry->setup_head, + __entry->clear_tail, + __entry->flow_idx, + __entry->acked_tail, + __entry->state, + __entry->r_ack_psn, + __entry->r_flow_psn, + __entry->r_last_acked, + __entry->s_next_psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_read_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_rdma_request_template, hfi1_tid_req_update_num_rd_atomic, + TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn, + struct tid_rdma_request *req), + TP_ARGS(qp, newreq, opcode, psn, lpsn, req) +); + +DECLARE_EVENT_CLASS(/* rc_rcv_err */ + hfi1_rc_rcv_err_template, + TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), + TP_ARGS(qp, opcode, psn, diff), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u8, state) + __field(u8, s_acked_ack_queue) + __field(u8, s_tail_ack_queue) + __field(u8, r_head_ack_queue) + __field(u32, opcode) + __field(u32, psn) + __field(u32, r_psn) + __field(int, diff) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->state = qp->state; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; + __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->r_head_ack_queue = qp->r_head_ack_queue; + __entry->opcode = opcode; + __entry->psn = psn; + __entry->r_psn = qp->r_psn; + __entry->diff = diff; + ), + TP_printk(/* print */ + RCV_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->state, + __entry->s_acked_ack_queue, + __entry->s_tail_ack_queue, + __entry->r_head_ack_queue, + __entry->opcode, + __entry->psn, + __entry->r_psn, + __entry->diff + ) +); + +DEFINE_EVENT(/* event */ + hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err, + TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff), + TP_ARGS(qp, opcode, psn, diff) +); + +DECLARE_EVENT_CLASS(/* sge */ + hfi1_sge_template, + TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge), + TP_ARGS(qp, index, sge), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(int, index) + __field(u64, vaddr) + __field(u32, sge_length) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->index = index; + __entry->vaddr = (u64)sge->vaddr; + __entry->sge_length = sge->sge_length; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u", + __get_str(dev), + __entry->qpn, + __entry->index, + __entry->vaddr, + __entry->sge_length + ) +); + +DEFINE_EVENT(/* event */ + hfi1_sge_template, hfi1_sge_check_align, + TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge), + TP_ARGS(qp, index, sge) +); + +DECLARE_EVENT_CLASS(/* tid_write_sp */ + hfi1_tid_write_rsp_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, r_tid_head) + __field(u32, r_tid_tail) + __field(u32, r_tid_ack) + __field(u32, r_tid_alloc) + __field(u32, alloc_w_segs) + __field(u32, pending_tid_w_segs) + __field(bool, sync_pt) + __field(u32, ps_nak_psn) + __field(u8, ps_nak_state) + __field(u8, prnr_nak_state) + __field(u32, hw_flow_index) + __field(u32, generation) + __field(u32, fpsn) + __field(bool, resync) + __field(u32, r_next_psn_kdeth) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->r_tid_head = priv->r_tid_head; + __entry->r_tid_tail = priv->r_tid_tail; + __entry->r_tid_ack = priv->r_tid_ack; + __entry->r_tid_alloc = priv->r_tid_alloc; + __entry->alloc_w_segs = priv->alloc_w_segs; + __entry->pending_tid_w_segs = priv->pending_tid_w_segs; + __entry->sync_pt = priv->sync_pt; + __entry->ps_nak_psn = priv->s_nak_psn; + __entry->ps_nak_state = priv->s_nak_state; + __entry->prnr_nak_state = priv->rnr_nak_state; + __entry->hw_flow_index = priv->flow_state.index; + __entry->generation = priv->flow_state.generation; + __entry->fpsn = priv->flow_state.psn; + __entry->resync = priv->resync; + __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth; + ), + TP_printk(/* print */ + TID_WRITE_RSPDR_PRN, + __get_str(dev), + __entry->qpn, + __entry->r_tid_head, + __entry->r_tid_tail, + __entry->r_tid_ack, + __entry->r_tid_alloc, + __entry->alloc_w_segs, + __entry->pending_tid_w_segs, + __entry->sync_pt ? "yes" : "no", + __entry->ps_nak_psn, + __entry->ps_nak_state, + __entry->prnr_nak_state, + __entry->hw_flow_index, + __entry->generation, + __entry->fpsn, + __entry->resync ? "yes" : "no", + __entry->r_next_psn_kdeth + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* tid_write_sender */ + hfi1_tid_write_sender_template, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, newreq) + __field(u32, s_tid_cur) + __field(u32, s_tid_tail) + __field(u32, s_tid_head) + __field(u32, pending_tid_w_resp) + __field(u32, n_requests) + __field(u32, n_tid_requests) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + __field(u8, s_state) + __field(u8, s_retry) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->newreq = newreq; + __entry->s_tid_cur = priv->s_tid_cur; + __entry->s_tid_tail = priv->s_tid_tail; + __entry->s_tid_head = priv->s_tid_head; + __entry->pending_tid_w_resp = priv->pending_tid_w_resp; + __entry->n_requests = atomic_read(&priv->n_requests); + __entry->n_tid_requests = atomic_read(&priv->n_tid_requests); + __entry->s_flags = qp->s_flags; + __entry->ps_flags = priv->s_flags; + __entry->iow_flags = priv->s_iowait.flags; + __entry->s_state = priv->s_state; + __entry->s_retry = priv->s_retry; + ), + TP_printk(/* print */ + TID_WRITE_SENDER_PRN, + __get_str(dev), + __entry->qpn, + __entry->newreq, + __entry->s_tid_cur, + __entry->s_tid_tail, + __entry->s_tid_head, + __entry->pending_tid_w_resp, + __entry->n_requests, + __entry->n_tid_requests, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags, + __entry->s_state, + __entry->s_retry + ) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DEFINE_EVENT(/* event */ + hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc, + TP_PROTO(struct rvt_qp *qp, char newreq), + TP_ARGS(qp, newreq) +); + +DECLARE_EVENT_CLASS(/* tid_ack */ + hfi1_tid_ack_template, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, aeth) + __field(u32, psn) + __field(u32, req_psn) + __field(u32, resync_psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->aeth = aeth; + __entry->psn = psn; + __entry->req_psn = req_psn; + __entry->resync_psn = resync_psn; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->aeth, + __entry->psn, + __entry->req_psn, + __entry->resync_psn + ) +); + +DEFINE_EVENT(/* rcv_tid_ack */ + hfi1_tid_ack_template, hfi1_rcv_tid_ack, + TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn, + u32 req_psn, u32 resync_psn), + TP_ARGS(qp, aeth, psn, req_psn, resync_psn) +); + +DECLARE_EVENT_CLASS(/* kdeth_eflags_error */ + hfi1_kdeth_eflags_error_template, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, rcv_type) + __field(u8, rte) + __field(u32, psn) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->rcv_type = rcv_type; + __entry->rte = rte; + __entry->psn = psn; + ), + TP_printk(/* print */ + KDETH_EFLAGS_ERR_PRN, + __get_str(dev), + __entry->qpn, + __entry->rcv_type, + __entry->rte, + __entry->psn + ) +); + +DEFINE_EVENT(/* event */ + hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write, + TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn), + TP_ARGS(qp, rcv_type, rte, psn) +); + +#endif /* __HFI1_TRACE_TID_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tid +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h new file mode 100644 index 0000000000..ed1b9e1e4b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -0,0 +1,1065 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + */ +#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TX_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" +#include "mad.h" +#include "sdma.h" +#include "ipoib.h" +#include "user_sdma.h" + +const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); + +#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tx + +TRACE_EVENT(hfi1_piofree, + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk("[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) +); + +TRACE_EVENT(hfi1_wantpiointr, + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) +); + +DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + __field(u32, ps_flags) + __field(unsigned long, iow_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->ps_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_flags; + __entry->iow_flags = + ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags, + __entry->ps_flags, + __entry->iow_flags + ) +); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +TRACE_EVENT(hfi1_sdma_descriptor, + TP_PROTO(struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), + TP_ARGS(sde, desc0, desc1, e, descp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), + TP_printk( + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & + SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & + SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & + SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) +); + +TRACE_EVENT(hfi1_sdma_engine_select, + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) +); + +TRACE_EVENT(hfi1_sdma_user_free_queues, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt), + TP_ARGS(dd, ctxt, subctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + ), + TP_printk("[%s] SDMA [%u:%u] Freeing user SDMA queues", + __get_str(dev), + __entry->ctxt, + __entry->subctxt + ) +); + +TRACE_EVENT(hfi1_sdma_user_process_request, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx), + TP_ARGS(dd, ctxt, subctxt, comp_idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + ), + TP_printk("[%s] SDMA [%u:%u] Using req/comp entry: %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx + ) +); + +DECLARE_EVENT_CLASS( + hfi1_sdma_value_template, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, u16 comp_idx, + u32 value), + TP_ARGS(dd, ctxt, subctxt, comp_idx, value), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + __field(u32, value) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + __entry->value = value; + ), + TP_printk("[%s] SDMA [%u:%u:%u] value: %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx, + __entry->value + ) +); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_initial_tidoffset, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 tidoffset), + TP_ARGS(dd, ctxt, subctxt, comp_idx, tidoffset)); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_data_length, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 data_len), + TP_ARGS(dd, ctxt, subctxt, comp_idx, data_len)); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_compute_length, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 data_len), + TP_ARGS(dd, ctxt, subctxt, comp_idx, data_len)); + +TRACE_EVENT(hfi1_sdma_user_tid_info, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 tidoffset, u32 units, u8 shift), + TP_ARGS(dd, ctxt, subctxt, comp_idx, tidoffset, units, shift), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + __field(u32, tidoffset) + __field(u32, units) + __field(u8, shift) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + __entry->tidoffset = tidoffset; + __entry->units = units; + __entry->shift = shift; + ), + TP_printk("[%s] SDMA [%u:%u:%u] TID offset %ubytes %uunits om %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx, + __entry->tidoffset, + __entry->units, + __entry->shift + ) +); + +TRACE_EVENT(hfi1_sdma_request, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + unsigned long dim), + TP_ARGS(dd, ctxt, subctxt, dim), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(unsigned long, dim) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->dim = dim; + ), + TP_printk("[%s] SDMA from %u:%u (%lu)", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->dim + ) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, status) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk("[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) +); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#else +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#endif + +DECLARE_EVENT_CLASS(hfi1_sdma_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn) +); + +#define USDMA_HDR_FORMAT \ + "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" + +TRACE_EVENT(hfi1_sdma_user_header, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + struct hfi1_pkt_header *hdr, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u32, pbc0) + __field(u32, pbc1) + __field(u32, lrh0) + __field(u32, lrh1) + __field(u32, bth0) + __field(u32, bth1) + __field(u32, bth2) + __field(u32, kdeth0) + __field(u32, kdeth1) + __field(u32, kdeth2) + __field(u32, kdeth3) + __field(u32, kdeth4) + __field(u32, kdeth5) + __field(u32, kdeth6) + __field(u32, kdeth7) + __field(u32, kdeth8) + __field(u32, tidval) + ), + TP_fast_assign( + __le32 *pbc = (__le32 *)hdr->pbc; + __be32 *lrh = (__be32 *)hdr->lrh; + __be32 *bth = (__be32 *)hdr->bth; + __le32 *kdeth = (__le32 *)&hdr->kdeth; + + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->pbc0 = le32_to_cpu(pbc[0]); + __entry->pbc1 = le32_to_cpu(pbc[1]); + __entry->lrh0 = be32_to_cpu(lrh[0]); + __entry->lrh1 = be32_to_cpu(lrh[1]); + __entry->bth0 = be32_to_cpu(bth[0]); + __entry->bth1 = be32_to_cpu(bth[1]); + __entry->bth2 = be32_to_cpu(bth[2]); + __entry->kdeth0 = le32_to_cpu(kdeth[0]); + __entry->kdeth1 = le32_to_cpu(kdeth[1]); + __entry->kdeth2 = le32_to_cpu(kdeth[2]); + __entry->kdeth3 = le32_to_cpu(kdeth[3]); + __entry->kdeth4 = le32_to_cpu(kdeth[4]); + __entry->kdeth5 = le32_to_cpu(kdeth[5]); + __entry->kdeth6 = le32_to_cpu(kdeth[6]); + __entry->kdeth7 = le32_to_cpu(kdeth[7]); + __entry->kdeth8 = le32_to_cpu(kdeth[8]); + __entry->tidval = tidval; + ), + TP_printk(USDMA_HDR_FORMAT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->pbc1, + __entry->pbc0, + __entry->lrh0, + __entry->lrh1, + __entry->bth0, + __entry->bth1, + __entry->bth2, + __entry->kdeth0, + __entry->kdeth1, + __entry->kdeth2, + __entry->kdeth3, + __entry->kdeth4, + __entry->kdeth5, + __entry->kdeth6, + __entry->kdeth7, + __entry->kdeth8, + __entry->tidval + ) +); + +#define SDMA_UREQ_FMT \ + "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" +TRACE_EVENT(hfi1_sdma_user_reqinfo, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), + TP_ARGS(dd, ctxt, subctxt, i), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u8, ver_opcode) + __field(u8, iovcnt) + __field(u16, npkts) + __field(u16, fragsize) + __field(u16, comp_idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->ver_opcode = i[0] & 0xff; + __entry->iovcnt = (i[0] >> 8) & 0xff; + __entry->npkts = i[1]; + __entry->fragsize = i[2]; + __entry->comp_idx = i[3]; + ), + TP_printk(SDMA_UREQ_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->ver_opcode, + __entry->iovcnt, + __entry->npkts, + __entry->fragsize, + __entry->comp_idx + ) +); + +#define usdma_complete_name(st) { st, #st } +#define show_usdma_complete_state(st) \ + __print_symbolic(st, \ + usdma_complete_name(FREE), \ + usdma_complete_name(QUEUED), \ + usdma_complete_name(COMPLETE), \ + usdma_complete_name(ERROR)) + +TRACE_EVENT(hfi1_sdma_user_completion, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, + u8 state, int code), + TP_ARGS(dd, ctxt, subctxt, idx, state, code), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, idx) + __field(u8, state) + __field(int, code) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->idx = idx; + __entry->state = state; + __entry->code = code; + ), + TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", + __get_str(dev), __entry->ctxt, __entry->subctxt, + __entry->idx, show_usdma_complete_state(__entry->state), + __entry->code) +); + +TRACE_EVENT(hfi1_usdma_defer, + TP_PROTO(struct hfi1_user_sdma_pkt_q *pq, + struct sdma_engine *sde, + struct iowait *wait), + TP_ARGS(pq, sde, wait), + TP_STRUCT__entry(DD_DEV_ENTRY(pq->dd) + __field(struct hfi1_user_sdma_pkt_q *, pq) + __field(struct sdma_engine *, sde) + __field(struct iowait *, wait) + __field(int, engine) + __field(int, empty) + ), + TP_fast_assign(DD_DEV_ASSIGN(pq->dd); + __entry->pq = pq; + __entry->sde = sde; + __entry->wait = wait; + __entry->engine = sde->this_idx; + __entry->empty = list_empty(&__entry->wait->list); + ), + TP_printk("[%s] pq %llx sde %llx wait %llx engine %d empty %d", + __get_str(dev), + (unsigned long long)__entry->pq, + (unsigned long long)__entry->sde, + (unsigned long long)__entry->wait, + __entry->engine, + __entry->empty + ) +); + +TRACE_EVENT(hfi1_usdma_activate, + TP_PROTO(struct hfi1_user_sdma_pkt_q *pq, + struct iowait *wait, + int reason), + TP_ARGS(pq, wait, reason), + TP_STRUCT__entry(DD_DEV_ENTRY(pq->dd) + __field(struct hfi1_user_sdma_pkt_q *, pq) + __field(struct iowait *, wait) + __field(int, reason) + ), + TP_fast_assign(DD_DEV_ASSIGN(pq->dd); + __entry->pq = pq; + __entry->wait = wait; + __entry->reason = reason; + ), + TP_printk("[%s] pq %llx wait %llx reason %d", + __get_str(dev), + (unsigned long long)__entry->pq, + (unsigned long long)__entry->wait, + __entry->reason + ) +); + +TRACE_EVENT(hfi1_usdma_we, + TP_PROTO(struct hfi1_user_sdma_pkt_q *pq, + int we_ret), + TP_ARGS(pq, we_ret), + TP_STRUCT__entry(DD_DEV_ENTRY(pq->dd) + __field(struct hfi1_user_sdma_pkt_q *, pq) + __field(int, state) + __field(int, we_ret) + ), + TP_fast_assign(DD_DEV_ASSIGN(pq->dd); + __entry->pq = pq; + __entry->state = pq->state; + __entry->we_ret = we_ret; + ), + TP_printk("[%s] pq %llx state %d we_ret %d", + __get_str(dev), + (unsigned long long)__entry->pq, + __entry->state, + __entry->we_ret + ) +); + +const char *print_u32_array(struct trace_seq *, u32 *, int); +#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) + +TRACE_EVENT(hfi1_sdma_user_header_ahg, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u8, sde) + __field(u8, idx) + __field(int, len) + __field(u32, tidval) + __array(u32, ahg, 10) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->sde = sde; + __entry->idx = ahgidx; + __entry->len = len; + __entry->tidval = tidval; + memcpy(__entry->ahg, ahg, len * sizeof(u32)); + ), + TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->sde, + __entry->idx, + __entry->len - 1, + __print_u32_hex(__entry->ahg, __entry->len), + __entry->tidval + ) +); + +TRACE_EVENT(hfi1_sdma_state, + TP_PROTO(struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + __string(newstate, nstate) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), + TP_printk("[%s] current state %s new state %s", + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) +); + +#define BCT_FORMAT \ + "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" + +#define BCT(field) \ + be16_to_cpu( \ + ((struct buffer_control *)__get_dynamic_array(bct))->field \ + ) + +DECLARE_EVENT_CLASS(hfi1_bct_template, + TP_PROTO(struct hfi1_devdata *dd, + struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + memcpy(__get_dynamic_array(bct), bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), + + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) +); + +DEFINE_EVENT(hfi1_bct_template, bct_set, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +DEFINE_EVENT(hfi1_bct_template, bct_get, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +TRACE_EVENT( + hfi1_qp_send_completion, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx), + TP_ARGS(qp, wqe, idx), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(struct rvt_swqe *, wqe) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, length) + __field(u32, idx) + __field(u32, ssn) + __field(enum ib_wr_opcode, opcode) + __field(int, send_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->length = wqe->length; + __entry->idx = idx; + __entry->ssn = wqe->ssn; + __entry->opcode = wqe->wr.opcode; + __entry->send_flags = wqe->wr.send_flags; + ), + TP_printk( + "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x", + __get_str(dev), + __entry->qpn, + __entry->qpt, + __entry->wqe, + __entry->idx, + __entry->wr_id, + __entry->length, + __entry->ssn, + __entry->opcode, + __entry->send_flags + ) +); + +DECLARE_EVENT_CLASS( + hfi1_do_send_template, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(bool, flag) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->flag = flag; + ), + TP_printk( + "[%s] qpn %x flag %d", + __get_str(dev), + __entry->qpn, + __entry->flag + ) +); + +DEFINE_EVENT( + hfi1_do_send_template, hfi1_rc_do_send, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + +DEFINE_EVENT(/* event */ + hfi1_do_send_template, hfi1_rc_do_tid_send, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + +DEFINE_EVENT( + hfi1_do_send_template, hfi1_rc_expired_time_slice, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + +DECLARE_EVENT_CLASS(/* AIP */ + hfi1_ipoib_txq_template, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(txq->priv->dd) + __field(struct hfi1_ipoib_txq *, txq) + __field(struct sdma_engine *, sde) + __field(ulong, head) + __field(ulong, tail) + __field(uint, used) + __field(uint, flow) + __field(int, stops) + __field(int, no_desc) + __field(u8, idx) + __field(u8, stopped) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(txq->priv->dd); + __entry->txq = txq; + __entry->sde = txq->sde; + __entry->head = txq->tx_ring.head; + __entry->tail = txq->tx_ring.tail; + __entry->idx = txq->q_idx; + __entry->used = + txq->tx_ring.sent_txreqs - + txq->tx_ring.complete_txreqs; + __entry->flow = txq->flow.as_int; + __entry->stops = atomic_read(&txq->tx_ring.stops); + __entry->no_desc = atomic_read(&txq->tx_ring.no_desc); + __entry->stopped = + __netif_subqueue_stopped(txq->priv->netdev, txq->q_idx); + ), + TP_printk(/* print */ + "[%s] txq %llx idx %u sde %llx:%u cpu %d head %lx tail %lx flow %x used %u stops %d no_desc %d stopped %u", + __get_str(dev), + (unsigned long long)__entry->txq, + __entry->idx, + (unsigned long long)__entry->sde, + __entry->sde ? __entry->sde->this_idx : 0, + __entry->sde ? __entry->sde->cpu : 0, + __entry->head, + __entry->tail, + __entry->flow, + __entry->used, + __entry->stops, + __entry->no_desc, + __entry->stopped + ) +); + +DEFINE_EVENT(/* queue stop */ + hfi1_ipoib_txq_template, hfi1_txq_stop, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* queue wake */ + hfi1_ipoib_txq_template, hfi1_txq_wake, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* flow flush */ + hfi1_ipoib_txq_template, hfi1_flow_flush, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* flow switch */ + hfi1_ipoib_txq_template, hfi1_flow_switch, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* wakeup */ + hfi1_ipoib_txq_template, hfi1_txq_wakeup, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* full */ + hfi1_ipoib_txq_template, hfi1_txq_full, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* queued */ + hfi1_ipoib_txq_template, hfi1_txq_queued, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* xmit_stopped */ + hfi1_ipoib_txq_template, hfi1_txq_xmit_stopped, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* xmit_unstopped */ + hfi1_ipoib_txq_template, hfi1_txq_xmit_unstopped, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DECLARE_EVENT_CLASS(/* AIP */ + hfi1_ipoib_tx_template, + TP_PROTO(struct ipoib_txreq *tx, u32 idx), + TP_ARGS(tx, idx), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(tx->txq->priv->dd) + __field(struct ipoib_txreq *, tx) + __field(struct hfi1_ipoib_txq *, txq) + __field(struct sk_buff *, skb) + __field(ulong, idx) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(tx->txq->priv->dd); + __entry->tx = tx; + __entry->skb = tx->skb; + __entry->txq = tx->txq; + __entry->idx = idx; + ), + TP_printk(/* print */ + "[%s] tx %llx txq %llx,%u skb %llx idx %lu", + __get_str(dev), + (unsigned long long)__entry->tx, + (unsigned long long)__entry->txq, + __entry->txq ? __entry->txq->q_idx : 0, + (unsigned long long)__entry->skb, + __entry->idx + ) +); + +DEFINE_EVENT(/* produce */ + hfi1_ipoib_tx_template, hfi1_tx_produce, + TP_PROTO(struct ipoib_txreq *tx, u32 idx), + TP_ARGS(tx, idx) +); + +DEFINE_EVENT(/* consume */ + hfi1_ipoib_tx_template, hfi1_tx_consume, + TP_PROTO(struct ipoib_txreq *tx, u32 idx), + TP_ARGS(tx, idx) +); + +DEFINE_EVENT(/* alloc_tx */ + hfi1_ipoib_txq_template, hfi1_txq_alloc_tx, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* poll */ + hfi1_ipoib_txq_template, hfi1_txq_poll, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +DEFINE_EVENT(/* complete */ + hfi1_ipoib_txq_template, hfi1_txq_complete, + TP_PROTO(struct hfi1_ipoib_txq *txq), + TP_ARGS(txq) +); + +#endif /* __HFI1_TRACE_TX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tx +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c new file mode 100644 index 0000000000..4e9d6aa393 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) UC_OP(x) + +/** + * hfi1_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * @ps: the current packet state + * + * Assume s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rvt_swqe *wqe; + u32 hwords; + u32 bth0 = 0; + u32 len; + u32 pmtu = qp->pmtu; + int middle = 0; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (!ps->s_txreq) + goto bail_no_tx; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done_free_tx; + } + + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } else { + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + } + + /* Get the next send request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == READ_ONCE(qp->s_head)) { + clear_ahg(qp); + goto bail; + } + /* + * Local operations are processed immediately + * after all prior requests have completed. + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + err = rvt_invalidate_rkey( + qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } + rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + if (local_ops) + atomic_dec(&qp->local_ops_pending); + goto done_free_tx; + } + /* + * Start a new request. + */ + qp->s_psn = wqe->psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_ONLY); + } else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_ONLY); + } else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + fallthrough; + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_LAST); + } else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + fallthrough; + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_LAST); + } else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = &qp->s_sge; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + qp->remote_qpn, mask_psn(qp->s_psn++), + middle, ps); + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + return 0; +} + +/** + * hfi1_uc_rcv - handle an incoming UC packet + * @packet: the packet structure + * + * This is called from qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_uc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct ib_other_headers *ohdr = packet->ohdr; + u32 opcode = packet->opcode; + u32 hdrsize = packet->hlen; + u32 psn; + u32 pad = packet->pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int ret; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + process_ecn(qp, packet); + + psn = ib_bth_get_psn(ohdr); + /* Compare the PSN verses the expected PSN. */ + if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else { + rvt_put_ss(&qp->r_sge); + } + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + qp->r_sge = qp->s_rdma_read_sge; + } else { + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + fallthrough; + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + /* + * There will be no padding for 9B packet but 16B packets + * will come in with some padding since we always add + * CRC and LT bytes which will need to be flit aligned + */ + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + extra_bytes); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); + rvt_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) { + goto rdma_last; + } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + fallthrough; + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + wc.wc_flags = IB_WC_WITH_IMM; + + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + extra_bytes); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + rvt_put_ss(&qp->s_rdma_read_sge); + } else { + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); + rvt_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + extra_bytes); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); + rvt_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->rvp.n_pkt_drops++; + return; + +op_err: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); +} diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c new file mode 100644 index 0000000000..b64b9d7e08 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -0,0 +1,1023 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2019 Intel Corporation. + */ + +#include <linux/net.h> +#include <rdma/ib_smi.h> + +#include "hfi.h" +#include "mad.h" +#include "verbs_txreq.h" +#include "trace_ibhdrs.h" +#include "qp.h" + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_req hfi1_make_ud_req_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ud_req_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ud_req_16B +}; + +/** + * ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from hfi1_make_ud_req() to forward a WQE addressed + * to the same HFI. + * Note that the receive interrupt handler may be calling hfi1_ud_rcv() + * while this is being called. + */ +static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct hfi1_pportdata *ppd; + struct hfi1_qp_priv *priv = sqp->priv; + struct rvt_qp *qp; + struct rdma_ah_attr *ah_attr; + unsigned long flags; + struct rvt_sge_state ssge; + struct rvt_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + rvt_get_swqe_remote_qpn(swqe)); + if (!qp) { + ibp->rvp.n_pkt_drops++; + rcu_read_unlock(); + return; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto drop; + } + + ah_attr = rvt_get_swqe_ah_attr(swqe); + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey; + u32 slid; + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + + pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + if (unlikely(ingress_pkey_check(ppd, pkey, sc5, + qp->s_pkey_index, + slid, false))) { + hfi1_bad_pkey(ibp, pkey, + rdma_ah_get_sl(ah_attr), + sqp->ibqp.qp_num, qp->ibqp.qp_num, + slid, rdma_ah_get_dlid(ah_attr)); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ? + sqp->qkey : rvt_get_swqe_remote_qkey(swqe); + if (unlikely(qkey != qp->qkey)) + goto drop; /* silently drop per IBTA spec */ + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof(wc)); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { + int ret; + + ret = rvt_get_rwqe(qp, false); + if (ret < 0) { + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + ibp->rvp.n_pkt_drops++; + goto bail_unlock; + } + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + struct ib_grh grh; + struct ib_global_route grd = *(rdma_ah_read_grh(ah_attr)); + + /* + * For loopback packets with extended LIDs, the + * sgid_index in the GRH is 0 and the dgid is + * OPA GID of the sender. While creating a response + * to the loopback packet, IB core creates the new + * sgid_index from the DGID and that will be the + * OPA_GID_INDEX. The new dgid is from the sgid + * index and that will be in the IB GID format. + * + * We now have a case where the sent packet had a + * different sgid_index and dgid compared to the + * one that was received in response. + * + * Fix this inconsistency. + */ + if (priv->hdr_type == HFI1_PKT_TYPE_16B) { + if (grd.sgid_index == 0) + grd.sgid_index = OPA_GID_INDEX; + + if (ib_is_opa_gid(&grd.dgid)) + grd.dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); + } + + hfi1_make_grh(ibp, &grh, &grd, 0, 0); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else { + rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); + } + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); + rvt_update_sge(&ssge, len, false); + length -= len; + } + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { + if (sqp->ibqp.qp_type == IB_QPT_GSI || + sqp->ibqp.qp_type == IB_QPT_SMI) + wc.pkey_index = rvt_get_swqe_pkey_index(swqe); + else + wc.pkey_index = sqp->s_pkey_index; + } else { + wc.pkey_index = 0; + } + wc.slid = (ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1))) & U16_MAX; + /* Check for loopback when the port lid is not set */ + if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) + wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); + wc.sl = rdma_ah_get_sl(ah_attr); + wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->rvp.n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + rcu_read_unlock(); +} + +static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u16 *pkey, u32 extra_bytes, bool bypass) +{ + u32 bth0; + struct hfi1_ibport *ibp; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else { + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } + + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + *pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe)); + else + *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + if (!bypass) + bth0 |= *pkey; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe)); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = + cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey : + rvt_get_swqe_remote_qkey(wqe)); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); +} + +void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + u32 nwords, extra_bytes; + u16 len, slid, dlid, pkey; + u16 lrh0 = 0; + u8 sc5; + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct ib_grh *grh; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = rvt_get_swqe_ah_attr(wqe); + + extra_bytes = -wqe->length & 3; + nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + ps->s_txreq->hdr_dwords = 7; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + ps->s_txreq->hdr_dwords++; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + ps->s_txreq->hdr_dwords += + hfi1_make_grh(ibp, grh, rdma_ah_read_grh(ah_attr), + ps->s_txreq->hdr_dwords - LRH_9B_DWORDS, + nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + priv->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + priv->s_sc = sc5; + } + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, false); + len = ps->s_txreq->hdr_dwords + nwords; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, len, dlid, slid); +} + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid, slid, nwords, extra_bytes; + u32 dest_qp = rvt_get_swqe_remote_qpn(wqe); + u32 src_qp = qp->ibqp.qp_num; + u16 len, pkey; + u8 l4, sc5; + bool is_mgmt = false; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = rvt_get_swqe_ah_attr(wqe); + + /* + * Build 16B Management Packet if either the destination + * or source queue pair number is 0 or 1. + */ + if (dest_qp == 0 || src_qp == 0 || dest_qp == 1 || src_qp == 1) { + /* header size in dwords 16B LRH+L4_FM = (16+8)/4. */ + ps->s_txreq->hdr_dwords = 6; + is_mgmt = true; + } else { + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + ps->s_txreq->hdr_dwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + ps->s_txreq->hdr_dwords++; + } + + /* SW provides space for CRC and LT for bypass packets. */ + extra_bytes = hfi1_get_16b_padding((ps->s_txreq->hdr_dwords << 2), + wqe->length); + nwords = ((wqe->length + extra_bytes + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + + if ((rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = rdma_ah_retrieve_grh(ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) { + dd_dev_warn(ppd->dd, "Bad sgid_index. sgid_index: %d\n", + grd->sgid_index); + grd->sgid_index = 0; + } + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + ps->s_txreq->hdr_dwords += hfi1_make_grh( + ibp, grh, grd, + ps->s_txreq->hdr_dwords - LRH_16B_DWORDS, + nwords); + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } else { + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + if (qp->ibqp.qp_type == IB_QPT_SMI) + priv->s_sc = 0xf; + else + priv->s_sc = sc5; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 16B); + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + + if (is_mgmt) { + l4 = OPA_16B_L4_FM; + pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe)); + hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, + dest_qp, src_qp); + } else { + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + } + /* Convert dwords to flits */ + len = (ps->s_txreq->hdr_dwords + nwords) >> 1; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_16B; + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, dlid, len, pkey, 0, 0, l4, priv->s_sc); +} + +/** + * hfi1_make_ud_req - construct a UD request packet + * @qp: the QP + * @ps: the current packet state + * + * Assume s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct rvt_swqe *wqe; + int next_cur; + u32 lid; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (!ps->s_txreq) + goto bail_no_tx; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done_free_tx; + } + + /* see post_one_send() */ + if (qp->s_cur == READ_ONCE(qp->s_head)) + goto bail; + + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = rvt_get_swqe_ah_attr(wqe); + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); + if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || + (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { + lid = rdma_ah_get_dlid(ah_attr) & ~((1 << ppd->lmc) - 1); + if (unlikely(!loopback && + ((lid == ppd->lid) || + ((lid == be32_to_cpu(OPA_LID_PERMISSIVE)) && + (qp->ibqp.qp_type == IB_QPT_GSI))))) { + unsigned long tflags = ps->flags; + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, tflags); + ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, tflags); + ps->flags = tflags; + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done_free_tx; + } + } + + qp->s_cur = next_cur; + ps->s_txreq->s_cur_size = wqe->length; + ps->s_txreq->ss = &qp->s_sge; + qp->s_srate = rdma_ah_get_static_rate(ah_attr); + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + /* Make the appropriate header */ + hfi1_make_ud_req_tbl[priv->hdr_type](qp, ps, qp->s_wqe); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + ps->s_txreq->sde = priv->s_sde; + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + ps->s_txreq->psc = priv->s_sendcontext; + /* disarm any ahg */ + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + priv->s_ahg->tx_flags = 0; + + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + return 0; +} + +/* + * Hardware can't check this so we do it here. + * + * This is a slightly different algorithm than the standard pkey check. It + * special cases the management keys and allows for 0x7fff and 0xffff to be in + * the table at the same time. + * + * @returns the index found or -1 if not found + */ +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + unsigned i; + + if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) { + unsigned lim_idx = -1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) { + /* here we look for an exact match */ + if (ppd->pkeys[i] == pkey) + return i; + if (ppd->pkeys[i] == LIM_MGMT_P_KEY) + lim_idx = i; + } + + /* did not find 0xffff return 0x7fff idx if found */ + if (pkey == FULL_MGMT_P_KEY) + return lim_idx; + + /* no match... */ + return -1; + } + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) + if ((ppd->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + */ + return -1; +} + +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u16 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 7; + u16 len; + u8 l4; + struct hfi1_opa_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nwords; + + hdr.hdr_type = HFI1_PKT_TYPE_16B; + /* Populate length */ + nwords = ((hfi1_get_16b_padding(hwords << 2, 0) + + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + if (old_grh) { + struct ib_grh *grh = &hdr.opah.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16( + (hwords - LRH_16B_DWORDS + nwords) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.opah.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.opah.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + /* BIT 16 to 19 is TVER. Bit 20 to 22 is pad cnt */ + bth0 = (IB_OPCODE_CNP << 24) | (1 << 16) | + (hfi1_get_16b_padding(hwords << 2, 0) << 20); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn); + ohdr->bth[2] = 0; /* PSN 0 */ + + /* Convert dwords to flits */ + len = (hwords + nwords) >> 1; + hfi1_make_16b_hdr(&hdr.opah, slid, dlid, len, pkey, 1, 0, l4, sc5); + + plen = 2 /* PBC */ + hwords + nwords; + pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (!IS_ERR_OR_NULL(pbuf)) { + trace_pio_output_ibhdr(ppd->dd, &hdr, sc5); + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } + } +} + +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u16 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 5; + u16 lrh0; + u8 sl = ibp->sc_to_sl[sc5]; + struct hfi1_opa_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + hdr.hdr_type = HFI1_PKT_TYPE_9B; + if (old_grh) { + struct ib_grh *grh = &hdr.ibh.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16( + (hwords - LRH_9B_DWORDS + SIZE_OF_CRC) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.ibh.u.l.oth; + lrh0 = HFI1_LRH_GRH; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.ibh.u.oth; + lrh0 = HFI1_LRH_BTH; + } + + lrh0 |= (sc5 & 0xf) << 12 | sl << 4; + + bth0 = pkey | (IB_OPCODE_CNP << 24); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT)); + ohdr->bth[2] = 0; /* PSN 0 */ + + hfi1_make_ib_hdr(&hdr.ibh, lrh0, hwords + SIZE_OF_CRC, dlid, slid); + plen = 2 /* PBC */ + hwords; + pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (!IS_ERR_OR_NULL(pbuf)) { + trace_pio_output_ibhdr(ppd->dd, &hdr, sc5); + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } + } +} + +/* + * opa_smp_check() - Do the regular pkey checking, and the additional + * checks for SMPs specified in OPAv1 rev 1.0, 9/19/2016 update, section + * 9.10.25 ("SMA Packet Checks"). + * + * Note that: + * - Checks are done using the pkey directly from the packet's BTH, + * and specifically _not_ the pkey that we attach to the completion, + * which may be different. + * - These checks are specifically for "non-local" SMPs (i.e., SMPs + * which originated on another node). SMPs which are sent from, and + * destined to this node are checked in opa_local_smp_check(). + * + * At the point where opa_smp_check() is called, we know: + * - destination QP is QP0 + * + * opa_smp_check() returns 0 if all checks succeed, 1 otherwise. + */ +static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, + struct rvt_qp *qp, u16 slid, struct opa_smp *smp) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + /* + * I don't think it's possible for us to get here with sc != 0xf, + * but check it to be certain. + */ + if (sc5 != 0xf) + return 1; + + if (rcv_pkey_check(ppd, pkey, sc5, slid)) + return 1; + + /* + * At this point we know (and so don't need to check again) that + * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY + * (see ingress_pkey_check). + */ + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + + /* + * SMPs fall into one of four (disjoint) categories: + * SMA request, SMA response, SMA trap, or SMA trap repress. + * Our response depends, in part, on which type of SMP we're + * processing. + * + * If this is an SMA response, skip the check here. + * + * If this is an SMA request or SMA trap repress: + * - pkey != FULL_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + * + * Otherwise: + * - accept if the port is running an SM + * - drop MAD if it's an SMA trap + * - pkey == FULL_MGMT_P_KEY => + * reply with unsupported method + * - pkey != FULL_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + */ + switch (smp->method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_MGMT_METHOD_REPORT_RESP: + break; + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (pkey != FULL_MGMT_P_KEY) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + break; + default: + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return 0; + if (smp->method == IB_MGMT_METHOD_TRAP) + return 1; + if (pkey == FULL_MGMT_P_KEY) { + smp->status |= IB_SMP_UNSUP_METHOD; + return 0; + } + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + return 0; +} + +/** + * hfi1_ud_rcv - receive an incoming UD packet + * @packet: the packet structure + * + * This is called from qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_ud_rcv(struct hfi1_packet *packet) +{ + u32 hdrsize = packet->hlen; + struct ib_wc wc; + u32 src_qp; + u16 pkey; + int mgmt_pkey_idx = -1; + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + u8 sc5 = packet->sc; + u8 sl_from_sc; + u8 opcode = packet->opcode; + u8 sl = packet->sl; + u32 dlid = packet->dlid; + u32 slid = packet->slid; + u8 extra_bytes; + u8 l4 = 0; + bool dlid_is_permissive; + bool slid_is_permissive; + bool solicited = false; + + extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + u32 permissive_lid = + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + + l4 = hfi1_16B_get_l4(packet->hdr); + pkey = hfi1_16B_get_pkey(packet->hdr); + dlid_is_permissive = (dlid == permissive_lid); + slid_is_permissive = (slid == permissive_lid); + } else { + pkey = ib_bth_get_pkey(packet->ohdr); + dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); + slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); + } + sl_from_sc = ibp->sc_to_sl[sc5]; + + if (likely(l4 != OPA_16B_L4_FM)) { + src_qp = ib_get_sqpn(packet->ohdr); + solicited = ib_bth_is_solicited(packet->ohdr); + } else { + src_qp = hfi1_16B_get_src_qpn(packet->mgmt); + } + + process_ecn(qp, packet); + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto drop; + + tlen -= hdrsize + extra_bytes; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(dlid_is_permissive || slid_is_permissive)) + goto drop; + if (qp->ibqp.qp_num > 1) { + if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { + /* + * Traps will not be sent for packets dropped + * by the HW. This is fine, as sending trap + * for invalid pkeys is optional according to + * IB spec (release 1.3, section 10.9.4) + */ + hfi1_bad_pkey(ibp, + pkey, sl, + src_qp, qp->ibqp.qp_num, + slid, dlid); + return; + } + } else { + /* GSI packet */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + } + if (unlikely(l4 != OPA_16B_L4_FM && + ib_get_qkey(packet->ohdr) != qp->qkey)) + return; /* Silent drop */ + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen > 2048 || (sc5 == 0xF)))) + goto drop; + } else { + /* Received on QP0, and so by definition, this is an SMP */ + struct opa_smp *smp = (struct opa_smp *)data; + + if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp)) + goto drop; + + if (tlen > 2048) + goto drop; + if ((dlid_is_permissive || slid_is_permissive) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + + /* look up SMI pkey */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + } + + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = packet->ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { + int ret; + + ret = rvt_get_rwqe(qp, false); + if (ret < 0) { + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + goto drop; + } + if (packet->grh) { + rvt_copy_sge(qp, &qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { + struct ib_grh grh; + /* + * Assuming we only created 16B on the send side + * if we want to use large LIDs, since GRH was stripped + * out when creating 16B, add back the GRH here. + */ + hfi1_make_ext_grh(packet, &grh, slid, dlid); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else { + rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); + } + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + + if (qp->ibqp.qp_type == IB_QPT_GSI || + qp->ibqp.qp_type == IB_QPT_SMI) { + if (mgmt_pkey_idx < 0) { + if (net_ratelimit()) { + struct hfi1_devdata *dd = ppd->dd; + + dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n", + qp->ibqp.qp_type); + mgmt_pkey_idx = 0; + } + } + wc.pkey_index = (unsigned)mgmt_pkey_idx; + } else { + wc.pkey_index = 0; + } + if (slid_is_permissive) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + wc.slid = slid & U16_MAX; + wc.sl = sl_from_sc; + + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = hfi1_check_mcast(dlid) ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_recv_cq(qp, &wc, solicited); + return; + +drop: + ibp->rvp.n_pkt_drops++; +} diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c new file mode 100644 index 0000000000..96058baf36 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -0,0 +1,966 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2015-2018 Intel Corporation. + */ +#include <asm/page.h> +#include <linux/string.h> + +#include "mmu_rb.h" +#include "user_exp_rcv.h" +#include "trace.h" + +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, + struct hfi1_filedata *fd); +static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages); +static int set_rcvarray_entry(struct hfi1_filedata *fd, + struct tid_user_buf *tbuf, + u32 rcventry, struct tid_group *grp, + u16 pageidx, unsigned int npages); +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode); +static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq); +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq); +static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, + struct tid_group *grp, u16 count, + u32 *tidlist, unsigned int *tididx, + unsigned int *pmapped); +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo); +static void __clear_tid_node(struct hfi1_filedata *fd, + struct tid_rb_node *node); +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); + +static const struct mmu_interval_notifier_ops tid_mn_ops = { + .invalidate = tid_rb_invalidate, +}; +static const struct mmu_interval_notifier_ops tid_cover_ops = { + .invalidate = tid_cover_invalidate, +}; + +/* + * Initialize context and file private data needed for Expected + * receive caching. This needs to be done after the context has + * been configured with the eager/expected RcvEntry counts. + */ +int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + int ret = 0; + + fd->entry_to_rb = kcalloc(uctxt->expected_count, + sizeof(struct rb_node *), + GFP_KERNEL); + if (!fd->entry_to_rb) + return -ENOMEM; + + if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) { + fd->invalid_tid_idx = 0; + fd->invalid_tids = kcalloc(uctxt->expected_count, + sizeof(*fd->invalid_tids), + GFP_KERNEL); + if (!fd->invalid_tids) { + kfree(fd->entry_to_rb); + fd->entry_to_rb = NULL; + return -ENOMEM; + } + fd->use_mn = true; + } + + /* + * PSM does not have a good way to separate, count, and + * effectively enforce a limit on RcvArray entries used by + * subctxts (when context sharing is used) when TID caching + * is enabled. To help with that, we calculate a per-process + * RcvArray entry share and enforce that. + * If TID caching is not in use, PSM deals with usage on its + * own. In that case, we allow any subctxt to take all of the + * entries. + * + * Make sure that we set the tid counts only after successful + * init. + */ + spin_lock(&fd->tid_lock); + if (uctxt->subctxt_cnt && fd->use_mn) { + u16 remainder; + + fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; + remainder = uctxt->expected_count % uctxt->subctxt_cnt; + if (remainder && fd->subctxt < remainder) + fd->tid_limit++; + } else { + fd->tid_limit = uctxt->expected_count; + } + spin_unlock(&fd->tid_lock); + + return ret; +} + +void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + mutex_lock(&uctxt->exp_mutex); + if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) + unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd); + if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) + unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd); + mutex_unlock(&uctxt->exp_mutex); + + kfree(fd->invalid_tids); + fd->invalid_tids = NULL; + + kfree(fd->entry_to_rb); + fd->entry_to_rb = NULL; +} + +/* + * Release pinned receive buffer pages. + * + * @mapped: true if the pages have been DMA mapped. false otherwise. + * @idx: Index of the first page to unpin. + * @npages: No of pages to unpin. + * + * If the pages have been DMA mapped (indicated by mapped parameter), their + * info will be passed via a struct tid_rb_node. If they haven't been mapped, + * their info will be passed via a struct tid_user_buf. + */ +static void unpin_rcv_pages(struct hfi1_filedata *fd, + struct tid_user_buf *tidbuf, + struct tid_rb_node *node, + unsigned int idx, + unsigned int npages, + bool mapped) +{ + struct page **pages; + struct hfi1_devdata *dd = fd->uctxt->dd; + struct mm_struct *mm; + + if (mapped) { + dma_unmap_single(&dd->pcidev->dev, node->dma_addr, + node->npages * PAGE_SIZE, DMA_FROM_DEVICE); + pages = &node->pages[idx]; + mm = mm_from_tid_node(node); + } else { + pages = &tidbuf->pages[idx]; + mm = current->mm; + } + hfi1_release_user_pages(mm, pages, npages, mapped); + fd->tid_n_pinned -= npages; +} + +/* + * Pin receive buffer pages. + */ +static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) +{ + int pinned; + unsigned int npages = tidbuf->npages; + unsigned long vaddr = tidbuf->vaddr; + struct page **pages = NULL; + struct hfi1_devdata *dd = fd->uctxt->dd; + + if (npages > fd->uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + /* + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. + */ + if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) { + kfree(pages); + return -ENOMEM; + } + + pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages); + if (pinned <= 0) { + kfree(pages); + return pinned; + } + tidbuf->pages = pages; + fd->tid_n_pinned += pinned; + return pinned; +} + +/* + * RcvArray entry allocation for Expected Receives is done by the + * following algorithm: + * + * The context keeps 3 lists of groups of RcvArray entries: + * 1. List of empty groups - tid_group_list + * This list is created during user context creation and + * contains elements which describe sets (of 8) of empty + * RcvArray entries. + * 2. List of partially used groups - tid_used_list + * This list contains sets of RcvArray entries which are + * not completely used up. Another mapping request could + * use some of all of the remaining entries. + * 3. List of full groups - tid_full_list + * This is the list where sets that are completely used + * up go. + * + * An attempt to optimize the usage of RcvArray entries is + * made by finding all sets of physically contiguous pages in a + * user's buffer. + * These physically contiguous sets are further split into + * sizes supported by the receive engine of the HFI. The + * resulting sets of pages are stored in struct tid_pageset, + * which describes the sets as: + * * .count - number of pages in this set + * * .idx - starting index into struct page ** array + * of this set + * + * From this point on, the algorithm deals with the page sets + * described above. The number of pagesets is divided by the + * RcvArray group size to produce the number of full groups + * needed. + * + * Groups from the 3 lists are manipulated using the following + * rules: + * 1. For each set of 8 pagesets, a complete group from + * tid_group_list is taken, programmed, and moved to + * the tid_full_list list. + * 2. For all remaining pagesets: + * 2.1 If the tid_used_list is empty and the tid_group_list + * is empty, stop processing pageset and return only + * what has been programmed up to this point. + * 2.2 If the tid_used_list is empty and the tid_group_list + * is not empty, move a group from tid_group_list to + * tid_used_list. + * 2.3 For each group is tid_used_group, program as much as + * can fit into the group. If the group becomes fully + * used, move it to tid_full_list. + */ +int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo) +{ + int ret = 0, need_group = 0, pinned; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned int ngroups, pageset_count, + tididx = 0, mapped, mapped_pages = 0; + u32 *tidlist = NULL; + struct tid_user_buf *tidbuf; + unsigned long mmu_seq = 0; + + if (!PAGE_ALIGNED(tinfo->vaddr)) + return -EINVAL; + if (tinfo->length == 0) + return -EINVAL; + + tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); + if (!tidbuf) + return -ENOMEM; + + mutex_init(&tidbuf->cover_mutex); + tidbuf->vaddr = tinfo->vaddr; + tidbuf->length = tinfo->length; + tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length); + tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), + GFP_KERNEL); + if (!tidbuf->psets) { + ret = -ENOMEM; + goto fail_release_mem; + } + + if (fd->use_mn) { + ret = mmu_interval_notifier_insert( + &tidbuf->notifier, current->mm, + tidbuf->vaddr, tidbuf->npages * PAGE_SIZE, + &tid_cover_ops); + if (ret) + goto fail_release_mem; + mmu_seq = mmu_interval_read_begin(&tidbuf->notifier); + } + + pinned = pin_rcv_pages(fd, tidbuf); + if (pinned <= 0) { + ret = (pinned < 0) ? pinned : -ENOSPC; + goto fail_unpin; + } + + /* Find sets of physically contiguous pages */ + tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); + + /* Reserve the number of expected tids to be used. */ + spin_lock(&fd->tid_lock); + if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) + pageset_count = fd->tid_limit - fd->tid_used; + else + pageset_count = tidbuf->n_psets; + fd->tid_used += pageset_count; + spin_unlock(&fd->tid_lock); + + if (!pageset_count) { + ret = -ENOSPC; + goto fail_unreserve; + } + + ngroups = pageset_count / dd->rcv_entries.group_size; + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); + if (!tidlist) { + ret = -ENOMEM; + goto fail_unreserve; + } + + tididx = 0; + + /* + * From this point on, we are going to be using shared (between master + * and subcontexts) context resources. We need to take the lock. + */ + mutex_lock(&uctxt->exp_mutex); + /* + * The first step is to program the RcvArray entries which are complete + * groups. + */ + while (ngroups && uctxt->tid_group_list.count) { + struct tid_group *grp = + tid_group_pop(&uctxt->tid_group_list); + + ret = program_rcvarray(fd, tidbuf, grp, + dd->rcv_entries.group_size, + tidlist, &tididx, &mapped); + /* + * If there was a failure to program the RcvArray + * entries for the entire group, reset the grp fields + * and add the grp back to the free group list. + */ + if (ret <= 0) { + tid_group_add_tail(grp, &uctxt->tid_group_list); + hfi1_cdbg(TID, + "Failed to program RcvArray group %d", ret); + goto unlock; + } + + tid_group_add_tail(grp, &uctxt->tid_full_list); + ngroups--; + mapped_pages += mapped; + } + + while (tididx < pageset_count) { + struct tid_group *grp, *ptr; + /* + * If we don't have any partially used tid groups, check + * if we have empty groups. If so, take one from there and + * put in the partially used list. + */ + if (!uctxt->tid_used_list.count || need_group) { + if (!uctxt->tid_group_list.count) + goto unlock; + + grp = tid_group_pop(&uctxt->tid_group_list); + tid_group_add_tail(grp, &uctxt->tid_used_list); + need_group = 0; + } + /* + * There is an optimization opportunity here - instead of + * fitting as many page sets as we can, check for a group + * later on in the list that could fit all of them. + */ + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, + list) { + unsigned use = min_t(unsigned, pageset_count - tididx, + grp->size - grp->used); + + ret = program_rcvarray(fd, tidbuf, grp, + use, tidlist, + &tididx, &mapped); + if (ret < 0) { + hfi1_cdbg(TID, + "Failed to program RcvArray entries %d", + ret); + goto unlock; + } else if (ret > 0) { + if (grp->used == grp->size) + tid_group_move(grp, + &uctxt->tid_used_list, + &uctxt->tid_full_list); + mapped_pages += mapped; + need_group = 0; + /* Check if we are done so we break out early */ + if (tididx >= pageset_count) + break; + } else if (WARN_ON(ret == 0)) { + /* + * If ret is 0, we did not program any entries + * into this group, which can only happen if + * we've screwed up the accounting somewhere. + * Warn and try to continue. + */ + need_group = 1; + } + } + } +unlock: + mutex_unlock(&uctxt->exp_mutex); + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, + mapped_pages, ret); + + /* fail if nothing was programmed, set error if none provided */ + if (tididx == 0) { + if (ret >= 0) + ret = -ENOSPC; + goto fail_unreserve; + } + + /* adjust reserved tid_used to actual count */ + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count - tididx; + spin_unlock(&fd->tid_lock); + + /* unpin all pages not covered by a TID */ + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages, + false); + + if (fd->use_mn) { + /* check for an invalidate during setup */ + bool fail = false; + + mutex_lock(&tidbuf->cover_mutex); + fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq); + mutex_unlock(&tidbuf->cover_mutex); + + if (fail) { + ret = -EBUSY; + goto fail_unprogram; + } + } + + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), + tidlist, sizeof(tidlist[0]) * tididx)) { + ret = -EFAULT; + goto fail_unprogram; + } + + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); + kfree(tidbuf->pages); + kfree(tidbuf->psets); + kfree(tidbuf); + kfree(tidlist); + return 0; + +fail_unprogram: + /* unprogram, unmap, and unpin all allocated TIDs */ + tinfo->tidlist = (unsigned long)tidlist; + hfi1_user_exp_rcv_clear(fd, tinfo); + tinfo->tidlist = 0; + pinned = 0; /* nothing left to unpin */ + pageset_count = 0; /* nothing left reserved */ +fail_unreserve: + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count; + spin_unlock(&fd->tid_lock); +fail_unpin: + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); + if (pinned > 0) + unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false); +fail_release_mem: + kfree(tidbuf->pages); + kfree(tidbuf->psets); + kfree(tidbuf); + kfree(tidlist); + return ret; +} + +int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo) +{ + int ret = 0; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + u32 *tidinfo; + unsigned tididx; + + if (unlikely(tinfo->tidcnt > fd->tid_used)) + return -EINVAL; + + tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist), + sizeof(tidinfo[0]) * tinfo->tidcnt); + if (IS_ERR(tidinfo)) + return PTR_ERR(tidinfo); + + mutex_lock(&uctxt->exp_mutex); + for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { + ret = unprogram_rcvarray(fd, tidinfo[tididx]); + if (ret) { + hfi1_cdbg(TID, "Failed to unprogram rcv array %d", + ret); + break; + } + } + spin_lock(&fd->tid_lock); + fd->tid_used -= tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + mutex_unlock(&uctxt->exp_mutex); + + kfree(tidinfo); + return ret; +} + +int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + unsigned long *ev = uctxt->dd->events + + (uctxt_offset(uctxt) + fd->subctxt); + u32 *array; + int ret = 0; + + /* + * copy_to_user() can sleep, which will leave the invalid_lock + * locked and cause the MMU notifier to be blocked on the lock + * for a long time. + * Copy the data to a local buffer so we can release the lock. + */ + array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL); + if (!array) + return -EFAULT; + + spin_lock(&fd->invalid_lock); + if (fd->invalid_tid_idx) { + memcpy(array, fd->invalid_tids, sizeof(*array) * + fd->invalid_tid_idx); + memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) * + fd->invalid_tid_idx); + tinfo->tidcnt = fd->invalid_tid_idx; + fd->invalid_tid_idx = 0; + /* + * Reset the user flag while still holding the lock. + * Otherwise, PSM can miss events. + */ + clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } else { + tinfo->tidcnt = 0; + } + spin_unlock(&fd->invalid_lock); + + if (tinfo->tidcnt) { + if (copy_to_user((void __user *)tinfo->tidlist, + array, sizeof(*array) * tinfo->tidcnt)) + ret = -EFAULT; + } + kfree(array); + + return ret; +} + +static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) +{ + unsigned pagecount, pageidx, setcount = 0, i; + unsigned long pfn, this_pfn; + struct page **pages = tidbuf->pages; + struct tid_pageset *list = tidbuf->psets; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + pfn = page_to_pfn(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_pfn = i < npages ? page_to_pfn(pages[i]) : 0; + + /* + * If the pfn's are not sequential, pages are not physically + * contiguous. + */ + if (this_pfn != ++pfn) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + pfn = this_pfn; + } else { + pagecount++; + } + } + return setcount; +} + +/** + * program_rcvarray() - program an RcvArray group with receive buffers + * @fd: filedata pointer + * @tbuf: pointer to struct tid_user_buf that has the user buffer starting + * virtual address, buffer length, page pointers, pagesets (array of + * struct tid_pageset holding information on physically contiguous + * chunks from the user buffer), and other fields. + * @grp: RcvArray group + * @count: number of struct tid_pageset's to program + * @tidlist: the array of u32 elements when the information about the + * programmed RcvArray entries is to be encoded. + * @tididx: starting offset into tidlist + * @pmapped: (output parameter) number of pages programmed into the RcvArray + * entries. + * + * This function will program up to 'count' number of RcvArray entries from the + * group 'grp'. To make best use of write-combining writes, the function will + * perform writes to the unused RcvArray entries which will be ignored by the + * HW. Each RcvArray entry will be programmed with a physically contiguous + * buffer chunk from the user's virtual buffer. + * + * Return: + * -EINVAL if the requested count is larger than the size of the group, + * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or + * number of RcvArray entries programmed. + */ +static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, + struct tid_group *grp, u16 count, + u32 *tidlist, unsigned int *tididx, + unsigned int *pmapped) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + u16 idx; + unsigned int start = *tididx; + u32 tidinfo = 0, rcventry, useidx = 0; + int mapped = 0; + + /* Count should never be larger than the group size */ + if (count > grp->size) + return -EINVAL; + + /* Find the first unused entry in the group */ + for (idx = 0; idx < grp->size; idx++) { + if (!(grp->map & (1 << idx))) { + useidx = idx; + break; + } + rcv_array_wc_fill(dd, grp->base + idx); + } + + idx = 0; + while (idx < count) { + u16 npages, pageidx, setidx = start + idx; + int ret = 0; + + /* + * If this entry in the group is used, move to the next one. + * If we go past the end of the group, exit the loop. + */ + if (useidx >= grp->size) { + break; + } else if (grp->map & (1 << useidx)) { + rcv_array_wc_fill(dd, grp->base + useidx); + useidx++; + continue; + } + + rcventry = grp->base + useidx; + npages = tbuf->psets[setidx].count; + pageidx = tbuf->psets[setidx].idx; + + ret = set_rcvarray_entry(fd, tbuf, + rcventry, grp, pageidx, + npages); + if (ret) + return ret; + mapped += npages; + + tidinfo = create_tid(rcventry - uctxt->expected_base, npages); + tidlist[(*tididx)++] = tidinfo; + grp->used++; + grp->map |= 1 << useidx++; + idx++; + } + + /* Fill the rest of the group with "blank" writes */ + for (; useidx < grp->size; useidx++) + rcv_array_wc_fill(dd, grp->base + useidx); + *pmapped = mapped; + return idx; +} + +static int set_rcvarray_entry(struct hfi1_filedata *fd, + struct tid_user_buf *tbuf, + u32 rcventry, struct tid_group *grp, + u16 pageidx, unsigned int npages) +{ + int ret; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_rb_node *node; + struct hfi1_devdata *dd = uctxt->dd; + dma_addr_t phys; + struct page **pages = tbuf->pages + pageidx; + + /* + * Allocate the node first so we can handle a potential + * failure before we've programmed anything. + */ + node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL); + if (!node) + return -ENOMEM; + + phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])), + npages * PAGE_SIZE, DMA_FROM_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, phys)) { + dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n", + phys); + kfree(node); + return -EFAULT; + } + + node->fdata = fd; + mutex_init(&node->invalidate_mutex); + node->phys = page_to_phys(pages[0]); + node->npages = npages; + node->rcventry = rcventry; + node->dma_addr = phys; + node->grp = grp; + node->freed = false; + memcpy(node->pages, pages, flex_array_size(node, pages, npages)); + + if (fd->use_mn) { + ret = mmu_interval_notifier_insert( + &node->notifier, current->mm, + tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE, + &tid_mn_ops); + if (ret) + goto out_unmap; + } + fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node; + + hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1); + trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages, + node->notifier.interval_tree.start, node->phys, + phys); + return 0; + +out_unmap: + hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", + node->rcventry, node->notifier.interval_tree.start, + node->phys, ret); + dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE, + DMA_FROM_DEVICE); + kfree(node); + return -EFAULT; +} + +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + struct tid_rb_node *node; + u32 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; + + if (tidctrl == 0x3 || tidctrl == 0x0) + return -EINVAL; + + rcventry = tididx + (tidctrl - 1); + + if (rcventry >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + rcventry, uctxt->ctxt); + return -EINVAL; + } + + node = fd->entry_to_rb[rcventry]; + if (!node || node->rcventry != (uctxt->expected_base + rcventry)) + return -EBADF; + + if (fd->use_mn) + mmu_interval_notifier_remove(&node->notifier); + cacheless_tid_rb_remove(fd, node); + + return 0; +} + +static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + + mutex_lock(&node->invalidate_mutex); + if (node->freed) + goto done; + node->freed = true; + + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, + node->npages, + node->notifier.interval_tree.start, node->phys, + node->dma_addr); + + /* Make sure device has seen the write before pages are unpinned */ + hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); + + unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); +done: + mutex_unlock(&node->invalidate_mutex); +} + +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + __clear_tid_node(fd, node); + + node->grp->used--; + node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); + + if (node->grp->used == node->grp->size - 1) + tid_group_move(node->grp, &uctxt->tid_full_list, + &uctxt->tid_used_list); + else if (!node->grp->used) + tid_group_move(node->grp, &uctxt->tid_used_list, + &uctxt->tid_group_list); + kfree(node); +} + +/* + * As a simple helper for hfi1_user_exp_rcv_free, this function deals with + * clearing nodes in the non-cached case. + */ +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, + struct hfi1_filedata *fd) +{ + struct tid_group *grp, *ptr; + int i; + + list_for_each_entry_safe(grp, ptr, &set->list, list) { + list_del_init(&grp->list); + + for (i = 0; i < grp->size; i++) { + if (grp->map & (1 << i)) { + u16 rcventry = grp->base + i; + struct tid_rb_node *node; + + node = fd->entry_to_rb[rcventry - + uctxt->expected_base]; + if (!node || node->rcventry != rcventry) + continue; + + if (fd->use_mn) + mmu_interval_notifier_remove( + &node->notifier); + cacheless_tid_rb_remove(fd, node); + } + } + } +} + +static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct tid_rb_node *node = + container_of(mni, struct tid_rb_node, notifier); + struct hfi1_filedata *fdata = node->fdata; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + + if (node->freed) + return true; + + /* take action only if unmapping */ + if (range->event != MMU_NOTIFY_UNMAP) + return true; + + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, + node->notifier.interval_tree.start, + node->rcventry, node->npages, node->dma_addr); + + /* clear the hardware rcvarray entry */ + __clear_tid_node(fdata, node); + + spin_lock(&fdata->invalid_lock); + if (fdata->invalid_tid_idx < uctxt->expected_count) { + fdata->invalid_tids[fdata->invalid_tid_idx] = + create_tid(node->rcventry - uctxt->expected_base, + node->npages); + if (!fdata->invalid_tid_idx) { + unsigned long *ev; + + /* + * hfi1_set_uevent_bits() sets a user event flag + * for all processes. Because calling into the + * driver to process TID cache invalidations is + * expensive and TID cache invalidations are + * handled on a per-process basis, we can + * optimize this to set the flag only for the + * process in question. + */ + ev = uctxt->dd->events + + (uctxt_offset(uctxt) + fdata->subctxt); + set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } + fdata->invalid_tid_idx++; + } + spin_unlock(&fdata->invalid_lock); + return true; +} + +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct tid_user_buf *tidbuf = + container_of(mni, struct tid_user_buf, notifier); + + /* take action only if unmapping */ + if (range->event == MMU_NOTIFY_UNMAP) { + mutex_lock(&tidbuf->cover_mutex); + mmu_interval_set_seq(mni, cur_seq); + mutex_unlock(&tidbuf->cover_mutex); + } + + return true; +} + +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode) +{ + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = NULL; + clear_tid_node(fdata, tnode); +} diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h new file mode 100644 index 0000000000..f8ee997d00 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2020 - Cornelis Networks, Inc. + * Copyright(c) 2015 - 2017 Intel Corporation. + */ + +#ifndef _HFI1_USER_EXP_RCV_H +#define _HFI1_USER_EXP_RCV_H + +#include "hfi.h" +#include "exp_rcv.h" + +struct tid_pageset { + u16 idx; + u16 count; +}; + +struct tid_user_buf { + struct mmu_interval_notifier notifier; + struct mutex cover_mutex; + unsigned long vaddr; + unsigned long length; + unsigned int npages; + struct page **pages; + struct tid_pageset *psets; + unsigned int n_psets; +}; + +struct tid_rb_node { + struct mmu_interval_notifier notifier; + struct hfi1_filedata *fdata; + struct mutex invalidate_mutex; /* covers hw removal */ + unsigned long phys; + struct tid_group *grp; + u32 rcventry; + dma_addr_t dma_addr; + bool freed; + unsigned int npages; + struct page *pages[]; +}; + +static inline int num_user_pages(unsigned long addr, + unsigned long len) +{ + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd); +int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); +int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); +int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); + +static inline struct mm_struct *mm_from_tid_node(struct tid_rb_node *node) +{ + return node->notifier.mm; +} + +#endif /* _HFI1_USER_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c new file mode 100644 index 0000000000..36aaedc651 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015-2017 Intel Corporation. + */ + +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/device.h> +#include <linux/module.h> + +#include "hfi.h" + +static unsigned long cache_size = 256; +module_param(cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); + +/* + * Determine whether the caller can pin pages. + * + * This function should be used in the implementation of buffer caches. + * The cache implementation should call this function prior to attempting + * to pin buffer pages in order to determine whether they should do so. + * The function computes cache limits based on the configured ulimit and + * cache size. Use of this function is especially important for caches + * which are not limited in any other way (e.g. by HW resources) and, thus, + * could keeping caching buffers. + * + */ +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages) +{ + unsigned long ulimit_pages; + unsigned long cache_limit_pages; + unsigned int usr_ctxts; + + /* + * Perform RLIMIT_MEMLOCK based checks unless CAP_IPC_LOCK is present. + */ + if (!capable(CAP_IPC_LOCK)) { + ulimit_pages = + DIV_ROUND_DOWN_ULL(rlimit(RLIMIT_MEMLOCK), PAGE_SIZE); + + /* + * Pinning these pages would exceed this process's locked memory + * limit. + */ + if (atomic64_read(&mm->pinned_vm) + npages > ulimit_pages) + return false; + + /* + * Only allow 1/4 of the user's RLIMIT_MEMLOCK to be used for HFI + * caches. This fraction is then equally distributed among all + * existing user contexts. Note that if RLIMIT_MEMLOCK is + * 'unlimited' (-1), the value of this limit will be > 2^42 pages + * (2^64 / 2^12 / 2^8 / 2^2). + * + * The effectiveness of this check may be reduced if I/O occurs on + * some user contexts before all user contexts are created. This + * check assumes that this process is the only one using this + * context (e.g., the corresponding fd was not passed to another + * process for concurrent access) as there is no per-context, + * per-process tracking of pinned pages. It also assumes that each + * user context has only one cache to limit. + */ + usr_ctxts = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt; + if (nlocked + npages > (ulimit_pages / usr_ctxts / 4)) + return false; + } + + /* + * Pinning these pages would exceed the size limit for this cache. + */ + cache_limit_pages = cache_size * (1024 * 1024) / PAGE_SIZE; + if (nlocked + npages > cache_limit_pages) + return false; + + return true; +} + +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages, + bool writable, struct page **pages) +{ + int ret; + unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0); + + ret = pin_user_pages_fast(vaddr, npages, gup_flags, pages); + if (ret < 0) + return ret; + + atomic64_add(ret, &mm->pinned_vm); + + return ret; +} + +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty) +{ + unpin_user_pages_dirty_lock(p, npages, dirty); + + if (mm) { /* during close after signal, mm can be NULL */ + atomic64_sub(npages, &mm->pinned_vm); + } +} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c new file mode 100644 index 0000000000..29ae7beb9b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -0,0 +1,1224 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2020 - 2023 Cornelis Networks, Inc. + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/device.h> +#include <linux/dmapool.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/io.h> +#include <linux/uio.h> +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/mmu_context.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/string.h> + +#include "hfi.h" +#include "sdma.h" +#include "user_sdma.h" +#include "verbs.h" /* for the headers */ +#include "common.h" /* for struct hfi1_tid_info */ +#include "trace.h" + +static uint hfi1_sdma_comp_ring_size = 128; +module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); + +static unsigned initial_pkt_count = 8; + +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); +static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); +static void user_sdma_free_request(struct user_sdma_request *req); +static int check_header_template(struct user_sdma_request *req, + struct hfi1_pkt_header *hdr, u32 lrhlen, + u32 datalen); +static int set_txreq_header(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen); +static int set_txreq_header_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 len); +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, + struct hfi1_user_sdma_comp_q *cq, + u16 idx, enum hfi1_sdma_comp_state state, + int ret); +static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); +static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); + +static int defer_packet_queue( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent); +static void activate_packet_queue(struct iowait *wait, int reason); + +static int defer_packet_queue( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); + + write_seqlock(&sde->waitlock); + trace_hfi1_usdma_defer(pq, sde, &pq->busy); + if (sdma_progress(sde, seq, txreq)) + goto eagain; + /* + * We are assuming that if the list is enqueued somewhere, it + * is to the dmawait list since that is the only place where + * it is supposed to be enqueued. + */ + xchg(&pq->state, SDMA_PKT_Q_DEFERRED); + if (list_empty(&pq->busy.list)) { + pq->busy.lock = &sde->waitlock; + iowait_get_priority(&pq->busy); + iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); + } + write_sequnlock(&sde->waitlock); + return -EBUSY; +eagain: + write_sequnlock(&sde->waitlock); + return -EAGAIN; +} + +static void activate_packet_queue(struct iowait *wait, int reason) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + + trace_hfi1_usdma_activate(pq, wait, reason); + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + wake_up(&wait->wait_dma); +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, + struct hfi1_filedata *fd) +{ + int ret = -ENOMEM; + char buf[64]; + struct hfi1_devdata *dd; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + + if (!uctxt || !fd) + return -EBADF; + + if (!hfi1_sdma_comp_ring_size) + return -EINVAL; + + dd = uctxt->dd; + + pq = kzalloc(sizeof(*pq), GFP_KERNEL); + if (!pq) + return -ENOMEM; + pq->dd = dd; + pq->ctxt = uctxt->ctxt; + pq->subctxt = fd->subctxt; + pq->n_max_reqs = hfi1_sdma_comp_ring_size; + atomic_set(&pq->n_reqs, 0); + init_waitqueue_head(&pq->wait); + atomic_set(&pq->n_locked, 0); + + iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, + activate_packet_queue, NULL, NULL); + pq->reqidx = 0; + + pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, + sizeof(*pq->reqs), + GFP_KERNEL); + if (!pq->reqs) + goto pq_reqs_nomem; + + pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL); + if (!pq->req_in_use) + goto pq_reqs_no_in_use; + + snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, + fd->subctxt); + pq->txreq_cache = kmem_cache_create(buf, + sizeof(struct user_sdma_txreq), + L1_CACHE_BYTES, + SLAB_HWCACHE_ALIGN, + NULL); + if (!pq->txreq_cache) { + dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", + uctxt->ctxt); + goto pq_txreq_nomem; + } + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + goto cq_nomem; + + cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) + * hfi1_sdma_comp_ring_size)); + if (!cq->comps) + goto cq_comps_nomem; + + cq->nentries = hfi1_sdma_comp_ring_size; + + ret = hfi1_init_system_pinning(pq); + if (ret) + goto pq_mmu_fail; + + rcu_assign_pointer(fd->pq, pq); + fd->cq = cq; + + return 0; + +pq_mmu_fail: + vfree(cq->comps); +cq_comps_nomem: + kfree(cq); +cq_nomem: + kmem_cache_destroy(pq->txreq_cache); +pq_txreq_nomem: + bitmap_free(pq->req_in_use); +pq_reqs_no_in_use: + kfree(pq->reqs); +pq_reqs_nomem: + kfree(pq); + + return ret; +} + +static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq) +{ + unsigned long flags; + seqlock_t *lock = pq->busy.lock; + + if (!lock) + return; + write_seqlock_irqsave(lock, flags); + if (!list_empty(&pq->busy.list)) { + list_del_init(&pq->busy.list); + pq->busy.lock = NULL; + } + write_sequnlock_irqrestore(lock, flags); +} + +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_user_sdma_pkt_q *pq; + + trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); + + spin_lock(&fd->pq_rcu_lock); + pq = srcu_dereference_check(fd->pq, &fd->pq_srcu, + lockdep_is_held(&fd->pq_rcu_lock)); + if (pq) { + rcu_assign_pointer(fd->pq, NULL); + spin_unlock(&fd->pq_rcu_lock); + synchronize_srcu(&fd->pq_srcu); + /* at this point there can be no more new requests */ + iowait_sdma_drain(&pq->busy); + /* Wait until all requests have been freed. */ + wait_event_interruptible( + pq->wait, + !atomic_read(&pq->n_reqs)); + kfree(pq->reqs); + hfi1_free_system_pinning(pq); + bitmap_free(pq->req_in_use); + kmem_cache_destroy(pq->txreq_cache); + flush_pq_iowait(pq); + kfree(pq); + } else { + spin_unlock(&fd->pq_rcu_lock); + } + if (fd->cq) { + vfree(fd->cq->comps); + kfree(fd->cq); + fd->cq = NULL; + } + return 0; +} + +static u8 dlid_to_selector(u16 dlid) +{ + static u8 mapping[256]; + static int initialized; + static u8 next; + int hash; + + if (!initialized) { + memset(mapping, 0xFF, 256); + initialized = 1; + } + + hash = ((dlid >> 8) ^ dlid) & 0xFF; + if (mapping[hash] == 0xFF) { + mapping[hash] = next; + next = (next + 1) & 0x7F; + } + + return mapping[hash]; +} + +/** + * hfi1_user_sdma_process_request() - Process and start a user sdma request + * @fd: valid file descriptor + * @iovec: array of io vectors to process + * @dim: overall iovec array size + * @count: number of io vector array entries processed + */ +int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, + struct iovec *iovec, unsigned long dim, + unsigned long *count) +{ + int ret = 0, i; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_sdma_pkt_q *pq = + srcu_dereference(fd->pq, &fd->pq_srcu); + struct hfi1_user_sdma_comp_q *cq = fd->cq; + struct hfi1_devdata *dd = pq->dd; + unsigned long idx = 0; + u8 pcount = initial_pkt_count; + struct sdma_req_info info; + struct user_sdma_request *req; + u8 opcode, sc, vl; + u16 pkey; + u32 slid; + u16 dlid; + u32 selector; + + if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { + hfi1_cdbg( + SDMA, + "[%u:%u:%u] First vector not big enough for header %lu/%lu", + dd->unit, uctxt->ctxt, fd->subctxt, + iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); + return -EINVAL; + } + ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); + if (ret) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", + dd->unit, uctxt->ctxt, fd->subctxt, ret); + return -EFAULT; + } + + trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, + (u16 *)&info); + if (info.comp_idx >= hfi1_sdma_comp_ring_size) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid comp index", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; + } + + /* + * Sanity check the header io vector count. Need at least 1 vector + * (header) and cannot be larger than the actual io vector count. + */ + if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, + req_iovcnt(info.ctrl), dim); + return -EINVAL; + } + + if (!info.fragsize) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Request does not specify fragsize", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; + } + + /* Try to claim the request. */ + if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", + dd->unit, uctxt->ctxt, fd->subctxt, + info.comp_idx); + return -EBADSLT; + } + /* + * All safety checks have been done and this request has been claimed. + */ + trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx); + req = pq->reqs + info.comp_idx; + req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ + req->data_len = 0; + req->pq = pq; + req->cq = cq; + req->ahg_idx = -1; + req->iov_idx = 0; + req->sent = 0; + req->seqnum = 0; + req->seqcomp = 0; + req->seqsubmitted = 0; + req->tids = NULL; + req->has_error = 0; + INIT_LIST_HEAD(&req->txps); + + memcpy(&req->info, &info, sizeof(info)); + + /* The request is initialized, count it */ + atomic_inc(&pq->n_reqs); + + if (req_opcode(info.ctrl) == EXPECTED) { + /* expected must have a TID info and at least one data vector */ + if (req->data_iovs < 2) { + SDMA_DBG(req, + "Not enough vectors for expected request"); + ret = -EINVAL; + goto free_req; + } + req->data_iovs--; + } + + if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { + SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, + MAX_VECTORS_PER_REQ); + ret = -EINVAL; + goto free_req; + } + + /* Copy the header from the user buffer */ + ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), + sizeof(req->hdr)); + if (ret) { + SDMA_DBG(req, "Failed to copy header template (%d)", ret); + ret = -EFAULT; + goto free_req; + } + + /* If Static rate control is not enabled, sanitize the header. */ + if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) + req->hdr.pbc[2] = 0; + + /* Validate the opcode. Do not trust packets from user space blindly. */ + opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; + if ((opcode & USER_OPCODE_CHECK_MASK) != + USER_OPCODE_CHECK_VAL) { + SDMA_DBG(req, "Invalid opcode (%d)", opcode); + ret = -EINVAL; + goto free_req; + } + /* + * Validate the vl. Do not trust packets from user space blindly. + * VL comes from PBC, SC comes from LRH, and the VL needs to + * match the SC look up. + */ + vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; + sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | + (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); + if (vl >= dd->pport->vls_operational || + vl != sc_to_vlt(dd, sc)) { + SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); + ret = -EINVAL; + goto free_req; + } + + /* Checking P_KEY for requests from user-space */ + pkey = (u16)be32_to_cpu(req->hdr.bth[0]); + slid = be16_to_cpu(req->hdr.lrh[3]); + if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { + ret = -EINVAL; + goto free_req; + } + + /* + * Also should check the BTH.lnh. If it says the next header is GRH then + * the RXE parsing will be off and will land in the middle of the KDETH + * or miss it entirely. + */ + if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { + SDMA_DBG(req, "User tried to pass in a GRH"); + ret = -EINVAL; + goto free_req; + } + + req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); + /* + * Calculate the initial TID offset based on the values of + * KDETH.OFFSET and KDETH.OM that are passed in. + */ + req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx, req->tidoffset); + idx++; + + /* Save all the IO vector structures */ + for (i = 0; i < req->data_iovs; i++) { + req->iovs[i].offset = 0; + INIT_LIST_HEAD(&req->iovs[i].list); + memcpy(&req->iovs[i].iov, + iovec + idx++, + sizeof(req->iovs[i].iov)); + if (req->iovs[i].iov.iov_len == 0) { + ret = -EINVAL; + goto free_req; + } + req->data_len += req->iovs[i].iov.iov_len; + } + trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx, req->data_len); + if (pcount > req->info.npkts) + pcount = req->info.npkts; + /* + * Copy any TID info + * User space will provide the TID info only when the + * request type is EXPECTED. This is true even if there is + * only one packet in the request and the header is already + * setup. The reason for the singular TID case is that the + * driver needs to perform safety checks. + */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); + u32 *tmp; + + if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { + ret = -EINVAL; + goto free_req; + } + + /* + * We have to copy all of the tids because they may vary + * in size and, therefore, the TID count might not be + * equal to the pkt count. However, there is no way to + * tell at this point. + */ + tmp = memdup_user(iovec[idx].iov_base, + ntids * sizeof(*req->tids)); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + SDMA_DBG(req, "Failed to copy %d TIDs (%d)", + ntids, ret); + goto free_req; + } + req->tids = tmp; + req->n_tids = ntids; + req->tididx = 0; + idx++; + } + + dlid = be16_to_cpu(req->hdr.lrh[1]); + selector = dlid_to_selector(dlid); + selector += uctxt->ctxt + fd->subctxt; + req->sde = sdma_select_user_engine(dd, selector, vl); + + if (!req->sde || !sdma_running(req->sde)) { + ret = -ECOMM; + goto free_req; + } + + /* We don't need an AHG entry if the request contains only one packet */ + if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) + req->ahg_idx = sdma_ahg_alloc(req->sde); + + set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); + pq->state = SDMA_PKT_Q_ACTIVE; + + /* + * This is a somewhat blocking send implementation. + * The driver will block the caller until all packets of the + * request have been submitted to the SDMA engine. However, it + * will not wait for send completions. + */ + while (req->seqsubmitted != req->info.npkts) { + ret = user_sdma_send_pkts(req, pcount); + if (ret < 0) { + int we_ret; + + if (ret != -EBUSY) + goto free_req; + we_ret = wait_event_interruptible_timeout( + pq->busy.wait_dma, + pq->state == SDMA_PKT_Q_ACTIVE, + msecs_to_jiffies( + SDMA_IOWAIT_TIMEOUT)); + trace_hfi1_usdma_we(pq, we_ret); + if (we_ret <= 0) + flush_pq_iowait(pq); + } + } + *count += idx; + return 0; +free_req: + /* + * If the submitted seqsubmitted == npkts, the completion routine + * controls the final state. If sequbmitted < npkts, wait for any + * outstanding packets to finish before cleaning up. + */ + if (req->seqsubmitted < req->info.npkts) { + if (req->seqsubmitted) + wait_event(pq->busy.wait_dma, + (req->seqcomp == req->seqsubmitted - 1)); + user_sdma_free_request(req); + pq_update(pq); + set_comp_state(pq, cq, info.comp_idx, ERROR, ret); + } + return ret; +} + +static inline u32 compute_data_length(struct user_sdma_request *req, + struct user_sdma_txreq *tx) +{ + /* + * Determine the proper size of the packet data. + * The size of the data of the first packet is in the header + * template. However, it includes the header and ICRC, which need + * to be subtracted. + * The minimum representable packet data length in a header is 4 bytes, + * therefore, when the data length request is less than 4 bytes, there's + * only one packet, and the packet data length is equal to that of the + * request data length. + * The size of the remaining packets is the minimum of the frag + * size (MTU) or remaining data in the request. + */ + u32 len; + + if (!req->seqnum) { + if (req->data_len < sizeof(u32)) + len = req->data_len; + else + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); + } else if (req_opcode(req->info.ctrl) == EXPECTED) { + u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * + PAGE_SIZE; + /* + * Get the data length based on the remaining space in the + * TID pair. + */ + len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); + /* If we've filled up the TID pair, move to the next one. */ + if (unlikely(!len) && ++req->tididx < req->n_tids && + req->tids[req->tididx]) { + tidlen = EXP_TID_GET(req->tids[req->tididx], + LEN) * PAGE_SIZE; + req->tidoffset = 0; + len = min_t(u32, tidlen, req->info.fragsize); + } + /* + * Since the TID pairs map entire pages, make sure that we + * are not going to try to send more data that we have + * remaining. + */ + len = min(len, req->data_len - req->sent); + } else { + len = min(req->data_len - req->sent, (u32)req->info.fragsize); + } + trace_hfi1_sdma_user_compute_length(req->pq->dd, + req->pq->ctxt, + req->pq->subctxt, + req->info.comp_idx, + len); + return len; +} + +static inline u32 pad_len(u32 len) +{ + if (len & (sizeof(u32) - 1)) + len += sizeof(u32) - (len & (sizeof(u32) - 1)); + return len; +} + +static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) +{ + /* (Size of complete header - size of PBC) + 4B ICRC + data length */ + return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); +} + +static int user_sdma_txadd_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + u32 datalen) +{ + int ret; + u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); + u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + /* + * Copy the request header into the tx header + * because the HW needs a cacheline-aligned + * address. + * This copy can be optimized out if the hdr + * member of user_sdma_request were also + * cacheline aligned. + */ + memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + tx->hdr.pbc[0] = cpu_to_le16(pbclen); + } + ret = check_header_template(req, &tx->hdr, lrhlen, datalen); + if (ret) + return ret; + ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, + sizeof(tx->hdr) + datalen, req->ahg_idx, + 0, NULL, 0, user_sdma_txreq_cb); + if (ret) + return ret; + ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); + if (ret) + sdma_txclean(pq->dd, &tx->txreq); + return ret; +} + +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) +{ + int ret = 0; + u16 count; + unsigned npkts = 0; + struct user_sdma_txreq *tx = NULL; + struct hfi1_user_sdma_pkt_q *pq = NULL; + struct user_sdma_iovec *iovec = NULL; + + if (!req->pq) + return -EINVAL; + + pq = req->pq; + + /* If tx completion has reported an error, we are done. */ + if (READ_ONCE(req->has_error)) + return -EFAULT; + + /* + * Check if we might have sent the entire request already + */ + if (unlikely(req->seqnum == req->info.npkts)) { + if (!list_empty(&req->txps)) + goto dosend; + return ret; + } + + if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) + maxpkts = req->info.npkts - req->seqnum; + + while (npkts < maxpkts) { + u32 datalen = 0; + + /* + * Check whether any of the completions have come back + * with errors. If so, we are not going to process any + * more packets from this request. + */ + if (READ_ONCE(req->has_error)) + return -EFAULT; + + tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); + if (!tx) + return -ENOMEM; + + tx->flags = 0; + tx->req = req; + INIT_LIST_HEAD(&tx->list); + + /* + * For the last packet set the ACK request + * and disable header suppression. + */ + if (req->seqnum == req->info.npkts - 1) + tx->flags |= (TXREQ_FLAGS_REQ_ACK | + TXREQ_FLAGS_REQ_DISABLE_SH); + + /* + * Calculate the payload size - this is min of the fragment + * (MTU) size or the remaining bytes in the request but only + * if we have payload data. + */ + if (req->data_len) { + iovec = &req->iovs[req->iov_idx]; + if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { + if (++req->iov_idx == req->data_iovs) { + ret = -EFAULT; + goto free_tx; + } + iovec = &req->iovs[req->iov_idx]; + WARN_ON(iovec->offset); + } + + datalen = compute_data_length(req, tx); + + /* + * Disable header suppression for the payload <= 8DWS. + * If there is an uncorrectable error in the receive + * data FIFO when the received payload size is less than + * or equal to 8DWS then the RxDmaDataFifoRdUncErr is + * not reported.There is set RHF.EccErr if the header + * is not suppressed. + */ + if (!datalen) { + SDMA_DBG(req, + "Request has data but pkt len is 0"); + ret = -EFAULT; + goto free_tx; + } else if (datalen <= 32) { + tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; + } + } + + if (req->ahg_idx >= 0) { + if (!req->seqnum) { + ret = user_sdma_txadd_ahg(req, tx, datalen); + if (ret) + goto free_tx; + } else { + int changes; + + changes = set_txreq_header_ahg(req, tx, + datalen); + if (changes < 0) { + ret = changes; + goto free_tx; + } + } + } else { + ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + + datalen, user_sdma_txreq_cb); + if (ret) + goto free_tx; + /* + * Modify the header for this packet. This only needs + * to be done if we are not going to use AHG. Otherwise, + * the HW will do it based on the changes we gave it + * during sdma_txinit_ahg(). + */ + ret = set_txreq_header(req, tx, datalen); + if (ret) + goto free_txreq; + } + + req->koffset += datalen; + if (req_opcode(req->info.ctrl) == EXPECTED) + req->tidoffset += datalen; + req->sent += datalen; + while (datalen) { + ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec, + &datalen); + if (ret) + goto free_txreq; + iovec = &req->iovs[req->iov_idx]; + } + list_add_tail(&tx->txreq.list, &req->txps); + /* + * It is important to increment this here as it is used to + * generate the BTH.PSN and, therefore, can't be bulk-updated + * outside of the loop. + */ + tx->seqnum = req->seqnum++; + npkts++; + } +dosend: + ret = sdma_send_txlist(req->sde, + iowait_get_ib_work(&pq->busy), + &req->txps, &count); + req->seqsubmitted += count; + if (req->seqsubmitted == req->info.npkts) { + /* + * The txreq has already been submitted to the HW queue + * so we can free the AHG entry now. Corruption will not + * happen due to the sequential manner in which + * descriptors are processed. + */ + if (req->ahg_idx >= 0) + sdma_ahg_free(req->sde, req->ahg_idx); + } + return ret; + +free_txreq: + sdma_txclean(pq->dd, &tx->txreq); +free_tx: + kmem_cache_free(pq->txreq_cache, tx); + return ret; +} + +static int check_header_template(struct user_sdma_request *req, + struct hfi1_pkt_header *hdr, u32 lrhlen, + u32 datalen) +{ + /* + * Perform safety checks for any type of packet: + * - transfer size is multiple of 64bytes + * - packet length is multiple of 4 bytes + * - packet length is not larger than MTU size + * + * These checks are only done for the first packet of the + * transfer since the header is "given" to us by user space. + * For the remainder of the packets we compute the values. + */ + if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || + lrhlen > get_lrh_len(*hdr, req->info.fragsize)) + return -EINVAL; + + if (req_opcode(req->info.ctrl) == EXPECTED) { + /* + * The header is checked only on the first packet. Furthermore, + * we ensure that at least one TID entry is copied when the + * request is submitted. Therefore, we don't have to verify that + * tididx points to something sane. + */ + u32 tidval = req->tids[req->tididx], + tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, + tididx = EXP_TID_GET(tidval, IDX), + tidctrl = EXP_TID_GET(tidval, CTRL), + tidoff; + __le32 kval = hdr->kdeth.ver_tid_offset; + + tidoff = KDETH_GET(kval, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + /* + * Expected receive packets have the following + * additional checks: + * - offset is not larger than the TID size + * - TIDCtrl values match between header and TID array + * - TID indexes match between header and TID array + */ + if ((tidoff + datalen > tidlen) || + KDETH_GET(kval, TIDCTRL) != tidctrl || + KDETH_GET(kval, TID) != tididx) + return -EINVAL; + } + return 0; +} + +/* + * Correctly set the BTH.PSN field based on type of + * transfer - eager packets can just increment the PSN but + * expected packets encode generation and sequence in the + * BTH.PSN field so just incrementing will result in errors. + */ +static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) +{ + u32 val = be32_to_cpu(bthpsn), + mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : + 0xffffffull), + psn = val & mask; + if (expct) + psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | + ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); + else + psn = psn + frags; + return psn & mask; +} + +static int set_txreq_header(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &tx->hdr; + u8 omfactor; /* KDETH.OM */ + u16 pbclen; + int ret; + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); + + /* Copy the header template to the request before modification */ + memcpy(hdr, &req->hdr, sizeof(*hdr)); + + /* + * Check if the PBC and LRH length are mismatched. If so + * adjust both in the header. + */ + pbclen = le16_to_cpu(hdr->pbc[0]); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + hdr->pbc[0] = cpu_to_le16(pbclen); + hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); + /* + * Third packet + * This is the first packet in the sequence that has + * a "static" size that can be used for the rest of + * the packets (besides the last one). + */ + if (unlikely(req->seqnum == 2)) { + /* + * From this point on the lengths in both the + * PBC and LRH are the same until the last + * packet. + * Adjust the template so we don't have to update + * every packet + */ + req->hdr.pbc[0] = hdr->pbc[0]; + req->hdr.lrh[2] = hdr->lrh[2]; + } + } + /* + * We only have to modify the header if this is not the + * first packet in the request. Otherwise, we use the + * header given to us. + */ + if (unlikely(!req->seqnum)) { + ret = check_header_template(req, hdr, lrhlen, datalen); + if (ret) + return ret; + goto done; + } + + hdr->bth[2] = cpu_to_be32( + set_pkt_bth_psn(hdr->bth[2], + (req_opcode(req->info.ctrl) == EXPECTED), + req->seqnum)); + + /* Set ACK request on last packet */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) + hdr->bth[2] |= cpu_to_be32(1UL << 31); + + /* Set the new offset */ + hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); + /* Expected packets have to fill in the new TID information */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + tidval = req->tids[req->tididx]; + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) { + return -EINVAL; + } + tidval = req->tids[req->tididx]; + } + omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= + KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : + KDETH_OM_SMALL_SHIFT; + /* Set KDETH.TIDCtrl based on value for this TID. */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, + EXP_TID_GET(tidval, CTRL)); + /* Set KDETH.TID based on value for this TID */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TID, + EXP_TID_GET(tidval, IDX)); + /* Clear KDETH.SH when DISABLE_SH flag is set */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) + KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); + /* + * Set the KDETH.OFFSET and KDETH.OM based on size of + * transfer. + */ + trace_hfi1_sdma_user_tid_info( + pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, + req->tidoffset, req->tidoffset >> omfactor, + omfactor != KDETH_OM_SMALL_SHIFT); + KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, + req->tidoffset >> omfactor); + KDETH_SET(hdr->kdeth.ver_tid_offset, OM, + omfactor != KDETH_OM_SMALL_SHIFT); + } +done: + trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, hdr, tidval); + return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); +} + +static int set_txreq_header_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen) +{ + u32 ahg[AHG_KDETH_ARRAY_SIZE]; + int idx = 0; + u8 omfactor; /* KDETH.OM */ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &req->hdr; + u16 pbclen = le16_to_cpu(hdr->pbc[0]); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); + size_t array_size = ARRAY_SIZE(ahg); + + if (PBC2LRH(pbclen) != lrhlen) { + /* PBC.PbcLengthDWs */ + idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12, + (__force u16)cpu_to_le16(LRH2PBC(lrhlen))); + if (idx < 0) + return idx; + /* LRH.PktLen (we need the full 16 bits due to byte swap) */ + idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16, + (__force u16)cpu_to_be16(lrhlen >> 2)); + if (idx < 0) + return idx; + } + + /* + * Do the common updates + */ + /* BTH.PSN and BTH.A */ + val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & + (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) + val32 |= 1UL << 31; + idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16, + (__force u16)cpu_to_be16(val32 >> 16)); + if (idx < 0) + return idx; + idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16, + (__force u16)cpu_to_be16(val32 & 0xffff)); + if (idx < 0) + return idx; + /* KDETH.Offset */ + idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16, + (__force u16)cpu_to_le16(req->koffset & 0xffff)); + if (idx < 0) + return idx; + idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16, + (__force u16)cpu_to_le16(req->koffset >> 16)); + if (idx < 0) + return idx; + if (req_opcode(req->info.ctrl) == EXPECTED) { + __le16 val; + + tidval = req->tids[req->tididx]; + + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) + return -EINVAL; + tidval = req->tids[req->tididx]; + } + omfactor = ((EXP_TID_GET(tidval, LEN) * + PAGE_SIZE) >= + KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : + KDETH_OM_SMALL_SHIFT; + /* KDETH.OM and KDETH.OFFSET (TID) */ + idx = ahg_header_set( + ahg, idx, array_size, 7, 0, 16, + ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | + ((req->tidoffset >> omfactor) + & 0x7fff))); + if (idx < 0) + return idx; + /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ + val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | + (EXP_TID_GET(tidval, IDX) & 0x3ff)); + + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { + val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, + INTR) << + AHG_KDETH_INTR_SHIFT)); + } else { + val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? + cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : + cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, + INTR) << + AHG_KDETH_INTR_SHIFT)); + } + + idx = ahg_header_set(ahg, idx, array_size, + 7, 16, 14, (__force u16)val); + if (idx < 0) + return idx; + } + + trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, req->sde->this_idx, + req->ahg_idx, ahg, idx, tidval); + sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_USE_AHG, + datalen, req->ahg_idx, idx, + ahg, sizeof(req->hdr), + user_sdma_txreq_cb); + + return idx; +} + +/** + * user_sdma_txreq_cb() - SDMA tx request completion callback. + * @txreq: valid sdma tx request + * @status: success/failure of request + * + * Called when the SDMA progress state machine gets notification that + * the SDMA descriptors for this tx request have been processed by the + * DMA engine. Called in interrupt context. + * Only do work on completed sequences. + */ +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) +{ + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + struct user_sdma_request *req; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + enum hfi1_sdma_comp_state state = COMPLETE; + + if (!tx->req) + return; + + req = tx->req; + pq = req->pq; + cq = req->cq; + + if (status != SDMA_TXREQ_S_OK) { + SDMA_DBG(req, "SDMA completion with error %d", + status); + WRITE_ONCE(req->has_error, 1); + state = ERROR; + } + + req->seqcomp = tx->seqnum; + kmem_cache_free(pq->txreq_cache, tx); + + /* sequence isn't complete? We are done */ + if (req->seqcomp != req->info.npkts - 1) + return; + + user_sdma_free_request(req); + set_comp_state(pq, cq, req->info.comp_idx, state, status); + pq_update(pq); +} + +static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) +{ + if (atomic_dec_and_test(&pq->n_reqs)) + wake_up(&pq->wait); +} + +static void user_sdma_free_request(struct user_sdma_request *req) +{ + if (!list_empty(&req->txps)) { + struct sdma_txreq *t, *p; + + list_for_each_entry_safe(t, p, &req->txps, list) { + struct user_sdma_txreq *tx = + container_of(t, struct user_sdma_txreq, txreq); + list_del_init(&t->list); + sdma_txclean(req->pq->dd, t); + kmem_cache_free(req->pq->txreq_cache, tx); + } + } + + kfree(req->tids); + clear_bit(req->info.comp_idx, req->pq->req_in_use); +} + +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, + struct hfi1_user_sdma_comp_q *cq, + u16 idx, enum hfi1_sdma_comp_state state, + int ret) +{ + if (state == ERROR) + cq->comps[idx].errcode = -ret; + smp_wmb(); /* make sure errcode is visible first */ + cq->comps[idx].status = state; + trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, + idx, state, ret); +} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h new file mode 100644 index 0000000000..742ec1470c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2023 - Cornelis Networks, Inc. + * Copyright(c) 2015 - 2018 Intel Corporation. + */ +#ifndef _HFI1_USER_SDMA_H +#define _HFI1_USER_SDMA_H + +#include <linux/device.h> +#include <linux/wait.h> + +#include "common.h" +#include "iowait.h" +#include "user_exp_rcv.h" +#include "mmu_rb.h" +#include "pinning.h" +#include "sdma.h" + +/* The maximum number of Data io vectors per message/request */ +#define MAX_VECTORS_PER_REQ 8 +/* + * Maximum number of packet to send from each message/request + * before moving to the next one. + */ +#define MAX_PKTS_PER_QUEUE 16 + +#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) + +#define req_opcode(x) \ + (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_version(x) \ + (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_iovcnt(x) \ + (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define BTH_SEQ_MASK 0x7ffull + +#define AHG_KDETH_INTR_SHIFT 12 +#define AHG_KDETH_SH_SHIFT 13 +#define AHG_KDETH_ARRAY_SIZE 9 + +#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) +#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) + +/** + * Build an SDMA AHG header update descriptor and save it to an array. + * @arr - Array to save the descriptor to. + * @idx - Index of the array at which the descriptor will be saved. + * @array_size - Size of the array arr. + * @dw - Update index into the header in DWs. + * @bit - Start bit. + * @width - Field width. + * @value - 16 bits of immediate data to write into the field. + * Returns -ERANGE if idx is invalid. If successful, returns the next index + * (idx + 1) of the array to be used for the next descriptor. + */ +static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, + u8 dw, u8 bit, u8 width, u16 value) +{ + if ((size_t)idx >= array_size) + return -ERANGE; + arr[idx++] = sdma_build_ahg_descriptor(value, dw, bit, width); + return idx; +} + +/* Tx request flag bits */ +#define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ +#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ + +enum pkt_q_sdma_state { + SDMA_PKT_Q_ACTIVE, + SDMA_PKT_Q_DEFERRED, +}; + +#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ + +#define SDMA_DBG(req, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ + (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ + ##__VA_ARGS__) + +struct hfi1_user_sdma_pkt_q { + u16 ctxt; + u16 subctxt; + u16 n_max_reqs; + atomic_t n_reqs; + u16 reqidx; + struct hfi1_devdata *dd; + struct kmem_cache *txreq_cache; + struct user_sdma_request *reqs; + unsigned long *req_in_use; + struct iowait busy; + enum pkt_q_sdma_state state; + wait_queue_head_t wait; + unsigned long unpinned; + struct mmu_rb_handler *handler; + atomic_t n_locked; +}; + +struct hfi1_user_sdma_comp_q { + u16 nentries; + struct hfi1_sdma_comp_entry *comps; +}; + +struct user_sdma_iovec { + struct list_head list; + struct iovec iov; + /* + * offset into the virtual address space of the vector at + * which we last left off. + */ + u64 offset; +}; + +/* evict operation argument */ +struct evict_data { + u32 cleared; /* count evicted so far */ + u32 target; /* target count to evict */ +}; + +struct user_sdma_request { + /* This is the original header from user space */ + struct hfi1_pkt_header hdr; + + /* Read mostly fields */ + struct hfi1_user_sdma_pkt_q *pq ____cacheline_aligned_in_smp; + struct hfi1_user_sdma_comp_q *cq; + /* + * Pointer to the SDMA engine for this request. + * Since different request could be on different VLs, + * each request will need it's own engine pointer. + */ + struct sdma_engine *sde; + struct sdma_req_info info; + /* TID array values copied from the tid_iov vector */ + u32 *tids; + /* total length of the data in the request */ + u32 data_len; + /* number of elements copied to the tids array */ + u16 n_tids; + /* + * We copy the iovs for this request (based on + * info.iovcnt). These are only the data vectors + */ + u8 data_iovs; + s8 ahg_idx; + + /* Writeable fields shared with interrupt */ + u16 seqcomp ____cacheline_aligned_in_smp; + u16 seqsubmitted; + + /* Send side fields */ + struct list_head txps ____cacheline_aligned_in_smp; + u16 seqnum; + /* + * KDETH.OFFSET (TID) field + * The offset can cover multiple packets, depending on the + * size of the TID entry. + */ + u32 tidoffset; + /* + * KDETH.Offset (Eager) field + * We need to remember the initial value so the headers + * can be updated properly. + */ + u32 koffset; + u32 sent; + /* TID index copied from the tid_iov vector */ + u16 tididx; + /* progress index moving along the iovs array */ + u8 iov_idx; + u8 has_error; + + struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; +} ____cacheline_aligned_in_smp; + +/* + * A single txreq could span up to 3 physical pages when the MTU + * is sufficiently large (> 4K). Each of the IOV pointers also + * needs it's own set of flags so the vector has been handled + * independently of each other. + */ +struct user_sdma_txreq { + /* Packet header for the txreq */ + struct hfi1_pkt_header hdr; + struct sdma_txreq txreq; + struct list_head list; + struct user_sdma_request *req; + u16 flags; + u16 seqnum; +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, + struct hfi1_filedata *fd); +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, + struct iovec *iovec, unsigned long dim, + unsigned long *count); +#endif /* _HFI1_USER_SDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c new file mode 100644 index 0000000000..fbdcfecb17 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -0,0 +1,1948 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2015 - 2020 Intel Corporation. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_user_verbs.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/utsname.h> +#include <linux/rculist.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <rdma/opa_addr.h> +#include <linux/nospec.h> + +#include "hfi.h" +#include "common.h" +#include "device.h" +#include "trace.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "debugfs.h" +#include "vnic.h" +#include "fault.h" +#include "affinity.h" +#include "ipoib.h" + +static unsigned int hfi1_lkey_table_size = 16; +module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int hfi1_max_pds = 0xFFFF; +module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int hfi1_max_ahs = 0xFFFF; +module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int hfi1_max_cqes = 0x2FFFFF; +module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int hfi1_max_cqs = 0x1FFFF; +module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int hfi1_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int hfi1_max_qps = 32768; +module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int hfi1_max_sges = 0x60; +module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int hfi1_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int hfi1_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int hfi1_max_srqs = 1024; +module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int hfi1_max_srq_sges = 128; +module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int hfi1_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +unsigned short piothreshold = 256; +module_param(piothreshold, ushort, S_IRUGO); +MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); + +static unsigned int sge_copy_mode; +module_param(sge_copy_mode, uint, S_IRUGO); +MODULE_PARM_DESC(sge_copy_mode, + "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS"); + +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status); + +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag); + +/* Length of buffer to create verbs txreq cache name */ +#define TXREQ_NAME_LEN 24 + +static uint wss_threshold = 80; +module_param(wss_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); +static uint wss_clean_period = 256; +module_param(wss_clean_period, uint, S_IRUGO); +MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [IB_WR_SEND_WITH_INV] = IB_WC_SEND, + [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV, + [IB_WR_REG_MR] = IB_WC_REG_MR +}; + +/* + * Length of header by opcode, 0 --> not supported + */ +const u8 hdr_len_by_opcode[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4, + [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4 + 8, + [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, + [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12 +}; + +static const opcode_handler opcode_handler_tbl[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, + [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, + + /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, + [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync, + [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack, + + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv, + /* CNP */ + [IB_OPCODE_CNP] = &hfi1_cnp_rcv +}; + +#define OPMASK 0x1f + +static const u32 pio_opmask[BIT(3)] = { + /* RC */ + [IB_OPCODE_RC >> 5] = + BIT(RC_OP(SEND_ONLY) & OPMASK) | + BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) | + BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) | + BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) | + BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) | + BIT(RC_OP(ACKNOWLEDGE) & OPMASK) | + BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) | + BIT(RC_OP(COMPARE_SWAP) & OPMASK) | + BIT(RC_OP(FETCH_ADD) & OPMASK), + /* UC */ + [IB_OPCODE_UC >> 5] = + BIT(UC_OP(SEND_ONLY) & OPMASK) | + BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) | + BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) | + BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK), +}; + +/* + * System image GUID. + */ +__be64 ib_hfi1_sys_image_guid; + +/* + * Make sure the QP is ready and able to accept the given opcode. + */ +static inline opcode_handler qp_ok(struct hfi1_packet *packet) +{ + if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) + return NULL; + if (((packet->opcode & RVT_OPCODE_QP_MASK) == + packet->qp->allowed_ops) || + (packet->opcode == IB_OPCODE_CNP)) + return opcode_handler_tbl[packet->opcode]; + + return NULL; +} + +static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) +{ +#ifdef CONFIG_FAULT_INJECTION + if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) { + /* + * In order to drop non-IB traffic we + * set PbcInsertHrc to NONE (0x2). + * The packet will still be delivered + * to the receiving node but a + * KHdrHCRCErr (KDETH packet with a bad + * HCRC) will be triggered and the + * packet will not be delivered to the + * correct context. + */ + pbc &= ~PBC_INSERT_HCRC_SMASK; + pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT; + } else { + /* + * In order to drop regular verbs + * traffic we set the PbcTestEbp + * flag. The packet will still be + * delivered to the receiving node but + * a 'late ebp error' will be + * triggered and will be dropped. + */ + pbc |= PBC_TEST_EBP; + } +#endif + return pbc; +} + +static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet) +{ + if (packet->qp->ibqp.qp_type != IB_QPT_RC || + !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) + return NULL; + if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA) + return opcode_handler_tbl[opcode]; + return NULL; +} + +void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler opcode_handler; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + + /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */ + if (unlikely(tlen < 15 * sizeof(u32))) + goto drop; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh != HFI1_LRH_BTH) + goto drop; + + packet->ohdr = &hdr->u.oth; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* verbs_qp can be picked up from any tid_rdma header struct */ + qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) & + RVT_QPN_MASK; + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto drop_rcu; + spin_lock_irqsave(&packet->qp->r_lock, flags); + opcode_handler = tid_qp_ok(opcode, packet); + if (likely(opcode_handler)) + opcode_handler(packet); + else + goto drop_unlock; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + + return; +drop_unlock: + spin_unlock_irqrestore(&packet->qp->r_lock, flags); +drop_rcu: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + +void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler opcode_handler; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + + /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */ + if (unlikely(tlen < 15 * sizeof(u32))) + goto drop; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh != HFI1_LRH_BTH) + goto drop; + + packet->ohdr = &hdr->u.oth; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* verbs_qp can be picked up from any tid_rdma header struct */ + qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) & + RVT_QPN_MASK; + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto drop_rcu; + spin_lock_irqsave(&packet->qp->r_lock, flags); + opcode_handler = tid_qp_ok(opcode, packet); + if (likely(opcode_handler)) + opcode_handler(packet); + else + goto drop_unlock; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + + return; +drop_unlock: + spin_unlock_irqrestore(&packet->qp->r_lock, flags); +drop_rcu: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + +static int hfi1_do_pkey_check(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_16b_header *hdr = packet->hdr; + u16 pkey; + + /* Pkey check needed only for bypass packets */ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return 0; + + /* Perform pkey check */ + pkey = hfi1_16B_get_pkey(hdr); + return ingress_pkey_check(ppd, pkey, packet->sc, + packet->qp->s_pkey_index, + packet->slid, true); +} + +static inline void hfi1_handle_packet(struct hfi1_packet *packet, + bool is_mcast) +{ + u32 qp_num; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler packet_handler; + unsigned long flags; + + inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]); + + if (unlikely(is_mcast)) { + struct rvt_mcast *mcast; + struct rvt_mcast_qp *p; + + if (!packet->grh) + goto drop; + mcast = rvt_mcast_find(&ibp->rvp, + &packet->grh->dgid, + opa_get_lid(packet->dlid, 9B)); + if (!mcast) + goto drop; + rcu_read_lock(); + list_for_each_entry_rcu(p, &mcast->qp_list, list) { + packet->qp = p->qp; + if (hfi1_do_pkey_check(packet)) + goto unlock_drop; + spin_lock_irqsave(&packet->qp->r_lock, flags); + packet_handler = qp_ok(packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + } + rcu_read_unlock(); + /* + * Notify rvt_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + /* Get the destination QP number. */ + if (packet->etype == RHF_RCV_TYPE_BYPASS && + hfi1_16B_get_l4(packet->hdr) == OPA_16B_L4_FM) + qp_num = hfi1_16B_get_dest_qpn(packet->mgmt); + else + qp_num = ib_bth_get_qpn(packet->ohdr); + + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto unlock_drop; + + if (hfi1_do_pkey_check(packet)) + goto unlock_drop; + + spin_lock_irqsave(&packet->qp->r_lock, flags); + packet_handler = qp_ok(packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + } + return; +unlock_drop: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + +/** + * hfi1_ib_rcv - process an incoming packet + * @packet: data packet information + * + * This is called to process an incoming packet at interrupt level. + */ +void hfi1_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +void hfi1_16B_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + trace_input_ibhdr(rcd->dd, packet, false); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(struct timer_list *t) +{ + struct hfi1_ibdev *dev = from_timer(dev, t, mem_timer); + struct list_head *list = &dev->memwait; + struct rvt_qp *qp = NULL; + struct iowait *wait; + unsigned long flags; + struct hfi1_qp_priv *priv; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(list)) { + wait = list_first_entry(list, struct iowait, list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + /* refcount held until actual wake up */ + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + if (qp) + hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM); +} + +/* + * This is called with progress side lock held. + */ +/* New API */ +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status) +{ + struct verbs_txreq *tx = + container_of(cookie, struct verbs_txreq, txreq); + struct rvt_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) { + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct hfi1_opa_header *hdr; + + hdr = &tx->phdr.hdr; + if (unlikely(status == SDMA_TXREQ_S_ABORTED)) + hfi1_rc_verbs_aborted(qp, hdr); + hfi1_rc_send_complete(qp, hdr); + } + spin_unlock(&qp->s_lock); + + hfi1_put_txreq(tx); +} + +void hfi1_wait_kmem(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct ib_device *ibdev = ibqp->device; + struct hfi1_ibdev *dev = to_idev(ibdev); + + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + rvt_get_qp(qp); + } +} + +static int wait_kmem(struct hfi1_ibdev *dev, + struct rvt_qp *qp, + struct hfi1_pkt_state *ps) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + list_add_tail(&ps->s_txreq->txreq.list, + &ps->wait->tx_head); + hfi1_wait_kmem(qp); + write_sequnlock(&dev->iowait_lock); + hfi1_qp_unbusy(qp, ps->wait); + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +/* + * This routine calls txadds for each sg entry. + * + * Add failures will revert the sge cursor + */ +static noinline int build_verbs_ulp_payload( + struct sdma_engine *sde, + u32 length, + struct verbs_txreq *tx) +{ + struct rvt_sge_state *ss = tx->ss; + struct rvt_sge *sg_list = ss->sg_list; + struct rvt_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 len; + int ret = 0; + + while (length) { + len = rvt_get_sge_length(&ss->sge, length); + WARN_ON_ONCE(len == 0); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + ss->sge.vaddr, + len); + if (ret) + goto bail_txadd; + rvt_update_sge(ss, len, false); + length -= len; + } + return ret; +bail_txadd: + /* unwind cursor */ + ss->sge = sge; + ss->num_sge = num_sge; + ss->sg_list = sg_list; + return ret; +} + +/** + * update_tx_opstats - record stats by opcode + * @qp: the qp + * @ps: transmit packet state + * @plen: the plen in dwords + * + * This is a routine to record the tx opstats after a + * packet has been presented to the egress mechanism. + */ +static void update_tx_opstats(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u32 plen) +{ +#ifdef CONFIG_DEBUG_FS + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_opcode_stats_perctx *s = get_cpu_ptr(dd->tx_opstats); + + inc_opstats(plen * 4, &s->stats[ps->opcode]); + put_cpu_ptr(s); +#endif +} + +/* + * Build the number of DMA descriptors needed to send length bytes of data. + * + * NOTE: DMA mapping is held in the tx until completed in the ring or + * the tx desc is freed without having been submitted to the ring + * + * This routine ensures all the helper routine calls succeed. + */ +/* New API */ +static int build_verbs_tx_desc( + struct sdma_engine *sde, + u32 length, + struct verbs_txreq *tx, + struct hfi1_ahg_info *ahg_info, + u64 pbc) +{ + int ret = 0; + struct hfi1_sdma_header *phdr = &tx->phdr; + u16 hdrbytes = (tx->hdr_dwords + sizeof(pbc) / 4) << 2; + u8 extra_bytes = 0; + + if (tx->phdr.hdr.hdr_type) { + /* + * hdrbytes accounts for PBC. Need to subtract 8 bytes + * before calculating padding. + */ + extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) + + (SIZE_OF_CRC << 2) + SIZE_OF_LT; + } + if (!ahg_info->ahgcount) { + ret = sdma_txinit_ahg( + &tx->txreq, + ahg_info->tx_flags, + hdrbytes + length + + extra_bytes, + ahg_info->ahgidx, + 0, + NULL, + 0, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + phdr->pbc = cpu_to_le64(pbc); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + phdr, + hdrbytes); + if (ret) + goto bail_txadd; + } else { + ret = sdma_txinit_ahg( + &tx->txreq, + ahg_info->tx_flags, + length, + ahg_info->ahgidx, + ahg_info->ahgcount, + ahg_info->ahgdesc, + hdrbytes, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + } + /* add the ulp payload - if any. tx->ss can be NULL for acks */ + if (tx->ss) { + ret = build_verbs_ulp_payload(sde, length, tx); + if (ret) + goto bail_txadd; + } + + /* add icrc, lt byte, and padding to flit */ + if (extra_bytes) + ret = sdma_txadd_daddr(sde->dd, &tx->txreq, sde->dd->sdma_pad_phys, + extra_bytes); + +bail_txadd: + return ret; +} + +static u64 update_hcrc(u8 opcode, u64 pbc) +{ + if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) { + pbc &= ~PBC_INSERT_HCRC_SMASK; + pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT; + } + return pbc; +} + +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ahg_info *ahg_info = priv->s_ahg; + u32 hdrwords = ps->s_txreq->hdr_dwords; + u32 len = ps->s_txreq->s_cur_size; + u32 plen; + struct hfi1_ibdev *dev = ps->dev; + struct hfi1_pportdata *ppd = ps->ppd; + struct verbs_txreq *tx; + u8 sc5 = priv->s_sc; + int ret; + u32 dwords; + + if (ps->s_txreq->phdr.hdr.hdr_type) { + u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len); + + dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) >> 2; + } else { + dwords = (len + 3) >> 2; + } + plen = hdrwords + dwords + sizeof(pbc) / 4; + + tx = ps->s_txreq; + if (!sdma_txreq_built(&tx->txreq)) { + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + + /* No vl15 here */ + /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ + if (ps->s_txreq->phdr.hdr.hdr_type) + pbc |= PBC_PACKET_BYPASS | + PBC_INSERT_BYPASS_ICRC; + else + pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + + pbc = create_pbc(ppd, + pbc, + qp->srate_mbps, + vl, + plen); + + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); + else + /* Update HCRC based on packet opcode */ + pbc = update_hcrc(ps->opcode, pbc); + } + tx->wqe = qp->s_wqe; + ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc); + if (unlikely(ret)) + goto bail_build; + } + ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent); + if (unlikely(ret < 0)) { + if (ret == -ECOMM) + goto bail_ecomm; + return ret; + } + + update_tx_opstats(qp, ps, plen); + trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); + return ret; + +bail_ecomm: + /* The current one got "sent" */ + return 0; +bail_build: + ret = wait_kmem(dev, qp, ps); + if (!ret) { + /* free txreq - bad state */ + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + } + return ret; +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_devdata *dd = sc->dd; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, sc_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + write_seqlock(&sc->waitlock); + list_add_tail(&ps->s_txreq->txreq.list, + &ps->wait->tx_head); + if (list_empty(&priv->s_iowait.list)) { + struct hfi1_ibdev *dev = &dd->verbs_dev; + int was_empty; + + dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); + dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN); + qp->s_flags |= flag; + was_empty = list_empty(&sc->piowait); + iowait_get_priority(&priv->s_iowait); + iowait_queue(ps->pkts_sent, &priv->s_iowait, + &sc->piowait); + priv->s_iowait.lock = &sc->waitlock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); + rvt_get_qp(qp); + /* counting: only call wantpiobuf_intr if first user */ + if (was_empty) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock(&sc->waitlock); + hfi1_qp_unbusy(qp, ps->wait); + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static void verbs_pio_complete(void *arg, int code) +{ + struct rvt_qp *qp = (struct rvt_qp *)arg; + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_pio_dec(&priv->s_iowait)) + iowait_drain_wakeup(&priv->s_iowait); +} + +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc) +{ + struct hfi1_qp_priv *priv = qp->priv; + u32 hdrwords = ps->s_txreq->hdr_dwords; + struct rvt_sge_state *ss = ps->s_txreq->ss; + u32 len = ps->s_txreq->s_cur_size; + u32 dwords; + u32 plen; + struct hfi1_pportdata *ppd = ps->ppd; + u32 *hdr; + u8 sc5; + unsigned long flags = 0; + struct send_context *sc; + struct pio_buf *pbuf; + int wc_status = IB_WC_SUCCESS; + int ret = 0; + pio_release_cb cb = NULL; + u8 extra_bytes = 0; + + if (ps->s_txreq->phdr.hdr.hdr_type) { + u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len); + + extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT; + dwords = (len + extra_bytes) >> 2; + hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah; + } else { + dwords = (len + 3) >> 2; + hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh; + } + plen = hdrwords + dwords + sizeof(pbc) / 4; + + /* only RC/UC use complete */ + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + cb = verbs_pio_complete; + break; + default: + break; + } + + /* vl15 special case taken care of in ud.c */ + sc5 = priv->s_sc; + sc = ps->s_txreq->psc; + + if (likely(pbc == 0)) { + u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + + /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ + if (ps->s_txreq->phdr.hdr.hdr_type) + pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + else + pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + + pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); + else + /* Update HCRC based on packet opcode */ + pbc = update_hcrc(ps->opcode, pbc); + } + if (cb) + iowait_pio_inc(&priv->s_iowait); + pbuf = sc_buffer_alloc(sc, plen, cb, qp); + if (IS_ERR_OR_NULL(pbuf)) { + if (cb) + verbs_pio_complete(qp, 0); + if (IS_ERR(pbuf)) { + /* + * If we have filled the PIO buffers to capacity and are + * not in an active state this request is not going to + * go out to so just complete it with an error or else a + * ULP or the core may be stuck waiting. + */ + hfi1_cdbg( + PIO, + "alloc failed. state not active, completing"); + wc_status = IB_WC_GENERAL_ERR; + goto pio_bail; + } else { + /* + * This is a normal occurrence. The PIO buffs are full + * up but we are still happily sending, well we could be + * so lets continue to queue the request. + */ + hfi1_cdbg(PIO, "alloc failed. state active, queuing"); + ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO); + if (!ret) + /* txreq not queued - free */ + goto bail; + /* tx consumed in wait */ + return ret; + } + } + + if (dwords == 0) { + pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); + } else { + seg_pio_copy_start(pbuf, pbc, + hdr, hdrwords * 4); + if (ss) { + while (len) { + void *addr = ss->sge.vaddr; + u32 slen = rvt_get_sge_length(&ss->sge, len); + + rvt_update_sge(ss, slen, false); + seg_pio_copy_mid(pbuf, addr, slen); + len -= slen; + } + } + /* add icrc, lt byte, and padding to flit */ + if (extra_bytes) + seg_pio_copy_mid(pbuf, ppd->dd->sdma_pad_dma, + extra_bytes); + + seg_pio_copy_end(pbuf); + } + + update_tx_opstats(qp, ps, plen); + trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); + +pio_bail: + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_wqe) { + rvt_send_complete(qp, qp->s_wqe, wc_status); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + if (unlikely(wc_status == IB_WC_GENERAL_ERR)) + hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr); + hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + ret = 0; + +bail: + hfi1_put_txreq(ps->s_txreq); + return ret; +} + +/* + * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the partition key table), return 0 + * otherwise. Use the matching criteria for egress partition keys + * specified in the OPAv1 spec., section 9.1l.7. + */ +static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 mentry = ent & PKEY_LOW_15_MASK; + + if (mkey == mentry) { + /* + * If pkey[15] is set (full partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (pkey & PKEY_MEMBER_MASK) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/** + * egress_pkey_check - check P_KEY of a packet + * @ppd: Physical IB port data + * @slid: SLID for packet + * @pkey: PKEY for header + * @sc5: SC for packet + * @s_pkey_index: It will be used for look up optimization for kernel contexts + * only. If it is negative value, then it means user contexts is calling this + * function. + * + * It checks if hdr's pkey is valid. + * + * Return: 0 on success, otherwise, 1 + */ +int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, + u8 sc5, int8_t s_pkey_index) +{ + struct hfi1_devdata *dd; + int i; + int is_user_ctxt_mechanism = (s_pkey_index < 0); + + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* + * For the kernel contexts only, if a qp is passed into the function, + * the most likely matching pkey has index qp->s_pkey_index + */ + if (!is_user_ctxt_mechanism && + egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) { + return 0; + } + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } +bad: + /* + * For the user-context mechanism, the P_KEY check would only happen + * once per SDMA request, not once per packet. Therefore, there's no + * need to increment the counter for the user-context mechanism. + */ + if (!is_user_ctxt_mechanism) { + incr_cntr64(&ppd->port_xmit_constraint_errors); + dd = ppd->dd; + if (!(dd->err_info_xmit_constraint.status & + OPA_EI_STATUS_SMASK)) { + dd->err_info_xmit_constraint.status |= + OPA_EI_STATUS_SMASK; + dd->err_info_xmit_constraint.slid = slid; + dd->err_info_xmit_constraint.pkey = pkey; + } + } + return 1; +} + +/* + * get_send_routine - choose an egress routine + * + * Choose an egress routine based on QP type + * and size + */ +static inline send_routine get_send_routine(struct rvt_qp *qp, + struct hfi1_pkt_state *ps) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct verbs_txreq *tx = ps->s_txreq; + + if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA))) + return dd->process_pio_send; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return dd->process_pio_send; + case IB_QPT_GSI: + case IB_QPT_UD: + break; + case IB_QPT_UC: + case IB_QPT_RC: + priv->s_running_pkt_size = + (tx->s_cur_size + priv->s_running_pkt_size) / 2; + if (piothreshold && + priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) && + (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) && + iowait_sdma_pending(&priv->s_iowait) == 0 && + !sdma_txreq_built(&tx->txreq)) + return dd->process_pio_send; + break; + default: + break; + } + return dd->process_dma_send; +} + +/** + * hfi1_verbs_send - send a packet + * @qp: the QP to send on + * @ps: the state of the packet to send + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. + */ +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr = NULL; + send_routine sr; + int ret; + u16 pkey; + u32 slid; + u8 l4 = 0; + + /* locate the pkey within the headers */ + if (ps->s_txreq->phdr.hdr.hdr_type) { + struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah; + + l4 = hfi1_16B_get_l4(hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr->u.oth; + else if (l4 == OPA_16B_L4_IB_GLOBAL) + ohdr = &hdr->u.l.oth; + + slid = hfi1_16B_get_slid(hdr); + pkey = hfi1_16B_get_pkey(hdr); + } else { + struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh; + u8 lnh = ib_get_lnh(hdr); + + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + slid = ib_get_slid(hdr); + pkey = ib_bth_get_pkey(ohdr); + } + + if (likely(l4 != OPA_16B_L4_FM)) + ps->opcode = ib_bth_get_opcode(ohdr); + else + ps->opcode = IB_OPCODE_UD_SEND_ONLY; + + sr = get_send_routine(qp, ps); + ret = egress_pkey_check(dd->pport, slid, pkey, + priv->s_sc, qp->s_pkey_index); + if (unlikely(ret)) { + /* + * The value we are returning here does not get propagated to + * the verbs caller. Thus we need to complete the request with + * error otherwise the caller could be sitting waiting on the + * completion event. Only do this for PIO. SDMA has its own + * mechanism for handling the errors. So for SDMA we can just + * return. + */ + if (sr == dd->process_pio_send) { + unsigned long flags; + + hfi1_cdbg(PIO, "%s() Failed. Completing with err", + __func__); + spin_lock_irqsave(&qp->s_lock, flags); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return -EINVAL; + } + if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait)) + return pio_wait(qp, + ps->s_txreq->psc, + ps, + HFI1_S_WAIT_PIO_DRAIN); + return sr(qp, ps, 0); +} + +/** + * hfi1_fill_device_attr - Fill in rvt dev info device attributes. + * @dd: the device data structure + */ +static void hfi1_fill_device_attr(struct hfi1_devdata *dd) +{ + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u32 ver = dd->dc8051_ver; + + memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); + + rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) | + ((u64)(dc8051_ver_min(ver)) << 16) | + (u64)dc8051_ver_patch(ver); + + rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | + IB_DEVICE_MEM_MGT_EXTENSIONS; + rdi->dparms.props.kernel_cap_flags = IBK_RDMA_NETDEV_OPA; + rdi->dparms.props.page_size_cap = PAGE_SIZE; + rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; + rdi->dparms.props.vendor_part_id = dd->pcidev->device; + rdi->dparms.props.hw_ver = dd->minrev; + rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; + rdi->dparms.props.max_mr_size = U64_MAX; + rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; + rdi->dparms.props.max_qp = hfi1_max_qps; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); + rdi->dparms.props.max_send_sge = hfi1_max_sges; + rdi->dparms.props.max_recv_sge = hfi1_max_sges; + rdi->dparms.props.max_sge_rd = hfi1_max_sges; + rdi->dparms.props.max_cq = hfi1_max_cqs; + rdi->dparms.props.max_ah = hfi1_max_ahs; + rdi->dparms.props.max_cqe = hfi1_max_cqes; + rdi->dparms.props.max_pd = hfi1_max_pds; + rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; + rdi->dparms.props.max_qp_init_rd_atom = 255; + rdi->dparms.props.max_srq = hfi1_max_srqs; + rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs; + rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges; + rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; + rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd); + rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps; + rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached; + rdi->dparms.props.max_total_mcast_qp_attach = + rdi->dparms.props.max_mcast_qp_attach * + rdi->dparms.props.max_mcast_grp; +} + +static inline u16 opa_speed_to_ib(u16 in) +{ + u16 out = 0; + + if (in & OPA_LINK_SPEED_25G) + out |= IB_SPEED_EDR; + if (in & OPA_LINK_SPEED_12_5G) + out |= IB_SPEED_FDR; + + return out; +} + +/* + * Convert a single OPA link width (no multiple flags) to an IB value. + * A zero OPA link width means link down, which means the IB width value + * is a don't care. + */ +static inline u16 opa_width_to_ib(u16 in) +{ + switch (in) { + case OPA_LINK_WIDTH_1X: + /* map 2x and 3x to 1x as they don't exist in IB */ + case OPA_LINK_WIDTH_2X: + case OPA_LINK_WIDTH_3X: + return IB_WIDTH_1X; + default: /* link down or unknown, return our largest width */ + case OPA_LINK_WIDTH_4X: + return IB_WIDTH_4X; + } +} + +static int query_port(struct rvt_dev_info *rdi, u32 port_num, + struct ib_port_attr *props) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + u32 lid = ppd->lid; + + /* props being zeroed by the caller, avoid zeroing it here */ + props->lid = lid ? lid : 0; + props->lmc = ppd->lmc; + /* OPA logical states match IB logical states */ + props->state = driver_lstate(ppd); + props->phys_state = driver_pstate(ppd); + props->gid_tbl_len = HFI1_GUIDS_PER_PORT; + props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); + /* see rate_show() in ib core/sysfs.c */ + props->active_speed = opa_speed_to_ib(ppd->link_speed_active); + props->max_vl_num = ppd->vls_supported; + + /* Once we are a "first class" citizen and have added the OPA MTUs to + * the core we can advertise the larger MTU enum to the ULPs, for now + * advertise only 4K. + * + * Those applications which are either OPA aware or pass the MTU enum + * from the Path Records to us will get the new 8k MTU. Those that + * attempt to process the MTU enum may fail in various ways. + */ + props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? + 4096 : hfi1_max_mtu), IB_MTU_4096); + props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : + mtu_to_enum(ppd->ibmtu, IB_MTU_4096); + props->phys_mtu = hfi1_max_mtu; + + return 0; +} + +static int modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, + IB_DEVICE_NODE_DESC_MAX); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_hfi1_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int shut_down_port(struct rvt_dev_info *rdi, u32 port_num) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, + OPA_LINKDOWN_REASON_UNKNOWN); + return set_link_state(ppd, HLS_DN_DOWNDEF); +} + +static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid) +{ + struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp); + + if (guid_index >= HFI1_GUIDS_PER_PORT) + return -EINVAL; + + *guid = get_sguid(ibp, guid_index); + return 0; +} + +/* + * convert ah port,sl to sc + */ +u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, rdma_ah_get_port_num(ah)); + + return ibp->sl_to_sc[rdma_ah_get_sl(ah)]; +} + +static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + u8 sl; + + if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + /* test the mapping for validity */ + ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); + ppd = ppd_from_ibp(ibp); + dd = dd_from_ppd(ppd); + + sl = rdma_ah_get_sl(ah_attr); + if (sl >= ARRAY_SIZE(ibp->sl_to_sc)) + return -EINVAL; + sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc)); + + sc5 = ibp->sl_to_sc[sl]; + if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) + return -EINVAL; + return 0; +} + +static void hfi1_notify_new_ah(struct ib_device *ibdev, + struct rdma_ah_attr *ah_attr, + struct rvt_ah *ah) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + struct rdma_ah_attr *attr = &ah->attr; + + /* + * Do not trust reading anything from rvt_ah at this point as it is not + * done being setup. We can however modify things which we need to set. + */ + + ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)]; + hfi1_update_ah_attr(ibdev, attr); + hfi1_make_opa_lid(attr); + dd = dd_from_ppd(ppd); + ah->vl = sc_to_vlt(dd, sc5); + if (ah->vl < num_vls || ah->vl == 15) + ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); +} + +/** + * hfi1_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the hfi1_ib device + */ +unsigned hfi1_get_npkeys(struct hfi1_devdata *dd) +{ + return ARRAY_SIZE(dd->pport[0].pkeys); +} + +static void init_ibport(struct hfi1_pportdata *ppd) +{ + struct hfi1_ibport *ibp = &ppd->ibport_data; + size_t sz = ARRAY_SIZE(ibp->sl_to_sc); + int i; + + for (i = 0; i < sz; i++) { + ibp->sl_to_sc[i] = i; + ibp->sc_to_sl[i] = i; + } + + for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++) + INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list); + timer_setup(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, 0); + + spin_lock_init(&ibp->rvp.lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->rvp.sm_lid = 0; + /* + * Below should only set bits defined in OPA PortInfo.CapabilityMask + * and PortInfo.CapabilityMask3 + */ + ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | + IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported; + ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); + RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); +} + +static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct hfi1_ibdev *dev = dev_from_rdi(rdi); + u32 ver = dd_from_dev(dev)->dc8051_ver; + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver), + dc8051_ver_min(ver), dc8051_ver_patch(ver)); +} + +static const char * const driver_cntr_names[] = { + /* must be element 0*/ + "DRIVER_KernIntr", + "DRIVER_ErrorIntr", + "DRIVER_Tx_Errs", + "DRIVER_Rcv_Errs", + "DRIVER_HW_Errs", + "DRIVER_NoPIOBufs", + "DRIVER_CtxtsOpen", + "DRIVER_RcvLen_Errs", + "DRIVER_EgrBufFull", + "DRIVER_EgrHdrFull" +}; + +static struct rdma_stat_desc *dev_cntr_descs; +static struct rdma_stat_desc *port_cntr_descs; +int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); +static int num_dev_cntrs; +static int num_port_cntrs; + +/* + * Convert a list of names separated by '\n' into an array of NULL terminated + * strings. Optionally some entries can be reserved in the array to hold extra + * external strings. + */ +static int init_cntr_names(const char *names_in, const size_t names_len, + int num_extra_names, int *num_cntrs, + struct rdma_stat_desc **cntr_descs) +{ + struct rdma_stat_desc *names_out; + char *p; + int i, n; + + n = 0; + for (i = 0; i < names_len; i++) + if (names_in[i] == '\n') + n++; + + names_out = kzalloc((n + num_extra_names) * sizeof(*names_out) + + names_len, + GFP_KERNEL); + if (!names_out) { + *num_cntrs = 0; + *cntr_descs = NULL; + return -ENOMEM; + } + + p = (char *)&names_out[n + num_extra_names]; + memcpy(p, names_in, names_len); + + for (i = 0; i < n; i++) { + names_out[i].name = p; + p = strchr(p, '\n'); + *p++ = '\0'; + } + + *num_cntrs = n; + *cntr_descs = names_out; + return 0; +} + +static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev) +{ + if (!dev_cntr_descs) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int i, err; + + err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, + num_driver_cntrs, + &num_dev_cntrs, &dev_cntr_descs); + if (err) + return NULL; + + for (i = 0; i < num_driver_cntrs; i++) + dev_cntr_descs[num_dev_cntrs + i].name = + driver_cntr_names[i]; + } + return rdma_alloc_hw_stats_struct(dev_cntr_descs, + num_dev_cntrs + num_driver_cntrs, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static struct rdma_hw_stats *hfi_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num) +{ + if (!port_cntr_descs) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int err; + + err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, + 0, + &num_port_cntrs, &port_cntr_descs); + if (err) + return NULL; + } + return rdma_alloc_hw_stats_struct(port_cntr_descs, num_port_cntrs, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static u64 hfi1_sps_ints(void) +{ + unsigned long index, flags; + struct hfi1_devdata *dd; + u64 sps_ints = 0; + + xa_lock_irqsave(&hfi1_dev_table, flags); + xa_for_each(&hfi1_dev_table, index, dd) { + sps_ints += get_all_cpu_total(dd->int_counter); + } + xa_unlock_irqrestore(&hfi1_dev_table, flags); + return sps_ints; +} + +static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port, int index) +{ + u64 *values; + int count; + + if (!port) { + u64 *stats = (u64 *)&hfi1_stats; + int i; + + hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values); + values[num_dev_cntrs] = hfi1_sps_ints(); + for (i = 1; i < num_driver_cntrs; i++) + values[num_dev_cntrs + i] = stats[i]; + count = num_dev_cntrs + num_driver_cntrs; + } else { + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values); + count = num_port_cntrs; + } + + memcpy(stats->value, values, count * sizeof(u64)); + return count; +} + +static const struct ib_device_ops hfi1_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_HFI1, + + .alloc_hw_device_stats = hfi1_alloc_hw_device_stats, + .alloc_hw_port_stats = hfi_alloc_hw_port_stats, + .alloc_rdma_netdev = hfi1_vnic_alloc_rn, + .device_group = &ib_hfi1_attr_group, + .get_dev_fw_str = hfi1_get_dev_fw_str, + .get_hw_stats = get_hw_stats, + .modify_device = modify_device, + .port_groups = hfi1_attr_port_groups, + /* keep process mad in the driver */ + .process_mad = hfi1_process_mad, + .rdma_netdev_get_params = hfi1_ipoib_rn_get_params, +}; + +/** + * hfi1_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return 0 if successful, errno if unsuccessful. + */ +int hfi1_register_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->rdi.ibdev; + struct hfi1_pportdata *ppd = dd->pport; + struct hfi1_ibport *ibp = &ppd->ibport_data; + unsigned i; + int ret; + + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. */ + + timer_setup(&dev->mem_timer, mem_timer, 0); + + seqlock_init(&dev->iowait_lock); + seqlock_init(&dev->txwait_lock); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + + ret = verbs_txreq_init(dev); + if (ret) + goto err_verbs_txreq; + + /* Use first-port GUID as node guid */ + ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX); + + /* + * The system image GUID is supposed to be the same for all + * HFIs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_hfi1_sys_image_guid) + ib_hfi1_sys_image_guid = ibdev->node_guid; + ibdev->phys_port_cnt = dd->num_pports; + ibdev->dev.parent = &dd->pcidev->dev; + + ib_set_device_ops(ibdev, &hfi1_dev_ops); + + strscpy(ibdev->node_desc, init_utsname()->nodename, + sizeof(ibdev->node_desc)); + + /* + * Fill in rvt info object. + */ + dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; + dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; + dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; + dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be; + dd->verbs_dev.rdi.driver_f.query_port_state = query_port; + dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port; + dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg; + /* + * Fill in rvt info device attributes. + */ + hfi1_fill_device_attr(dd); + + /* queue pair */ + dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size; + dd->verbs_dev.rdi.dparms.qpn_start = 0; + dd->verbs_dev.rdi.dparms.qpn_inc = 1; + dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; + dd->verbs_dev.rdi.dparms.qpn_res_start = RVT_KDETH_QP_BASE; + dd->verbs_dev.rdi.dparms.qpn_res_end = RVT_AIP_QP_MAX; + dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; + dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; + dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; + dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA | + RDMA_CORE_CAP_OPA_AH; + dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; + + dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init; + dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; + dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt; + dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters; + dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue; + dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp; + dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu; + dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; + dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; + dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; + dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = + hfi1_comp_vect_mappings_lookup; + + /* completeion queue */ + dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus; + dd->verbs_dev.rdi.dparms.node = dd->node; + + /* misc settings */ + dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */ + dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; + dd->verbs_dev.rdi.dparms.nports = dd->num_pports; + dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.reserved_operations = 1; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT; + + /* post send table */ + dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode; + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) + rvt_init_port(&dd->verbs_dev.rdi, + &ppd->ibport_data.rvp, + i, + ppd->pkeys); + + ret = rvt_register_device(&dd->verbs_dev.rdi); + if (ret) + goto err_verbs_txreq; + + ret = hfi1_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + return ret; + +err_class: + rvt_unregister_device(&dd->verbs_dev.rdi); +err_verbs_txreq: + verbs_txreq_exit(dev); + dd_dev_err(dd, "cannot register verbs: %d!\n", -ret); + return ret; +} + +void hfi1_unregister_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + + hfi1_verbs_unregister_sysfs(dd); + + rvt_unregister_device(&dd->verbs_dev.rdi); + + if (!list_empty(&dev->txwait)) + dd_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + dd_dev_err(dd, "memwait list not empty!\n"); + + del_timer_sync(&dev->mem_timer); + verbs_txreq_exit(dev); + + kfree(dev_cntr_descs); + kfree(port_cntr_descs); + dev_cntr_descs = NULL; + port_cntr_descs = NULL; +} + +void hfi1_cnp_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_header *hdr = packet->hdr; + struct rvt_qp *qp = packet->qp; + u32 lqpn, rqpn = 0; + u16 rlid = 0; + u8 sl, sc5, svc_type; + + switch (packet->qp->ibqp.qp_type) { + case IB_QPT_UC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + case IB_QPT_RC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_RC; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + svc_type = IB_CC_SVCTYPE_UD; + break; + default: + ibp->rvp.n_pkt_drops++; + return; + } + + sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); + sl = ibp->sc_to_sl[sc5]; + lqpn = qp->ibqp.qp_num; + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); +} diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h new file mode 100644 index 0000000000..7f30f32b34 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2015 - 2018 Intel Corporation. + */ + +#ifndef HFI1_VERBS_H +#define HFI1_VERBS_H + +#include <linux/types.h> +#include <linux/seqlock.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/kref.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_hdrs.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> +#include <rdma/rdmavt_cq.h> + +struct hfi1_ctxtdata; +struct hfi1_pportdata; +struct hfi1_devdata; +struct hfi1_packet; + +#include "iowait.h" +#include "tid_rdma.h" +#include "opfn.h" + +#define HFI1_MAX_RDMA_ATOMIC 16 + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define HFI1_UVERBS_ABI_VERSION 2 + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0) + +#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +#define OPA_BTH_MIG_REQ BIT(31) + +#define RC_OP(x) IB_OPCODE_RC_##x +#define UC_OP(x) IB_OPCODE_UC_##x + +/* flags passed by hfi1_ib_rcv() */ +enum { + HFI1_HAS_GRH = (1 << 0), +}; + +#define LRH_16B_BYTES (sizeof_field(struct hfi1_16b_header, lrh)) +#define LRH_16B_DWORDS (LRH_16B_BYTES / sizeof(u32)) +#define LRH_9B_BYTES (sizeof_field(struct ib_header, lrh)) +#define LRH_9B_DWORDS (LRH_9B_BYTES / sizeof(u32)) + +/* 24Bits for qpn, upper 8Bits reserved */ +struct opa_16b_mgmt { + __be32 dest_qpn; + __be32 src_qpn; +}; + +struct hfi1_16b_header { + u32 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + struct opa_16b_mgmt mgmt; + } u; +} __packed; + +struct hfi1_opa_header { + union { + struct ib_header ibh; /* 9B header */ + struct hfi1_16b_header opah; /* 16B header */ + }; + u8 hdr_type; /* 9B or 16B */ +} __packed; + +struct hfi1_ahg_info { + u32 ahgdesc[2]; + u16 tx_flags; + u8 ahgcount; + u8 ahgidx; +}; + +struct hfi1_sdma_header { + __le64 pbc; + struct hfi1_opa_header hdr; +} __packed; + +/* + * hfi1 specific data structures that will be hidden from rvt after the queue + * pair is made common + */ +struct hfi1_qp_priv { + struct hfi1_ahg_info *s_ahg; /* ahg info for next header */ + struct sdma_engine *s_sde; /* current sde */ + struct send_context *s_sendcontext; /* current sendcontext */ + struct hfi1_ctxtdata *rcd; /* QP's receive context */ + struct page **pages; /* for TID page scan */ + u32 tid_enqueue; /* saved when tid waited */ + u8 s_sc; /* SC[0..4] for next packet */ + struct iowait s_iowait; + struct timer_list s_tid_timer; /* for timing tid wait */ + struct timer_list s_tid_retry_timer; /* for timing tid ack */ + struct list_head tid_wait; /* for queueing tid space */ + struct hfi1_opfn_data opfn; + struct tid_flow_state flow_state; + struct tid_rdma_qp_params tid_rdma; + struct rvt_qp *owner; + u16 s_running_pkt_size; + u8 hdr_type; /* 9B or 16B */ + struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */ + atomic_t n_requests; /* # of TID RDMA requests in the */ + /* queue */ + atomic_t n_tid_requests; /* # of sent TID RDMA requests */ + unsigned long tid_timer_timeout_jiffies; + unsigned long tid_retry_timeout_jiffies; + + /* variables for the TID RDMA SE state machine */ + u8 s_state; + u8 s_retry; + u8 rnr_nak_state; /* RNR NAK state */ + u8 s_nak_state; + u32 s_nak_psn; + u32 s_flags; + u32 s_tid_cur; + u32 s_tid_head; + u32 s_tid_tail; + u32 r_tid_head; /* Most recently added TID RDMA request */ + u32 r_tid_tail; /* the last completed TID RDMA request */ + u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ + u32 r_tid_alloc; /* Request for which we are allocating resources */ + u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 pending_tid_w_resp; /* Num of pending tid write responses */ + u32 alloc_w_segs; /* Number of segments for which write */ + /* resources have been allocated for this QP */ + + /* For TID RDMA READ */ + u32 tid_r_reqs; /* Num of tid reads requested */ + u32 tid_r_comp; /* Num of tid reads completed */ + u32 pending_tid_r_segs; /* Num of pending tid read segments */ + u16 pkts_ps; /* packets per segment */ + u8 timeout_shift; /* account for number of packets per segment */ + + u32 r_next_psn_kdeth; + u32 r_next_psn_kdeth_save; + u32 s_resync_psn; + u8 sync_pt; /* Set when QP reaches sync point */ + u8 resync; +}; + +#define HFI1_QP_WQE_INVALID ((u32)-1) + +struct hfi1_swqe_priv { + struct tid_rdma_request tid_req; + struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ +}; + +struct hfi1_ack_priv { + struct rvt_sge_state ss; /* used for TID WRITE RESP */ + struct tid_rdma_request tid_req; +}; + +/* + * This structure is used to hold commonly lookedup and computed values during + * the send engine progress. + */ +struct iowait_work; +struct hfi1_pkt_state { + struct hfi1_ibdev *dev; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct verbs_txreq *s_txreq; + struct iowait_work *wait; + unsigned long flags; + unsigned long timeout; + unsigned long timeout_int; + int cpu; + u8 opcode; + bool in_thread; + bool pkts_sent; +}; + +#define HFI1_PSN_CREDIT 16 + +struct hfi1_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct hfi1_opcode_stats_perctx { + struct hfi1_opcode_stats stats[256]; +}; + +static inline void inc_opstats( + u32 tlen, + struct hfi1_opcode_stats *stats) +{ +#ifdef CONFIG_DEBUG_FS + stats->n_bytes += tlen; + stats->n_packets++; +#endif +} + +struct hfi1_ibport { + struct rvt_qp __rcu *qp[2]; + struct rvt_ibport rvp; + + /* the first 16 entries are sl_to_vl for !OPA */ + u8 sl_to_sc[32]; + u8 sc_to_sl[32]; +}; + +struct hfi1_ibdev { + struct rvt_dev_info rdi; /* Must be first */ + + /* QP numbers are shared by all IB ports */ + /* protect txwait list */ + seqlock_t txwait_lock ____cacheline_aligned_in_smp; + struct list_head txwait; /* list for wait verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct kmem_cache *verbs_txreq_cache; + u64 n_txwait; + u64 n_kmem_wait; + u64 n_tidwait; + + /* protect iowait lists */ + seqlock_t iowait_lock ____cacheline_aligned_in_smp; + u64 n_piowait; + u64 n_piodrain; + struct timer_list mem_timer; + +#ifdef CONFIG_DEBUG_FS + /* per HFI debugfs */ + struct dentry *hfi1_ibdev_dbg; + /* per HFI symlinks to above */ + struct dentry *hfi1_ibdev_link; +#ifdef CONFIG_FAULT_INJECTION + struct fault *fault; +#endif +#endif +}; + +static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) +{ + struct rvt_dev_info *rdi; + + rdi = container_of(ibdev, struct rvt_dev_info, ibdev); + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +{ + struct hfi1_qp_priv *priv; + + priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait); + return priv->owner; +} + +/* + * This must be called with s_lock held. + */ +void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, u32 lid1, u32 lid2); +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u32 port_num); +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); +void hfi1_node_desc_chg(struct hfi1_ibport *ibp); +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u32 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index); + +/* + * The PSN_MASK and PSN_SHIFT allow for + * 1) comparing two PSNs + * 2) returning the PSN with any upper bits masked + * 3) returning the difference between to PSNs + * + * The number of significant bits in the PSN must + * necessarily be at least one bit less than + * the container holding the PSN. + */ +#define PSN_MASK 0x7FFFFFFF +#define PSN_SHIFT 1 +#define PSN_MODIFY_MASK 0xFFFFFF + +/* + * Compare two PSNs + * Returns an integer <, ==, or > than zero. + */ +static inline int cmp_psn(u32 a, u32 b) +{ + return (((int)a) - ((int)b)) << PSN_SHIFT; +} + +/* + * Return masked PSN + */ +static inline u32 mask_psn(u32 a) +{ + return a & PSN_MASK; +} + +/* + * Return delta between two PSNs + */ +static inline u32 delta_psn(u32 a, u32 b) +{ + return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; +} + +static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe) +{ + return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req; +} + +static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e) +{ + return &((struct hfi1_ack_priv *)e->priv)->tid_req; +} + +/* + * Look through all the active flows for a TID RDMA request and find + * the one (if it exists) that contains the specified PSN. + */ +static inline u32 __full_flow_psn(struct flow_state *state, u32 psn) +{ + return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + (psn & HFI1_KDETH_BTH_SEQ_MASK)); +} + +static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn) +{ + return __full_flow_psn(&flow->flow_state, psn); +} + +struct verbs_txreq; +void hfi1_put_txreq(struct verbs_txreq *tx); + +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +void hfi1_cnp_rcv(struct hfi1_packet *packet); + +void hfi1_uc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, + struct rvt_qp *qp); + +u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); + +void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah); +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah); + +void hfi1_ud_rcv(struct hfi1_packet *packet); + +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); + +void hfi1_migrate_qp(struct rvt_qp *qp); + +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); + +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet); + +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + const struct ib_global_route *grh, u32 hwords, u32 nwords); + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2, int middle, + struct hfi1_pkt_state *ps); + +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid); + +void _hfi1_do_send(struct work_struct *work); + +void hfi1_do_send_from_rvt(struct rvt_qp *qp); + +void hfi1_do_send(struct rvt_qp *qp, bool in_thread); + +void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn); + +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_register_ib_device(struct hfi1_devdata *); + +void hfi1_unregister_ib_device(struct hfi1_devdata *); + +void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet); + +void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet); + +void hfi1_ib_rcv(struct hfi1_packet *packet); + +void hfi1_16B_rcv(struct hfi1_packet *packet); + +unsigned hfi1_get_npkeys(struct hfi1_devdata *); + +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); + +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); + +static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) +{ + return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); +} + +void hfi1_wait_kmem(struct rvt_qp *qp); + +static inline void hfi1_trdma_send_complete(struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + trdma_clean_swqe(qp, wqe); + rvt_send_complete(qp, wqe, status); +} + +extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; + +extern const u8 hdr_len_by_opcode[]; + +extern const int ib_rvt_state_ops[]; + +extern __be64 ib_hfi1_sys_image_guid; /* in network order */ + +extern unsigned int hfi1_max_cqes; + +extern unsigned int hfi1_max_cqs; + +extern unsigned int hfi1_max_qp_wrs; + +extern unsigned int hfi1_max_qps; + +extern unsigned int hfi1_max_sges; + +extern unsigned int hfi1_max_mcast_grps; + +extern unsigned int hfi1_max_mcast_qp_attached; + +extern unsigned int hfi1_max_srqs; + +extern unsigned int hfi1_max_srq_sges; + +extern unsigned int hfi1_max_srq_wrs; + +extern unsigned short piothreshold; + +extern const u32 ib_hfi1_rnr_table[]; + +#endif /* HFI1_VERBS_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c new file mode 100644 index 0000000000..cfecc81a27 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2016 - 2018 Intel Corporation. + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" +#include "trace.h" + +#define TXREQ_LEN 24 + +void hfi1_put_txreq(struct verbs_txreq *tx) +{ + struct hfi1_ibdev *dev; + struct rvt_qp *qp; + unsigned long flags; + unsigned int seq; + struct hfi1_qp_priv *priv; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) + rvt_put_mr(tx->mr); + + sdma_txclean(dd_from_dev(dev), &tx->txreq); + + /* Free verbs_txreq and return to slab cache */ + kmem_cache_free(dev->verbs_txreq_cache, tx); + + do { + seq = read_seqbegin(&dev->txwait_lock); + if (!list_empty(&dev->txwait)) { + struct iowait *wait; + + write_seqlock_irqsave(&dev->txwait_lock, flags); + wait = list_first_entry(&dev->txwait, struct iowait, + list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + /* refcount held until actual wake up */ + write_sequnlock_irqrestore(&dev->txwait_lock, flags); + hfi1_qp_wakeup(qp, RVT_S_WAIT_TX); + break; + } + } while (read_seqretry(&dev->txwait_lock, seq)); +} + +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct verbs_txreq *tx = NULL; + + write_seqlock(&dev->txwait_lock); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + struct hfi1_qp_priv *priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP); + if (tx) + goto out; + priv = qp->priv; + if (list_empty(&priv->s_iowait.list)) { + dev->n_txwait++; + qp->s_flags |= RVT_S_WAIT_TX; + list_add_tail(&priv->s_iowait.list, &dev->txwait); + priv->s_iowait.lock = &dev->txwait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX); + rvt_get_qp(qp); + } + qp->s_flags &= ~RVT_S_BUSY; + } +out: + write_sequnlock(&dev->txwait_lock); + return tx; +} + +int verbs_txreq_init(struct hfi1_ibdev *dev) +{ + char buf[TXREQ_LEN]; + struct hfi1_devdata *dd = dd_from_dev(dev); + + snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit); + dev->verbs_txreq_cache = kmem_cache_create(buf, + sizeof(struct verbs_txreq), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!dev->verbs_txreq_cache) + return -ENOMEM; + return 0; +} + +void verbs_txreq_exit(struct hfi1_ibdev *dev) +{ + kmem_cache_destroy(dev->verbs_txreq_cache); + dev->verbs_txreq_cache = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h new file mode 100644 index 0000000000..2a7e0ae892 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2016 - 2018 Intel Corporation. + */ + +#ifndef HFI1_VERBS_TXREQ_H +#define HFI1_VERBS_TXREQ_H + +#include <linux/types.h> +#include <linux/slab.h> + +#include "verbs.h" +#include "sdma_txreq.h" +#include "iowait.h" + +struct verbs_txreq { + struct hfi1_sdma_header phdr; + struct sdma_txreq txreq; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_mregion *mr; + struct rvt_sge_state *ss; + struct sdma_engine *sde; + struct send_context *psc; + u16 hdr_dwords; + u16 s_cur_size; +}; + +struct hfi1_ibdev; +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp); + +#define VERBS_TXREQ_GFP (GFP_ATOMIC | __GFP_NOWARN) +static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) + __must_hold(&qp->slock) +{ + struct verbs_txreq *tx; + struct hfi1_qp_priv *priv = qp->priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP); + if (unlikely(!tx)) { + /* call slow path to get the lock */ + tx = __get_txreq(dev, qp); + if (!tx) + return tx; + } + tx->qp = qp; + tx->mr = NULL; + tx->sde = priv->s_sde; + tx->psc = priv->s_sendcontext; + /* so that we can test if the sdma descriptors are there */ + tx->txreq.num_desc = 0; + /* Set the header type */ + tx->phdr.hdr.hdr_type = priv->hdr_type; + tx->txreq.flags = 0; + return tx; +} + +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w) +{ + struct sdma_txreq *stx; + + stx = iowait_get_txhead(w); + if (stx) + return container_of(stx, struct verbs_txreq, txreq); + return NULL; +} + +static inline bool verbs_txreq_queued(struct iowait_work *w) +{ + return iowait_packet_queued(w); +} + +void hfi1_put_txreq(struct verbs_txreq *tx); +int verbs_txreq_init(struct hfi1_ibdev *dev); +void verbs_txreq_exit(struct hfi1_ibdev *dev); + +#endif /* HFI1_VERBS_TXREQ_H */ diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h new file mode 100644 index 0000000000..34f03e7770 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/vnic.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ +/* + * Copyright(c) 2017 - 2020 Intel Corporation. + */ + +#ifndef _HFI1_VNIC_H +#define _HFI1_VNIC_H +#include <rdma/opa_vnic.h> +#include "hfi.h" +#include "sdma.h" + +#define HFI1_VNIC_MAX_TXQ 16 +#define HFI1_VNIC_MAX_PAD 12 + +/* L4 header definitions */ +#define HFI1_VNIC_L4_HDR_OFFSET OPA_VNIC_L2_HDR_LEN + +#define HFI1_VNIC_GET_L4_HDR(data) \ + (*((u16 *)((u8 *)(data) + HFI1_VNIC_L4_HDR_OFFSET))) + +#define HFI1_VNIC_GET_VESWID(data) \ + (HFI1_VNIC_GET_L4_HDR(data) & 0xFFF) + +/* Service class */ +#define HFI1_VNIC_SC_OFFSET_LOW 6 +#define HFI1_VNIC_SC_OFFSET_HI 7 +#define HFI1_VNIC_SC_SHIFT 4 + +#define HFI1_VNIC_MAX_QUEUE 16 +#define HFI1_NUM_VNIC_CTXT 8 + +/** + * struct hfi1_vnic_sdma - VNIC per Tx ring SDMA information + * @dd - device data pointer + * @sde - sdma engine + * @vinfo - vnic info pointer + * @wait - iowait structure + * @stx - sdma tx request + * @state - vnic Tx ring SDMA state + * @q_idx - vnic Tx queue index + */ +struct hfi1_vnic_sdma { + struct hfi1_devdata *dd; + struct sdma_engine *sde; + struct hfi1_vnic_vport_info *vinfo; + struct iowait wait; + struct sdma_txreq stx; + unsigned int state; + u8 q_idx; + bool pkts_sent; +}; + +/** + * struct hfi1_vnic_rx_queue - HFI1 VNIC receive queue + * @idx: queue index + * @vinfo: pointer to vport information + * @netdev: network device + * @napi: netdev napi structure + * @skbq: queue of received socket buffers + */ +struct hfi1_vnic_rx_queue { + u8 idx; + struct hfi1_vnic_vport_info *vinfo; + struct net_device *netdev; + struct napi_struct napi; +}; + +/** + * struct hfi1_vnic_vport_info - HFI1 VNIC virtual port information + * @dd: device data pointer + * @netdev: net device pointer + * @flags: state flags + * @lock: vport lock + * @num_tx_q: number of transmit queues + * @num_rx_q: number of receive queues + * @vesw_id: virtual switch id + * @rxq: Array of receive queues + * @stats: per queue stats + * @sdma: VNIC SDMA structure per TXQ + */ +struct hfi1_vnic_vport_info { + struct hfi1_devdata *dd; + struct net_device *netdev; + unsigned long flags; + + /* Lock used around state updates */ + struct mutex lock; + + u8 num_tx_q; + u8 num_rx_q; + u16 vesw_id; + struct hfi1_vnic_rx_queue rxq[HFI1_NUM_VNIC_CTXT]; + + struct opa_vnic_stats stats[HFI1_VNIC_MAX_QUEUE]; + struct hfi1_vnic_sdma sdma[HFI1_VNIC_MAX_TXQ]; +}; + +#define v_dbg(format, arg...) \ + netdev_dbg(vinfo->netdev, format, ## arg) +#define v_err(format, arg...) \ + netdev_err(vinfo->netdev, format, ## arg) +#define v_info(format, arg...) \ + netdev_info(vinfo->netdev, format, ## arg) + +/* vnic hfi1 internal functions */ +void hfi1_vnic_setup(struct hfi1_devdata *dd); +int hfi1_vnic_txreq_init(struct hfi1_devdata *dd); +void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd); + +void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet); +void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo); +bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx); + +/* vnic rdma netdev operations */ +struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, + u32 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)); +int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, + struct hfi1_vnic_vport_info *vinfo, + struct sk_buff *skb, u64 pbc, u8 plen); + +#endif /* _HFI1_VNIC_H */ diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c new file mode 100644 index 0000000000..3650fababf --- /dev/null +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2017 - 2020 Intel Corporation. + */ + +/* + * This file contains HFI1 support for VNIC functionality + */ + +#include <linux/io.h> +#include <linux/if_vlan.h> + +#include "vnic.h" +#include "netdev.h" + +#define HFI_TX_TIMEOUT_MS 1000 + +#define HFI1_VNIC_RCV_Q_SIZE 1024 + +#define HFI1_VNIC_UP 0 + +static DEFINE_SPINLOCK(vport_cntr_lock); + +#define SUM_GRP_COUNTERS(stats, qstats, x_grp) do { \ + u64 *src64, *dst64; \ + for (src64 = &qstats->x_grp.unicast, \ + dst64 = &stats->x_grp.unicast; \ + dst64 <= &stats->x_grp.s_1519_max;) { \ + *dst64++ += *src64++; \ + } \ + } while (0) + +#define VNIC_MASK (0xFF) +#define VNIC_ID(val) ((1ull << 24) | ((val) & VNIC_MASK)) + +/* hfi1_vnic_update_stats - update statistics */ +static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo, + struct opa_vnic_stats *stats) +{ + struct net_device *netdev = vinfo->netdev; + u8 i; + + /* add tx counters on different queues */ + for (i = 0; i < vinfo->num_tx_q; i++) { + struct opa_vnic_stats *qstats = &vinfo->stats[i]; + struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats; + + stats->netstats.tx_fifo_errors += qnstats->tx_fifo_errors; + stats->netstats.tx_carrier_errors += qnstats->tx_carrier_errors; + stats->tx_drop_state += qstats->tx_drop_state; + stats->tx_dlid_zero += qstats->tx_dlid_zero; + + SUM_GRP_COUNTERS(stats, qstats, tx_grp); + stats->netstats.tx_packets += qnstats->tx_packets; + stats->netstats.tx_bytes += qnstats->tx_bytes; + } + + /* add rx counters on different queues */ + for (i = 0; i < vinfo->num_rx_q; i++) { + struct opa_vnic_stats *qstats = &vinfo->stats[i]; + struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats; + + stats->netstats.rx_fifo_errors += qnstats->rx_fifo_errors; + stats->netstats.rx_nohandler += qnstats->rx_nohandler; + stats->rx_drop_state += qstats->rx_drop_state; + stats->rx_oversize += qstats->rx_oversize; + stats->rx_runt += qstats->rx_runt; + + SUM_GRP_COUNTERS(stats, qstats, rx_grp); + stats->netstats.rx_packets += qnstats->rx_packets; + stats->netstats.rx_bytes += qnstats->rx_bytes; + } + + stats->netstats.tx_errors = stats->netstats.tx_fifo_errors + + stats->netstats.tx_carrier_errors + + stats->tx_drop_state + stats->tx_dlid_zero; + stats->netstats.tx_dropped = stats->netstats.tx_errors; + + stats->netstats.rx_errors = stats->netstats.rx_fifo_errors + + stats->netstats.rx_nohandler + + stats->rx_drop_state + stats->rx_oversize + + stats->rx_runt; + stats->netstats.rx_dropped = stats->netstats.rx_errors; + + netdev->stats.tx_packets = stats->netstats.tx_packets; + netdev->stats.tx_bytes = stats->netstats.tx_bytes; + netdev->stats.tx_fifo_errors = stats->netstats.tx_fifo_errors; + netdev->stats.tx_carrier_errors = stats->netstats.tx_carrier_errors; + netdev->stats.tx_errors = stats->netstats.tx_errors; + netdev->stats.tx_dropped = stats->netstats.tx_dropped; + + netdev->stats.rx_packets = stats->netstats.rx_packets; + netdev->stats.rx_bytes = stats->netstats.rx_bytes; + netdev->stats.rx_fifo_errors = stats->netstats.rx_fifo_errors; + netdev->stats.multicast = stats->rx_grp.mcastbcast; + netdev->stats.rx_length_errors = stats->rx_oversize + stats->rx_runt; + netdev->stats.rx_errors = stats->netstats.rx_errors; + netdev->stats.rx_dropped = stats->netstats.rx_dropped; +} + +/* update_len_counters - update pkt's len histogram counters */ +static inline void update_len_counters(struct opa_vnic_grp_stats *grp, + int len) +{ + /* account for 4 byte FCS */ + if (len >= 1515) + grp->s_1519_max++; + else if (len >= 1020) + grp->s_1024_1518++; + else if (len >= 508) + grp->s_512_1023++; + else if (len >= 252) + grp->s_256_511++; + else if (len >= 124) + grp->s_128_255++; + else if (len >= 61) + grp->s_65_127++; + else + grp->s_64++; +} + +/* hfi1_vnic_update_tx_counters - update transmit counters */ +static void hfi1_vnic_update_tx_counters(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx, struct sk_buff *skb, int err) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + struct opa_vnic_stats *stats = &vinfo->stats[q_idx]; + struct opa_vnic_grp_stats *tx_grp = &stats->tx_grp; + u16 vlan_tci; + + stats->netstats.tx_packets++; + stats->netstats.tx_bytes += skb->len + ETH_FCS_LEN; + + update_len_counters(tx_grp, skb->len); + + /* rest of the counts are for good packets only */ + if (unlikely(err)) + return; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + tx_grp->mcastbcast++; + else + tx_grp->unicast++; + + if (!__vlan_get_tag(skb, &vlan_tci)) + tx_grp->vlan++; + else + tx_grp->untagged++; +} + +/* hfi1_vnic_update_rx_counters - update receive counters */ +static void hfi1_vnic_update_rx_counters(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx, struct sk_buff *skb, int err) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb->data; + struct opa_vnic_stats *stats = &vinfo->stats[q_idx]; + struct opa_vnic_grp_stats *rx_grp = &stats->rx_grp; + u16 vlan_tci; + + stats->netstats.rx_packets++; + stats->netstats.rx_bytes += skb->len + ETH_FCS_LEN; + + update_len_counters(rx_grp, skb->len); + + /* rest of the counts are for good packets only */ + if (unlikely(err)) + return; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + rx_grp->mcastbcast++; + else + rx_grp->unicast++; + + if (!__vlan_get_tag(skb, &vlan_tci)) + rx_grp->vlan++; + else + rx_grp->untagged++; +} + +/* This function is overloaded for opa_vnic specific implementation */ +static void hfi1_vnic_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct opa_vnic_stats *vstats = (struct opa_vnic_stats *)stats; + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + + hfi1_vnic_update_stats(vinfo, vstats); +} + +static u64 create_bypass_pbc(u32 vl, u32 dw_len) +{ + u64 pbc; + + pbc = ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) + | PBC_INSERT_BYPASS_ICRC | PBC_CREDIT_RETURN + | PBC_PACKET_BYPASS + | ((vl & PBC_VL_MASK) << PBC_VL_SHIFT) + | (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT; + + return pbc; +} + +/* hfi1_vnic_maybe_stop_tx - stop tx queue if required */ +static void hfi1_vnic_maybe_stop_tx(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx) +{ + netif_stop_subqueue(vinfo->netdev, q_idx); + if (!hfi1_vnic_sdma_write_avail(vinfo, q_idx)) + return; + + netif_start_subqueue(vinfo->netdev, q_idx); +} + +static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + u8 pad_len, q_idx = skb->queue_mapping; + struct hfi1_devdata *dd = vinfo->dd; + struct opa_vnic_skb_mdata *mdata; + u32 pkt_len, total_len; + int err = -EINVAL; + u64 pbc; + + v_dbg("xmit: queue %d skb len %d\n", q_idx, skb->len); + if (unlikely(!netif_oper_up(netdev))) { + vinfo->stats[q_idx].tx_drop_state++; + goto tx_finish; + } + + /* take out meta data */ + mdata = (struct opa_vnic_skb_mdata *)skb->data; + skb_pull(skb, sizeof(*mdata)); + if (unlikely(mdata->flags & OPA_VNIC_SKB_MDATA_ENCAP_ERR)) { + vinfo->stats[q_idx].tx_dlid_zero++; + goto tx_finish; + } + + /* add tail padding (for 8 bytes size alignment) and icrc */ + pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7; + pad_len += OPA_VNIC_ICRC_TAIL_LEN; + + /* + * pkt_len is how much data we have to write, includes header and data. + * total_len is length of the packet in Dwords plus the PBC should not + * include the CRC. + */ + pkt_len = (skb->len + pad_len) >> 2; + total_len = pkt_len + 2; /* PBC + packet */ + + pbc = create_bypass_pbc(mdata->vl, total_len); + + skb_get(skb); + v_dbg("pbc 0x%016llX len %d pad_len %d\n", pbc, skb->len, pad_len); + err = dd->process_vnic_dma_send(dd, q_idx, vinfo, skb, pbc, pad_len); + if (unlikely(err)) { + if (err == -ENOMEM) + vinfo->stats[q_idx].netstats.tx_fifo_errors++; + else if (err != -EBUSY) + vinfo->stats[q_idx].netstats.tx_carrier_errors++; + } + /* remove the header before updating tx counters */ + skb_pull(skb, OPA_VNIC_HDR_LEN); + + if (unlikely(err == -EBUSY)) { + hfi1_vnic_maybe_stop_tx(vinfo, q_idx); + dev_kfree_skb_any(skb); + return NETDEV_TX_BUSY; + } + +tx_finish: + /* update tx counters */ + hfi1_vnic_update_tx_counters(vinfo, q_idx, skb, err); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +static u16 hfi1_vnic_select_queue(struct net_device *netdev, + struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + struct opa_vnic_skb_mdata *mdata; + struct sdma_engine *sde; + + mdata = (struct opa_vnic_skb_mdata *)skb->data; + sde = sdma_select_engine_vl(vinfo->dd, mdata->entropy, mdata->vl); + return sde->this_idx; +} + +/* hfi1_vnic_decap_skb - strip OPA header from the skb (ethernet) packet */ +static inline int hfi1_vnic_decap_skb(struct hfi1_vnic_rx_queue *rxq, + struct sk_buff *skb) +{ + struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; + int max_len = vinfo->netdev->mtu + VLAN_ETH_HLEN; + int rc = -EFAULT; + + skb_pull(skb, OPA_VNIC_HDR_LEN); + + /* Validate Packet length */ + if (unlikely(skb->len > max_len)) + vinfo->stats[rxq->idx].rx_oversize++; + else if (unlikely(skb->len < ETH_ZLEN)) + vinfo->stats[rxq->idx].rx_runt++; + else + rc = 0; + return rc; +} + +static struct hfi1_vnic_vport_info *get_vnic_port(struct hfi1_devdata *dd, + int vesw_id) +{ + int vnic_id = VNIC_ID(vesw_id); + + return hfi1_netdev_get_data(dd, vnic_id); +} + +static struct hfi1_vnic_vport_info *get_first_vnic_port(struct hfi1_devdata *dd) +{ + struct hfi1_vnic_vport_info *vinfo; + int next_id = VNIC_ID(0); + + vinfo = hfi1_netdev_get_first_data(dd, &next_id); + + if (next_id > VNIC_ID(VNIC_MASK)) + return NULL; + + return vinfo; +} + +void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) +{ + struct hfi1_devdata *dd = packet->rcd->dd; + struct hfi1_vnic_vport_info *vinfo = NULL; + struct hfi1_vnic_rx_queue *rxq; + struct sk_buff *skb; + int l4_type, vesw_id = -1, rc; + u8 q_idx; + unsigned char *pad_info; + + l4_type = hfi1_16B_get_l4(packet->ebuf); + if (likely(l4_type == OPA_16B_L4_ETHR)) { + vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); + vinfo = get_vnic_port(dd, vesw_id); + + /* + * In case of invalid vesw id, count the error on + * the first available vport. + */ + if (unlikely(!vinfo)) { + struct hfi1_vnic_vport_info *vinfo_tmp; + + vinfo_tmp = get_first_vnic_port(dd); + if (vinfo_tmp) { + spin_lock(&vport_cntr_lock); + vinfo_tmp->stats[0].netstats.rx_nohandler++; + spin_unlock(&vport_cntr_lock); + } + } + } + + if (unlikely(!vinfo)) { + dd_dev_warn(dd, "vnic rcv err: l4 %d vesw id %d ctx %d\n", + l4_type, vesw_id, packet->rcd->ctxt); + return; + } + + q_idx = packet->rcd->vnic_q_idx; + rxq = &vinfo->rxq[q_idx]; + if (unlikely(!netif_oper_up(vinfo->netdev))) { + vinfo->stats[q_idx].rx_drop_state++; + return; + } + + skb = netdev_alloc_skb(vinfo->netdev, packet->tlen); + if (unlikely(!skb)) { + vinfo->stats[q_idx].netstats.rx_fifo_errors++; + return; + } + + memcpy(skb->data, packet->ebuf, packet->tlen); + skb_put(skb, packet->tlen); + + pad_info = skb->data + skb->len - 1; + skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN - + ((*pad_info) & 0x7))); + + rc = hfi1_vnic_decap_skb(rxq, skb); + + /* update rx counters */ + hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc); + if (unlikely(rc)) { + dev_kfree_skb_any(skb); + return; + } + + skb_checksum_none_assert(skb); + skb->protocol = eth_type_trans(skb, rxq->netdev); + + napi_gro_receive(&rxq->napi, skb); +} + +static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + struct net_device *netdev = vinfo->netdev; + int rc; + + /* ensure virtual eth switch id is valid */ + if (!vinfo->vesw_id) + return -EINVAL; + + rc = hfi1_netdev_add_data(dd, VNIC_ID(vinfo->vesw_id), vinfo); + if (rc < 0) + return rc; + + rc = hfi1_netdev_rx_init(dd); + if (rc) + goto err_remove; + + netif_carrier_on(netdev); + netif_tx_start_all_queues(netdev); + set_bit(HFI1_VNIC_UP, &vinfo->flags); + + return 0; + +err_remove: + hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id)); + return rc; +} + +static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + + clear_bit(HFI1_VNIC_UP, &vinfo->flags); + netif_carrier_off(vinfo->netdev); + netif_tx_disable(vinfo->netdev); + hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id)); + + hfi1_netdev_rx_destroy(dd); +} + +static int hfi1_netdev_open(struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + int rc; + + mutex_lock(&vinfo->lock); + rc = hfi1_vnic_up(vinfo); + mutex_unlock(&vinfo->lock); + return rc; +} + +static int hfi1_netdev_close(struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + + mutex_lock(&vinfo->lock); + if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) + hfi1_vnic_down(vinfo); + mutex_unlock(&vinfo->lock); + return 0; +} + +static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + int rc = 0; + + mutex_lock(&hfi1_mutex); + if (!dd->vnic_num_vports) { + rc = hfi1_vnic_txreq_init(dd); + if (rc) + goto txreq_fail; + } + + rc = hfi1_netdev_rx_init(dd); + if (rc) { + dd_dev_err(dd, "Unable to initialize netdev contexts\n"); + goto alloc_fail; + } + + hfi1_init_vnic_rsm(dd); + + dd->vnic_num_vports++; + hfi1_vnic_sdma_init(vinfo); + +alloc_fail: + if (!dd->vnic_num_vports) + hfi1_vnic_txreq_deinit(dd); +txreq_fail: + mutex_unlock(&hfi1_mutex); + return rc; +} + +static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + + mutex_lock(&hfi1_mutex); + if (--dd->vnic_num_vports == 0) { + hfi1_deinit_vnic_rsm(dd); + hfi1_vnic_txreq_deinit(dd); + } + mutex_unlock(&hfi1_mutex); + hfi1_netdev_rx_destroy(dd); +} + +static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + bool reopen = false; + + /* + * If vesw_id is being changed, and if the vnic port is up, + * reset the vnic port to ensure new vesw_id gets picked up + */ + if (id != vinfo->vesw_id) { + mutex_lock(&vinfo->lock); + if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) { + hfi1_vnic_down(vinfo); + reopen = true; + } + + vinfo->vesw_id = id; + if (reopen) + hfi1_vnic_up(vinfo); + + mutex_unlock(&vinfo->lock); + } +} + +/* netdev ops */ +static const struct net_device_ops hfi1_netdev_ops = { + .ndo_open = hfi1_netdev_open, + .ndo_stop = hfi1_netdev_close, + .ndo_start_xmit = hfi1_netdev_start_xmit, + .ndo_select_queue = hfi1_vnic_select_queue, + .ndo_get_stats64 = hfi1_vnic_get_stats64, +}; + +static void hfi1_vnic_free_rn(struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + + hfi1_vnic_deinit(vinfo); + mutex_destroy(&vinfo->lock); + free_netdev(netdev); +} + +struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, + u32 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + struct hfi1_vnic_vport_info *vinfo; + struct net_device *netdev; + struct rdma_netdev *rn; + int i, size, rc; + + if (!dd->num_netdev_contexts) + return ERR_PTR(-ENOMEM); + + if (!port_num || (port_num > dd->num_pports)) + return ERR_PTR(-EINVAL); + + if (type != RDMA_NETDEV_OPA_VNIC) + return ERR_PTR(-EOPNOTSUPP); + + size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo); + netdev = alloc_netdev_mqs(size, name, name_assign_type, setup, + chip_sdma_engines(dd), + dd->num_netdev_contexts); + if (!netdev) + return ERR_PTR(-ENOMEM); + + rn = netdev_priv(netdev); + vinfo = opa_vnic_dev_priv(netdev); + vinfo->dd = dd; + vinfo->num_tx_q = chip_sdma_engines(dd); + vinfo->num_rx_q = dd->num_netdev_contexts; + vinfo->netdev = netdev; + rn->free_rdma_netdev = hfi1_vnic_free_rn; + rn->set_id = hfi1_vnic_set_vesw_id; + + netdev->features = NETIF_F_HIGHDMA | NETIF_F_SG; + netdev->hw_features = netdev->features; + netdev->vlan_features = netdev->features; + netdev->watchdog_timeo = msecs_to_jiffies(HFI_TX_TIMEOUT_MS); + netdev->netdev_ops = &hfi1_netdev_ops; + mutex_init(&vinfo->lock); + + for (i = 0; i < vinfo->num_rx_q; i++) { + struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; + + rxq->idx = i; + rxq->vinfo = vinfo; + rxq->netdev = netdev; + } + + rc = hfi1_vnic_init(vinfo); + if (rc) + goto init_fail; + + return netdev; +init_fail: + mutex_destroy(&vinfo->lock); + free_netdev(netdev); + return ERR_PTR(rc); +} diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c new file mode 100644 index 0000000000..cc6324d2d1 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause +/* + * Copyright(c) 2017 - 2018 Intel Corporation. + */ + +/* + * This file contains HFI1 support for VNIC SDMA functionality + */ + +#include "sdma.h" +#include "vnic.h" + +#define HFI1_VNIC_SDMA_Q_ACTIVE BIT(0) +#define HFI1_VNIC_SDMA_Q_DEFERRED BIT(1) + +#define HFI1_VNIC_TXREQ_NAME_LEN 32 +#define HFI1_VNIC_SDMA_DESC_WTRMRK 64 + +/* + * struct vnic_txreq - VNIC transmit descriptor + * @txreq: sdma transmit request + * @sdma: vnic sdma pointer + * @skb: skb to send + * @pad: pad buffer + * @plen: pad length + * @pbc_val: pbc value + */ +struct vnic_txreq { + struct sdma_txreq txreq; + struct hfi1_vnic_sdma *sdma; + + struct sk_buff *skb; + unsigned char pad[HFI1_VNIC_MAX_PAD]; + u16 plen; + __le64 pbc_val; +}; + +static void vnic_sdma_complete(struct sdma_txreq *txreq, + int status) +{ + struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); + struct hfi1_vnic_sdma *vnic_sdma = tx->sdma; + + sdma_txclean(vnic_sdma->dd, txreq); + dev_kfree_skb_any(tx->skb); + kmem_cache_free(vnic_sdma->dd->vnic.txreq_cache, tx); +} + +static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, + struct vnic_txreq *tx) +{ + int i, ret = 0; + + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + tx->skb->data, + skb_headlen(tx->skb)); + if (unlikely(ret)) + goto bail_txadd; + + for (i = 0; i < skb_shinfo(tx->skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(tx->skb)->frags[i]; + + /* combine physically continuous fragments later? */ + ret = sdma_txadd_page(sde->dd, + &tx->txreq, + skb_frag_page(frag), + skb_frag_off(frag), + skb_frag_size(frag), + NULL, NULL, NULL); + if (unlikely(ret)) + goto bail_txadd; + } + + if (tx->plen) + ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, + tx->pad + HFI1_VNIC_MAX_PAD - tx->plen, + tx->plen); + +bail_txadd: + return ret; +} + +static int build_vnic_tx_desc(struct sdma_engine *sde, + struct vnic_txreq *tx, + u64 pbc) +{ + int ret = 0; + u16 hdrbytes = 2 << 2; /* PBC */ + + ret = sdma_txinit_ahg( + &tx->txreq, + 0, + hdrbytes + tx->skb->len + tx->plen, + 0, + 0, + NULL, + 0, + vnic_sdma_complete); + if (unlikely(ret)) + goto bail_txadd; + + /* add pbc */ + tx->pbc_val = cpu_to_le64(pbc); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + &tx->pbc_val, + hdrbytes); + if (unlikely(ret)) + goto bail_txadd; + + /* add the ulp payload */ + ret = build_vnic_ulp_payload(sde, tx); +bail_txadd: + return ret; +} + +/* setup the last plen bypes of pad */ +static inline void hfi1_vnic_update_pad(unsigned char *pad, u8 plen) +{ + pad[HFI1_VNIC_MAX_PAD - 1] = plen - OPA_VNIC_ICRC_TAIL_LEN; +} + +int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, + struct hfi1_vnic_vport_info *vinfo, + struct sk_buff *skb, u64 pbc, u8 plen) +{ + struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx]; + struct sdma_engine *sde = vnic_sdma->sde; + struct vnic_txreq *tx; + int ret = -ECOMM; + + if (unlikely(READ_ONCE(vnic_sdma->state) != HFI1_VNIC_SDMA_Q_ACTIVE)) + goto tx_err; + + if (unlikely(!sde || !sdma_running(sde))) + goto tx_err; + + tx = kmem_cache_alloc(dd->vnic.txreq_cache, GFP_ATOMIC); + if (unlikely(!tx)) { + ret = -ENOMEM; + goto tx_err; + } + + tx->sdma = vnic_sdma; + tx->skb = skb; + hfi1_vnic_update_pad(tx->pad, plen); + tx->plen = plen; + ret = build_vnic_tx_desc(sde, tx, pbc); + if (unlikely(ret)) + goto free_desc; + + ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait), + &tx->txreq, vnic_sdma->pkts_sent); + /* When -ECOMM, sdma callback will be called with ABORT status */ + if (unlikely(ret && unlikely(ret != -ECOMM))) + goto free_desc; + + if (!ret) { + vnic_sdma->pkts_sent = true; + iowait_starve_clear(vnic_sdma->pkts_sent, &vnic_sdma->wait); + } + return ret; + +free_desc: + sdma_txclean(dd, &tx->txreq); + kmem_cache_free(dd->vnic.txreq_cache, tx); +tx_err: + if (ret != -EBUSY) + dev_kfree_skb_any(skb); + else + vnic_sdma->pkts_sent = false; + return ret; +} + +/* + * hfi1_vnic_sdma_sleep - vnic sdma sleep function + * + * This function gets called from sdma_send_txreq() when there are not enough + * sdma descriptors available to send the packet. It adds Tx queue's wait + * structure to sdma engine's dmawait list to be woken up when descriptors + * become available. + */ +static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent) +{ + struct hfi1_vnic_sdma *vnic_sdma = + container_of(wait->iow, struct hfi1_vnic_sdma, wait); + + write_seqlock(&sde->waitlock); + if (sdma_progress(sde, seq, txreq)) { + write_sequnlock(&sde->waitlock); + return -EAGAIN; + } + + vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; + if (list_empty(&vnic_sdma->wait.list)) { + iowait_get_priority(wait->iow); + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } + write_sequnlock(&sde->waitlock); + return -EBUSY; +} + +/* + * hfi1_vnic_sdma_wakeup - vnic sdma wakeup function + * + * This function gets called when SDMA descriptors becomes available and Tx + * queue's wait structure was previously added to sdma engine's dmawait list. + * It notifies the upper driver about Tx queue wakeup. + */ +static void hfi1_vnic_sdma_wakeup(struct iowait *wait, int reason) +{ + struct hfi1_vnic_sdma *vnic_sdma = + container_of(wait, struct hfi1_vnic_sdma, wait); + struct hfi1_vnic_vport_info *vinfo = vnic_sdma->vinfo; + + vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE; + if (__netif_subqueue_stopped(vinfo->netdev, vnic_sdma->q_idx)) + netif_wake_subqueue(vinfo->netdev, vnic_sdma->q_idx); +}; + +inline bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx) +{ + struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx]; + + return (READ_ONCE(vnic_sdma->state) == HFI1_VNIC_SDMA_Q_ACTIVE); +} + +void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) +{ + int i; + + for (i = 0; i < vinfo->num_tx_q; i++) { + struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; + + iowait_init(&vnic_sdma->wait, 0, NULL, NULL, + hfi1_vnic_sdma_sleep, + hfi1_vnic_sdma_wakeup, NULL, NULL); + vnic_sdma->sde = &vinfo->dd->per_sdma[i]; + vnic_sdma->dd = vinfo->dd; + vnic_sdma->vinfo = vinfo; + vnic_sdma->q_idx = i; + vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE; + + /* Add a free descriptor watermark for wakeups */ + if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { + struct iowait_work *work; + + INIT_LIST_HEAD(&vnic_sdma->stx.list); + vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; + work = iowait_get_ib_work(&vnic_sdma->wait); + list_add_tail(&vnic_sdma->stx.list, &work->tx_head); + } + } +} + +int hfi1_vnic_txreq_init(struct hfi1_devdata *dd) +{ + char buf[HFI1_VNIC_TXREQ_NAME_LEN]; + + snprintf(buf, sizeof(buf), "hfi1_%u_vnic_txreq_cache", dd->unit); + dd->vnic.txreq_cache = kmem_cache_create(buf, + sizeof(struct vnic_txreq), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!dd->vnic.txreq_cache) + return -ENOMEM; + return 0; +} + +void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd) +{ + kmem_cache_destroy(dd->vnic.txreq_cache); + dd->vnic.txreq_cache = NULL; +} |