diff options
Diffstat (limited to 'drivers/misc/habanalabs/gaudi/gaudi.c')
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 9275 |
1 files changed, 9275 insertions, 0 deletions
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c new file mode 100644 index 000000000..92560414e --- /dev/null +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -0,0 +1,9275 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "gaudiP.h" +#include "../include/hw_ip/mmu/mmu_general.h" +#include "../include/hw_ip/mmu/mmu_v1_1.h" +#include "../include/gaudi/gaudi_masks.h" +#include "../include/gaudi/gaudi_fw_if.h" +#include "../include/gaudi/gaudi_reg_map.h" +#include "../include/gaudi/gaudi_async_ids_map_extended.h" + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/firmware.h> +#include <linux/hwmon.h> +#include <linux/iommu.h> +#include <linux/seq_file.h> + +/* + * Gaudi security scheme: + * + * 1. Host is protected by: + * - Range registers + * - MMU + * + * 2. DDR is protected by: + * - Range registers (protect the first 512MB) + * + * 3. Configuration is protected by: + * - Range registers + * - Protection bits + * + * MMU is always enabled. + * + * QMAN DMA channels 0,1 (PCI DMAN): + * - DMA is not secured. + * - PQ and CQ are secured. + * - CP is secured: The driver needs to parse CB but WREG should be allowed + * because of TDMA (tensor DMA). Hence, WREG is always not + * secured. + * + * When the driver needs to use DMA it will check that Gaudi is idle, set DMA + * channel 0 to be secured, execute the DMA and change it back to not secured. + * Currently, the driver doesn't use the DMA while there are compute jobs + * running. + * + * The current use cases for the driver to use the DMA are: + * - Clear SRAM on context switch (happens on context switch when device is + * idle) + * - MMU page tables area clear (happens on init) + * + * QMAN DMA 2-7, TPC, MME, NIC: + * PQ is secured and is located on the Host (HBM CON TPC3 bug) + * CQ, CP and the engine are not secured + * + */ + +#define GAUDI_BOOT_FIT_FILE "habanalabs/gaudi/gaudi-boot-fit.itb" +#define GAUDI_LINUX_FW_FILE "habanalabs/gaudi/gaudi-fit.itb" +#define GAUDI_TPC_FW_FILE "habanalabs/gaudi/gaudi_tpc.bin" + +#define GAUDI_DMA_POOL_BLK_SIZE 0x100 /* 256 bytes */ + +#define GAUDI_RESET_TIMEOUT_MSEC 2000 /* 2000ms */ +#define GAUDI_RESET_WAIT_MSEC 1 /* 1ms */ +#define GAUDI_CPU_RESET_WAIT_MSEC 200 /* 200ms */ +#define GAUDI_TEST_QUEUE_WAIT_USEC 100000 /* 100ms */ + +#define GAUDI_PLDM_RESET_WAIT_MSEC 1000 /* 1s */ +#define GAUDI_PLDM_HRESET_TIMEOUT_MSEC 20000 /* 20s */ +#define GAUDI_PLDM_TEST_QUEUE_WAIT_USEC 1000000 /* 1s */ +#define GAUDI_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100) +#define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30) +#define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC (HL_DEVICE_TIMEOUT_USEC * 30) +#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 4000000 /* 4s */ +#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */ +#define GAUDI_WAIT_FOR_BL_TIMEOUT_USEC 15000000 /* 15s */ + +#define GAUDI_QMAN0_FENCE_VAL 0x72E91AB9 + +#define GAUDI_MAX_STRING_LEN 20 + +#define GAUDI_CB_POOL_CB_CNT 512 +#define GAUDI_CB_POOL_CB_SIZE 0x20000 /* 128KB */ + +#define GAUDI_ALLOC_CPU_MEM_RETRY_CNT 3 + +#define GAUDI_NUM_OF_TPC_INTR_CAUSE 20 + +#define GAUDI_NUM_OF_QM_ERR_CAUSE 16 + +#define GAUDI_NUM_OF_QM_ARB_ERR_CAUSE 3 + +#define GAUDI_ARB_WDT_TIMEOUT 0xEE6b27FF /* 8 seconds */ + +#define HBM_SCRUBBING_TIMEOUT_US 1000000 /* 1s */ + +#define BIN_REG_STRING_SIZE sizeof("0b10101010101010101010101010101010") + +#define MONITOR_SOB_STRING_SIZE 256 + +static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = { + GAUDI_QUEUE_ID_DMA_0_0, + GAUDI_QUEUE_ID_DMA_0_1, + GAUDI_QUEUE_ID_DMA_0_2, + GAUDI_QUEUE_ID_DMA_0_3, + GAUDI_QUEUE_ID_DMA_1_0, + GAUDI_QUEUE_ID_DMA_1_1, + GAUDI_QUEUE_ID_DMA_1_2, + GAUDI_QUEUE_ID_DMA_1_3 +}; + +static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = { + "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3", + "gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3", + "gaudi cq 5_0", "gaudi cq 5_1", "gaudi cq 5_2", "gaudi cq 5_3", + "gaudi cpu eq" +}; + +static const u8 gaudi_dma_assignment[GAUDI_DMA_MAX] = { + [GAUDI_PCI_DMA_1] = GAUDI_ENGINE_ID_DMA_0, + [GAUDI_PCI_DMA_2] = GAUDI_ENGINE_ID_DMA_1, + [GAUDI_HBM_DMA_1] = GAUDI_ENGINE_ID_DMA_2, + [GAUDI_HBM_DMA_2] = GAUDI_ENGINE_ID_DMA_3, + [GAUDI_HBM_DMA_3] = GAUDI_ENGINE_ID_DMA_4, + [GAUDI_HBM_DMA_4] = GAUDI_ENGINE_ID_DMA_5, + [GAUDI_HBM_DMA_5] = GAUDI_ENGINE_ID_DMA_6, + [GAUDI_HBM_DMA_6] = GAUDI_ENGINE_ID_DMA_7 +}; + +static const u8 gaudi_cq_assignment[NUMBER_OF_CMPLT_QUEUES] = { + [0] = GAUDI_QUEUE_ID_DMA_0_0, + [1] = GAUDI_QUEUE_ID_DMA_0_1, + [2] = GAUDI_QUEUE_ID_DMA_0_2, + [3] = GAUDI_QUEUE_ID_DMA_0_3, + [4] = GAUDI_QUEUE_ID_DMA_1_0, + [5] = GAUDI_QUEUE_ID_DMA_1_1, + [6] = GAUDI_QUEUE_ID_DMA_1_2, + [7] = GAUDI_QUEUE_ID_DMA_1_3, +}; + +static const u16 gaudi_packet_sizes[MAX_PACKET_ID] = { + [PACKET_WREG_32] = sizeof(struct packet_wreg32), + [PACKET_WREG_BULK] = sizeof(struct packet_wreg_bulk), + [PACKET_MSG_LONG] = sizeof(struct packet_msg_long), + [PACKET_MSG_SHORT] = sizeof(struct packet_msg_short), + [PACKET_CP_DMA] = sizeof(struct packet_cp_dma), + [PACKET_REPEAT] = sizeof(struct packet_repeat), + [PACKET_MSG_PROT] = sizeof(struct packet_msg_prot), + [PACKET_FENCE] = sizeof(struct packet_fence), + [PACKET_LIN_DMA] = sizeof(struct packet_lin_dma), + [PACKET_NOP] = sizeof(struct packet_nop), + [PACKET_STOP] = sizeof(struct packet_stop), + [PACKET_ARB_POINT] = sizeof(struct packet_arb_point), + [PACKET_WAIT] = sizeof(struct packet_wait), + [PACKET_LOAD_AND_EXE] = sizeof(struct packet_load_and_exe) +}; + +static inline bool validate_packet_id(enum packet_id id) +{ + switch (id) { + case PACKET_WREG_32: + case PACKET_WREG_BULK: + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_CP_DMA: + case PACKET_REPEAT: + case PACKET_MSG_PROT: + case PACKET_FENCE: + case PACKET_LIN_DMA: + case PACKET_NOP: + case PACKET_STOP: + case PACKET_ARB_POINT: + case PACKET_WAIT: + case PACKET_LOAD_AND_EXE: + return true; + default: + return false; + } +} + +static const char * const +gaudi_tpc_interrupts_cause[GAUDI_NUM_OF_TPC_INTR_CAUSE] = { + "tpc_address_exceed_slm", + "tpc_div_by_0", + "tpc_spu_mac_overflow", + "tpc_spu_addsub_overflow", + "tpc_spu_abs_overflow", + "tpc_spu_fp_dst_nan_inf", + "tpc_spu_fp_dst_denorm", + "tpc_vpu_mac_overflow", + "tpc_vpu_addsub_overflow", + "tpc_vpu_abs_overflow", + "tpc_vpu_fp_dst_nan_inf", + "tpc_vpu_fp_dst_denorm", + "tpc_assertions", + "tpc_illegal_instruction", + "tpc_pc_wrap_around", + "tpc_qm_sw_err", + "tpc_hbw_rresp_err", + "tpc_hbw_bresp_err", + "tpc_lbw_rresp_err", + "tpc_lbw_bresp_err" +}; + +static const char * const +gaudi_qman_error_cause[GAUDI_NUM_OF_QM_ERR_CAUSE] = { + "PQ AXI HBW error", + "CQ AXI HBW error", + "CP AXI HBW error", + "CP error due to undefined OPCODE", + "CP encountered STOP OPCODE", + "CP AXI LBW error", + "CP WRREG32 or WRBULK returned error", + "N/A", + "FENCE 0 inc over max value and clipped", + "FENCE 1 inc over max value and clipped", + "FENCE 2 inc over max value and clipped", + "FENCE 3 inc over max value and clipped", + "FENCE 0 dec under min value and clipped", + "FENCE 1 dec under min value and clipped", + "FENCE 2 dec under min value and clipped", + "FENCE 3 dec under min value and clipped" +}; + +static const char * const +gaudi_qman_arb_error_cause[GAUDI_NUM_OF_QM_ARB_ERR_CAUSE] = { + "Choice push while full error", + "Choice Q watchdog error", + "MSG AXI LBW returned with error" +}; + +static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = { + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_0 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_1 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_2 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_3 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_0 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_1 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_2 */ + QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_3 */ + QUEUE_TYPE_CPU, /* GAUDI_QUEUE_ID_CPU_PQ */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_3 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_0 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_1 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_2 */ + QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */ +}; + +static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = { + { .id = 0, .name = "SYNC_OBJ_DMA_DOWN_FEEDBACK" }, + { .id = 1, .name = "SYNC_OBJ_DMA_UP_FEEDBACK" }, + { .id = 2, .name = "SYNC_OBJ_DMA_STATIC_DRAM_SRAM_FEEDBACK" }, + { .id = 3, .name = "SYNC_OBJ_DMA_SRAM_DRAM_FEEDBACK" }, + { .id = 4, .name = "SYNC_OBJ_FIRST_COMPUTE_FINISH" }, + { .id = 5, .name = "SYNC_OBJ_HOST_DRAM_DONE" }, + { .id = 6, .name = "SYNC_OBJ_DBG_CTR_DEPRECATED" }, + { .id = 7, .name = "SYNC_OBJ_DMA_ACTIVATIONS_DRAM_SRAM_FEEDBACK" }, + { .id = 8, .name = "SYNC_OBJ_ENGINE_SEM_MME_0" }, + { .id = 9, .name = "SYNC_OBJ_ENGINE_SEM_MME_1" }, + { .id = 10, .name = "SYNC_OBJ_ENGINE_SEM_TPC_0" }, + { .id = 11, .name = "SYNC_OBJ_ENGINE_SEM_TPC_1" }, + { .id = 12, .name = "SYNC_OBJ_ENGINE_SEM_TPC_2" }, + { .id = 13, .name = "SYNC_OBJ_ENGINE_SEM_TPC_3" }, + { .id = 14, .name = "SYNC_OBJ_ENGINE_SEM_TPC_4" }, + { .id = 15, .name = "SYNC_OBJ_ENGINE_SEM_TPC_5" }, + { .id = 16, .name = "SYNC_OBJ_ENGINE_SEM_TPC_6" }, + { .id = 17, .name = "SYNC_OBJ_ENGINE_SEM_TPC_7" }, + { .id = 18, .name = "SYNC_OBJ_ENGINE_SEM_DMA_1" }, + { .id = 19, .name = "SYNC_OBJ_ENGINE_SEM_DMA_2" }, + { .id = 20, .name = "SYNC_OBJ_ENGINE_SEM_DMA_3" }, + { .id = 21, .name = "SYNC_OBJ_ENGINE_SEM_DMA_4" }, + { .id = 22, .name = "SYNC_OBJ_ENGINE_SEM_DMA_5" }, + { .id = 23, .name = "SYNC_OBJ_ENGINE_SEM_DMA_6" }, + { .id = 24, .name = "SYNC_OBJ_ENGINE_SEM_DMA_7" }, + { .id = 25, .name = "SYNC_OBJ_DBG_CTR_0" }, + { .id = 26, .name = "SYNC_OBJ_DBG_CTR_1" }, +}; + +static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = { + { .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" }, + { .id = 201, .name = "MON_OBJ_DMA_UP_FEEDBACK_RESET" }, + { .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" }, + { .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" }, + { .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" }, + { .id = 206, .name = "MON_OBJ_TPC_2_CLK_GATE" }, + { .id = 207, .name = "MON_OBJ_TPC_3_CLK_GATE" }, + { .id = 208, .name = "MON_OBJ_TPC_4_CLK_GATE" }, + { .id = 209, .name = "MON_OBJ_TPC_5_CLK_GATE" }, + { .id = 210, .name = "MON_OBJ_TPC_6_CLK_GATE" }, + { .id = 211, .name = "MON_OBJ_TPC_7_CLK_GATE" }, +}; + +static s64 gaudi_state_dump_specs_props[] = { + [SP_SYNC_OBJ_BASE_ADDR] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0, + [SP_NEXT_SYNC_OBJ_ADDR] = NEXT_SYNC_OBJ_ADDR_INTERVAL, + [SP_SYNC_OBJ_AMOUNT] = NUM_OF_SOB_IN_BLOCK, + [SP_MON_OBJ_WR_ADDR_LOW] = + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0, + [SP_MON_OBJ_WR_ADDR_HIGH] = + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0, + [SP_MON_OBJ_WR_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0, + [SP_MON_OBJ_ARM_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0, + [SP_MON_OBJ_STATUS] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0, + [SP_MONITORS_AMOUNT] = NUM_OF_MONITORS_IN_BLOCK, + [SP_TPC0_CMDQ] = mmTPC0_QM_GLBL_CFG0, + [SP_TPC0_CFG_SO] = mmTPC0_CFG_QM_SYNC_OBJECT_ADDR, + [SP_NEXT_TPC] = mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0, + [SP_MME_CMDQ] = mmMME0_QM_GLBL_CFG0, + [SP_MME_CFG_SO] = mmMME0_CTRL_ARCH_DESC_SYNC_OBJECT_ADDR_LOW_LOCAL, + [SP_NEXT_MME] = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0, + [SP_DMA_CMDQ] = mmDMA0_QM_GLBL_CFG0, + [SP_DMA_CFG_SO] = mmDMA0_CORE_WR_COMP_ADDR_LO, + [SP_DMA_QUEUES_OFFSET] = mmDMA1_QM_GLBL_CFG0 - mmDMA0_QM_GLBL_CFG0, + [SP_NUM_OF_MME_ENGINES] = NUM_OF_MME_ENGINES, + [SP_SUB_MME_ENG_NUM] = NUM_OF_MME_SUB_ENGINES, + [SP_NUM_OF_DMA_ENGINES] = NUM_OF_DMA_ENGINES, + [SP_NUM_OF_TPC_ENGINES] = NUM_OF_TPC_ENGINES, + [SP_ENGINE_NUM_OF_QUEUES] = NUM_OF_QUEUES, + [SP_ENGINE_NUM_OF_STREAMS] = NUM_OF_STREAMS, + [SP_ENGINE_NUM_OF_FENCES] = NUM_OF_FENCES, + [SP_FENCE0_CNT_OFFSET] = + mmDMA0_QM_CP_FENCE0_CNT_0 - mmDMA0_QM_GLBL_CFG0, + [SP_FENCE0_RDATA_OFFSET] = + mmDMA0_QM_CP_FENCE0_RDATA_0 - mmDMA0_QM_GLBL_CFG0, + [SP_CP_STS_OFFSET] = mmDMA0_QM_CP_STS_0 - mmDMA0_QM_GLBL_CFG0, + [SP_NUM_CORES] = 1, +}; + +static const int gaudi_queue_id_to_engine_id[] = { + [GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3] = GAUDI_ENGINE_ID_DMA_0, + [GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3] = GAUDI_ENGINE_ID_DMA_1, + [GAUDI_QUEUE_ID_CPU_PQ] = GAUDI_ENGINE_ID_SIZE, + [GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3] = GAUDI_ENGINE_ID_DMA_2, + [GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3] = GAUDI_ENGINE_ID_DMA_3, + [GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3] = GAUDI_ENGINE_ID_DMA_4, + [GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3] = GAUDI_ENGINE_ID_DMA_5, + [GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6, + [GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7, + [GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0, + [GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_2, + [GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0, + [GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1, + [GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2, + [GAUDI_QUEUE_ID_TPC_3_0...GAUDI_QUEUE_ID_TPC_3_3] = GAUDI_ENGINE_ID_TPC_3, + [GAUDI_QUEUE_ID_TPC_4_0...GAUDI_QUEUE_ID_TPC_4_3] = GAUDI_ENGINE_ID_TPC_4, + [GAUDI_QUEUE_ID_TPC_5_0...GAUDI_QUEUE_ID_TPC_5_3] = GAUDI_ENGINE_ID_TPC_5, + [GAUDI_QUEUE_ID_TPC_6_0...GAUDI_QUEUE_ID_TPC_6_3] = GAUDI_ENGINE_ID_TPC_6, + [GAUDI_QUEUE_ID_TPC_7_0...GAUDI_QUEUE_ID_TPC_7_3] = GAUDI_ENGINE_ID_TPC_7, + [GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3] = GAUDI_ENGINE_ID_NIC_0, + [GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3] = GAUDI_ENGINE_ID_NIC_1, + [GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3] = GAUDI_ENGINE_ID_NIC_2, + [GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3] = GAUDI_ENGINE_ID_NIC_3, + [GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3] = GAUDI_ENGINE_ID_NIC_4, + [GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3] = GAUDI_ENGINE_ID_NIC_5, + [GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3] = GAUDI_ENGINE_ID_NIC_6, + [GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3] = GAUDI_ENGINE_ID_NIC_7, + [GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3] = GAUDI_ENGINE_ID_NIC_8, + [GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3] = GAUDI_ENGINE_ID_NIC_9, +}; + +/* The order here is opposite to the order of the indexing in the h/w. + * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc. + */ +static const char * const gaudi_sync_manager_names[] = { + "SYNC_MGR_E_N", + "SYNC_MGR_W_N", + "SYNC_MGR_E_S", + "SYNC_MGR_W_S", + NULL +}; + +struct ecc_info_extract_params { + u64 block_address; + u32 num_memories; + bool derr; +}; + +static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, + u64 phys_addr); +static int gaudi_send_job_on_qman0(struct hl_device *hdev, + struct hl_cs_job *job); +static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, + u32 size, u64 val); +static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base, + u32 num_regs, u32 val); +static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel, + u32 tpc_id); +static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev); +static int gaudi_cpucp_info_get(struct hl_device *hdev); +static void gaudi_disable_clock_gating(struct hl_device *hdev); +static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid); +static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, + u32 size, bool eb); +static u32 gaudi_gen_wait_cb(struct hl_device *hdev, + struct hl_gen_wait_properties *prop); +static inline enum hl_collective_mode +get_collective_mode(struct hl_device *hdev, u32 queue_id) +{ + if (gaudi_queue_type[queue_id] == QUEUE_TYPE_EXT) + return HL_COLLECTIVE_MASTER; + + if (queue_id >= GAUDI_QUEUE_ID_DMA_5_0 && + queue_id <= GAUDI_QUEUE_ID_DMA_5_3) + return HL_COLLECTIVE_SLAVE; + + if (queue_id >= GAUDI_QUEUE_ID_TPC_7_0 && + queue_id <= GAUDI_QUEUE_ID_TPC_7_3) + return HL_COLLECTIVE_SLAVE; + + if (queue_id >= GAUDI_QUEUE_ID_NIC_0_0 && + queue_id <= GAUDI_QUEUE_ID_NIC_9_3) + return HL_COLLECTIVE_SLAVE; + + return HL_COLLECTIVE_NOT_SUPPORTED; +} + +static inline void set_default_power_values(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + if (hdev->card_type == cpucp_card_type_pmc) { + prop->max_power_default = MAX_POWER_DEFAULT_PMC; + + if (prop->fw_security_enabled) + prop->dc_power_default = DC_POWER_DEFAULT_PMC_SEC; + else + prop->dc_power_default = DC_POWER_DEFAULT_PMC; + } else { + prop->max_power_default = MAX_POWER_DEFAULT_PCI; + prop->dc_power_default = DC_POWER_DEFAULT_PCI; + } +} + +static int gaudi_set_fixed_properties(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 num_sync_stream_queues = 0; + int i; + + prop->max_queues = GAUDI_QUEUE_ID_SIZE; + prop->hw_queues_props = kcalloc(prop->max_queues, + sizeof(struct hw_queue_properties), + GFP_KERNEL); + + if (!prop->hw_queues_props) + return -ENOMEM; + + for (i = 0 ; i < prop->max_queues ; i++) { + if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) { + prop->hw_queues_props[i].type = QUEUE_TYPE_EXT; + prop->hw_queues_props[i].driver_only = 0; + prop->hw_queues_props[i].supports_sync_stream = 1; + prop->hw_queues_props[i].cb_alloc_flags = + CB_ALLOC_KERNEL; + num_sync_stream_queues++; + } else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) { + prop->hw_queues_props[i].type = QUEUE_TYPE_CPU; + prop->hw_queues_props[i].driver_only = 1; + prop->hw_queues_props[i].supports_sync_stream = 0; + prop->hw_queues_props[i].cb_alloc_flags = + CB_ALLOC_KERNEL; + } else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) { + prop->hw_queues_props[i].type = QUEUE_TYPE_INT; + prop->hw_queues_props[i].driver_only = 0; + prop->hw_queues_props[i].supports_sync_stream = 0; + prop->hw_queues_props[i].cb_alloc_flags = + CB_ALLOC_USER; + + } + prop->hw_queues_props[i].collective_mode = + get_collective_mode(hdev, i); + } + + prop->cache_line_size = DEVICE_CACHE_LINE_SIZE; + prop->cfg_base_address = CFG_BASE; + prop->device_dma_offset_for_host_access = HOST_PHYS_BASE; + prop->host_base_address = HOST_PHYS_BASE; + prop->host_end_address = prop->host_base_address + HOST_PHYS_SIZE; + prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; + prop->completion_mode = HL_COMPLETION_MODE_JOB; + prop->collective_first_sob = 0; + prop->collective_first_mon = 0; + + /* 2 SOBs per internal queue stream are reserved for collective */ + prop->sync_stream_first_sob = + ALIGN(NUMBER_OF_SOBS_IN_GRP, HL_MAX_SOBS_PER_MONITOR) + * QMAN_STREAMS * HL_RSVD_SOBS; + + /* 1 monitor per internal queue stream are reserved for collective + * 2 monitors per external queue stream are reserved for collective + */ + prop->sync_stream_first_mon = + (NUMBER_OF_COLLECTIVE_QUEUES * QMAN_STREAMS) + + (NUMBER_OF_EXT_HW_QUEUES * 2); + + prop->dram_base_address = DRAM_PHYS_BASE; + prop->dram_size = GAUDI_HBM_SIZE_32GB; + prop->dram_end_address = prop->dram_base_address + prop->dram_size; + prop->dram_user_base_address = DRAM_BASE_ADDR_USER; + + prop->sram_base_address = SRAM_BASE_ADDR; + prop->sram_size = SRAM_SIZE; + prop->sram_end_address = prop->sram_base_address + prop->sram_size; + prop->sram_user_base_address = + prop->sram_base_address + SRAM_USER_BASE_OFFSET; + + prop->mmu_cache_mng_addr = MMU_CACHE_MNG_ADDR; + prop->mmu_cache_mng_size = MMU_CACHE_MNG_SIZE; + + prop->mmu_pgt_addr = MMU_PAGE_TABLES_ADDR; + if (hdev->pldm) + prop->mmu_pgt_size = 0x800000; /* 8MB */ + else + prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; + prop->mmu_pte_size = HL_PTE_SIZE; + prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; + prop->dram_page_size = PAGE_SIZE_2MB; + prop->device_mem_alloc_default_page_size = prop->dram_page_size; + prop->dram_supports_virtual_memory = false; + + prop->pmmu.hop_shifts[MMU_HOP0] = MMU_V1_1_HOP0_SHIFT; + prop->pmmu.hop_shifts[MMU_HOP1] = MMU_V1_1_HOP1_SHIFT; + prop->pmmu.hop_shifts[MMU_HOP2] = MMU_V1_1_HOP2_SHIFT; + prop->pmmu.hop_shifts[MMU_HOP3] = MMU_V1_1_HOP3_SHIFT; + prop->pmmu.hop_shifts[MMU_HOP4] = MMU_V1_1_HOP4_SHIFT; + prop->pmmu.hop_masks[MMU_HOP0] = MMU_V1_1_HOP0_MASK; + prop->pmmu.hop_masks[MMU_HOP1] = MMU_V1_1_HOP1_MASK; + prop->pmmu.hop_masks[MMU_HOP2] = MMU_V1_1_HOP2_MASK; + prop->pmmu.hop_masks[MMU_HOP3] = MMU_V1_1_HOP3_MASK; + prop->pmmu.hop_masks[MMU_HOP4] = MMU_V1_1_HOP4_MASK; + prop->pmmu.start_addr = VA_HOST_SPACE_START; + prop->pmmu.end_addr = + (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1; + prop->pmmu.page_size = PAGE_SIZE_4KB; + prop->pmmu.num_hops = MMU_ARCH_5_HOPS; + prop->pmmu.last_mask = LAST_MASK; + /* TODO: will be duplicated until implementing per-MMU props */ + prop->pmmu.hop_table_size = prop->mmu_hop_table_size; + prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + + /* PMMU and HPMMU are the same except of page size */ + memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); + prop->pmmu_huge.page_size = PAGE_SIZE_2MB; + + /* shifts and masks are the same in PMMU and DMMU */ + memcpy(&prop->dmmu, &prop->pmmu, sizeof(prop->pmmu)); + prop->dmmu.start_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2); + prop->dmmu.end_addr = VA_HOST_SPACE_END; + prop->dmmu.page_size = PAGE_SIZE_2MB; + + prop->cfg_size = CFG_SIZE; + prop->max_asid = MAX_ASID; + prop->num_of_events = GAUDI_EVENT_SIZE; + prop->tpc_enabled_mask = TPC_ENABLED_MASK; + + set_default_power_values(hdev); + + prop->cb_pool_cb_cnt = GAUDI_CB_POOL_CB_CNT; + prop->cb_pool_cb_size = GAUDI_CB_POOL_CB_SIZE; + + prop->pcie_dbi_base_address = mmPCIE_DBI_BASE; + prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI; + + strncpy(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME, + CARD_NAME_MAX_LEN); + + prop->max_pending_cs = GAUDI_MAX_PENDING_CS; + + prop->first_available_user_sob[HL_GAUDI_WS_DCORE] = + prop->sync_stream_first_sob + + (num_sync_stream_queues * HL_RSVD_SOBS); + prop->first_available_user_mon[HL_GAUDI_WS_DCORE] = + prop->sync_stream_first_mon + + (num_sync_stream_queues * HL_RSVD_MONS); + + prop->first_available_user_interrupt = USHRT_MAX; + + for (i = 0 ; i < HL_MAX_DCORES ; i++) + prop->first_available_cq[i] = USHRT_MAX; + + prop->fw_cpu_boot_dev_sts0_valid = false; + prop->fw_cpu_boot_dev_sts1_valid = false; + prop->hard_reset_done_by_fw = false; + prop->gic_interrupts_enable = true; + + prop->server_type = HL_SERVER_TYPE_UNKNOWN; + + prop->clk_pll_index = HL_GAUDI_MME_PLL; + prop->max_freq_value = GAUDI_MAX_CLK_FREQ; + + prop->use_get_power_for_reset_history = true; + + prop->configurable_stop_on_err = true; + + prop->set_max_power_on_device_init = true; + + prop->dma_mask = 48; + + return 0; +} + +static int gaudi_pci_bars_map(struct hl_device *hdev) +{ + static const char * const name[] = {"SRAM", "CFG", "HBM"}; + bool is_wc[3] = {false, false, true}; + int rc; + + rc = hl_pci_bars_map(hdev, name, is_wc); + if (rc) + return rc; + + hdev->rmmio = hdev->pcie_bar[CFG_BAR_ID] + + (CFG_BASE - SPI_FLASH_BASE_ADDR); + + return 0; +} + +static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct hl_inbound_pci_region pci_region; + u64 old_addr = addr; + int rc; + + if ((gaudi) && (gaudi->hbm_bar_cur_addr == addr)) + return old_addr; + + if (hdev->asic_prop.iatu_done_by_fw) + return U64_MAX; + + /* Inbound Region 2 - Bar 4 - Point to HBM */ + pci_region.mode = PCI_BAR_MATCH_MODE; + pci_region.bar = HBM_BAR_ID; + pci_region.addr = addr; + rc = hl_pci_set_inbound_region(hdev, 2, &pci_region); + if (rc) + return U64_MAX; + + if (gaudi) { + old_addr = gaudi->hbm_bar_cur_addr; + gaudi->hbm_bar_cur_addr = addr; + } + + return old_addr; +} + +static int gaudi_init_iatu(struct hl_device *hdev) +{ + struct hl_inbound_pci_region inbound_region; + struct hl_outbound_pci_region outbound_region; + int rc; + + if (hdev->asic_prop.iatu_done_by_fw) + return 0; + + /* Inbound Region 0 - Bar 0 - Point to SRAM + CFG */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = SRAM_BAR_ID; + inbound_region.addr = SRAM_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region); + if (rc) + goto done; + + /* Inbound Region 1 - Bar 2 - Point to SPI FLASH */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = CFG_BAR_ID; + inbound_region.addr = SPI_FLASH_BASE_ADDR; + rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region); + if (rc) + goto done; + + /* Inbound Region 2 - Bar 4 - Point to HBM */ + inbound_region.mode = PCI_BAR_MATCH_MODE; + inbound_region.bar = HBM_BAR_ID; + inbound_region.addr = DRAM_PHYS_BASE; + rc = hl_pci_set_inbound_region(hdev, 2, &inbound_region); + if (rc) + goto done; + + /* Outbound Region 0 - Point to Host */ + outbound_region.addr = HOST_PHYS_BASE; + outbound_region.size = HOST_PHYS_SIZE; + rc = hl_pci_set_outbound_region(hdev, &outbound_region); + +done: + return rc; +} + +static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev) +{ + return RREG32(mmHW_STATE); +} + +static int gaudi_early_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pci_dev *pdev = hdev->pdev; + resource_size_t pci_bar_size; + u32 fw_boot_status; + int rc; + + rc = gaudi_set_fixed_properties(hdev); + if (rc) { + dev_err(hdev->dev, "Failed setting fixed properties\n"); + return rc; + } + + /* Check BAR sizes */ + pci_bar_size = pci_resource_len(pdev, SRAM_BAR_ID); + + if (pci_bar_size != SRAM_BAR_SIZE) { + dev_err(hdev->dev, "Not " HL_NAME "? BAR %d size %pa, expecting %llu\n", + SRAM_BAR_ID, &pci_bar_size, SRAM_BAR_SIZE); + rc = -ENODEV; + goto free_queue_props; + } + + pci_bar_size = pci_resource_len(pdev, CFG_BAR_ID); + + if (pci_bar_size != CFG_BAR_SIZE) { + dev_err(hdev->dev, "Not " HL_NAME "? BAR %d size %pa, expecting %llu\n", + CFG_BAR_ID, &pci_bar_size, CFG_BAR_SIZE); + rc = -ENODEV; + goto free_queue_props; + } + + prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID); + hdev->dram_pci_bar_start = pci_resource_start(pdev, HBM_BAR_ID); + + /* If FW security is enabled at this point it means no access to ELBI */ + if (hdev->asic_prop.fw_security_enabled) { + hdev->asic_prop.iatu_done_by_fw = true; + + /* + * GIC-security-bit can ONLY be set by CPUCP, so in this stage + * decision can only be taken based on PCI ID security. + */ + hdev->asic_prop.gic_interrupts_enable = false; + goto pci_init; + } + + rc = hl_pci_elbi_read(hdev, CFG_BASE + mmCPU_BOOT_DEV_STS0, + &fw_boot_status); + if (rc) + goto free_queue_props; + + /* Check whether FW is configuring iATU */ + if ((fw_boot_status & CPU_BOOT_DEV_STS0_ENABLED) && + (fw_boot_status & CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN)) + hdev->asic_prop.iatu_done_by_fw = true; + +pci_init: + rc = hl_pci_init(hdev); + if (rc) + goto free_queue_props; + + /* Before continuing in the initialization, we need to read the preboot + * version to determine whether we run with a security-enabled firmware + */ + rc = hl_fw_read_preboot_status(hdev); + if (rc) { + if (hdev->reset_on_preboot_fail) + hdev->asic_funcs->hw_fini(hdev, true, false); + goto pci_fini; + } + + if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { + dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n"); + hdev->asic_funcs->hw_fini(hdev, true, false); + } + + return 0; + +pci_fini: + hl_pci_fini(hdev); +free_queue_props: + kfree(hdev->asic_prop.hw_queues_props); + return rc; +} + +static int gaudi_early_fini(struct hl_device *hdev) +{ + kfree(hdev->asic_prop.hw_queues_props); + hl_pci_fini(hdev); + + return 0; +} + +/** + * gaudi_fetch_psoc_frequency - Fetch PSOC frequency values + * + * @hdev: pointer to hl_device structure + * + */ +static int gaudi_fetch_psoc_frequency(struct hl_device *hdev) +{ + u32 nr = 0, nf = 0, od = 0, div_fctr = 0, pll_clk, div_sel; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u16 pll_freq_arr[HL_PLL_NUM_OUTPUTS], freq; + int rc; + + if ((hdev->fw_components & FW_TYPE_LINUX) && + (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PLL_INFO_EN)) { + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + rc = hl_fw_cpucp_pll_info_get(hdev, HL_GAUDI_CPU_PLL, pll_freq_arr); + + if (rc) + return rc; + + freq = pll_freq_arr[2]; + } else { + /* Backward compatibility */ + div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2); + div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2); + nr = RREG32(mmPSOC_CPU_PLL_NR); + nf = RREG32(mmPSOC_CPU_PLL_NF); + od = RREG32(mmPSOC_CPU_PLL_OD); + + if (div_sel == DIV_SEL_REF_CLK || + div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + freq = PLL_REF_CLK; + else + freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / + ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + freq = pll_clk; + else + freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, "Received invalid div select value: %#x", div_sel); + freq = 0; + } + } + + prop->psoc_timestamp_frequency = freq; + prop->psoc_pci_pll_nr = nr; + prop->psoc_pci_pll_nf = nf; + prop->psoc_pci_pll_od = od; + prop->psoc_pci_pll_div_factor = div_fctr; + + return 0; +} + +static int _gaudi_init_tpc_mem(struct hl_device *hdev, + dma_addr_t tpc_kernel_src_addr, u32 tpc_kernel_size) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct packet_lin_dma *init_tpc_mem_pkt; + struct hl_cs_job *job; + struct hl_cb *cb; + u64 dst_addr; + u32 cb_size, ctl; + u8 tpc_id; + int rc; + + cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false); + if (!cb) + return -EFAULT; + + init_tpc_mem_pkt = cb->kernel_address; + cb_size = sizeof(*init_tpc_mem_pkt); + memset(init_tpc_mem_pkt, 0, cb_size); + + init_tpc_mem_pkt->tsize = cpu_to_le32(tpc_kernel_size); + + ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); + ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + init_tpc_mem_pkt->ctl = cpu_to_le32(ctl); + + init_tpc_mem_pkt->src_addr = cpu_to_le64(tpc_kernel_src_addr); + + /* TPC_CMD is configured with I$ prefetch enabled, so address should be aligned to 8KB */ + dst_addr = FIELD_PREP(GAUDI_PKT_LIN_DMA_DST_ADDR_MASK, + round_up(prop->sram_user_base_address, SZ_8K)); + init_tpc_mem_pkt->dst_addr |= cpu_to_le64(dst_addr); + + job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto release_cb; + } + + job->id = 0; + job->user_cb = cb; + atomic_inc(&job->user_cb->cs_cnt); + job->user_cb_size = cb_size; + job->hw_queue_id = GAUDI_QUEUE_ID_DMA_0_0; + job->patched_cb = job->user_cb; + job->job_cb_size = job->user_cb_size + sizeof(struct packet_msg_prot); + + hl_debugfs_add_job(hdev, job); + + rc = gaudi_send_job_on_qman0(hdev, job); + + if (rc) + goto free_job; + + for (tpc_id = 0 ; tpc_id < TPC_NUMBER_OF_ENGINES ; tpc_id++) { + rc = gaudi_run_tpc_kernel(hdev, dst_addr, tpc_id); + if (rc) + break; + } + +free_job: + hl_userptr_delete_list(hdev, &job->userptr_list); + hl_debugfs_remove_job(hdev, job); + kfree(job); + atomic_dec(&cb->cs_cnt); + +release_cb: + hl_cb_put(cb); + hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle); + + return rc; +} + +/* + * gaudi_init_tpc_mem() - Initialize TPC memories. + * @hdev: Pointer to hl_device structure. + * + * Copy TPC kernel fw from firmware file and run it to initialize TPC memories. + * + * Return: 0 for success, negative value for error. + */ +static int gaudi_init_tpc_mem(struct hl_device *hdev) +{ + const struct firmware *fw; + size_t fw_size; + void *cpu_addr; + dma_addr_t dma_handle; + int rc, count = 5; + +again: + rc = request_firmware(&fw, GAUDI_TPC_FW_FILE, hdev->dev); + if (rc == -EINTR && count-- > 0) { + msleep(50); + goto again; + } + + if (rc) { + dev_err(hdev->dev, "Failed to load firmware file %s\n", + GAUDI_TPC_FW_FILE); + goto out; + } + + fw_size = fw->size; + cpu_addr = hl_asic_dma_alloc_coherent(hdev, fw_size, &dma_handle, GFP_KERNEL | __GFP_ZERO); + if (!cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate %zu of dma memory for TPC kernel\n", + fw_size); + rc = -ENOMEM; + goto out; + } + + memcpy(cpu_addr, fw->data, fw_size); + + rc = _gaudi_init_tpc_mem(hdev, dma_handle, fw_size); + + hl_asic_dma_free_coherent(hdev, fw->size, cpu_addr, dma_handle); + +out: + release_firmware(fw); + return rc; +} + +static void gaudi_collective_map_sobs(struct hl_device *hdev, u32 stream) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_collective_properties *prop = &gaudi->collective_props; + struct hl_hw_queue *q; + u32 i, sob_id, sob_group_id, queue_id; + + /* Iterate through SOB groups and assign a SOB for each slave queue */ + sob_group_id = + stream * HL_RSVD_SOBS + prop->curr_sob_group_idx[stream]; + sob_id = prop->hw_sob_group[sob_group_id].base_sob_id; + + queue_id = GAUDI_QUEUE_ID_NIC_0_0 + stream; + for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++) { + q = &hdev->kernel_queues[queue_id + (4 * i)]; + q->sync_stream_prop.collective_sob_id = sob_id + i; + } + + /* Both DMA5 and TPC7 use the same resources since only a single + * engine need to participate in the reduction process + */ + queue_id = GAUDI_QUEUE_ID_DMA_5_0 + stream; + q = &hdev->kernel_queues[queue_id]; + q->sync_stream_prop.collective_sob_id = + sob_id + NIC_NUMBER_OF_ENGINES; + + queue_id = GAUDI_QUEUE_ID_TPC_7_0 + stream; + q = &hdev->kernel_queues[queue_id]; + q->sync_stream_prop.collective_sob_id = + sob_id + NIC_NUMBER_OF_ENGINES; +} + +static void gaudi_sob_group_hw_reset(struct kref *ref) +{ + struct gaudi_hw_sob_group *hw_sob_group = + container_of(ref, struct gaudi_hw_sob_group, kref); + struct hl_device *hdev = hw_sob_group->hdev; + int i; + + for (i = 0 ; i < NUMBER_OF_SOBS_IN_GRP ; i++) + WREG32((mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + + (hw_sob_group->base_sob_id * 4) + (i * 4)), 0); + + kref_init(&hw_sob_group->kref); +} + +static void gaudi_sob_group_reset_error(struct kref *ref) +{ + struct gaudi_hw_sob_group *hw_sob_group = + container_of(ref, struct gaudi_hw_sob_group, kref); + struct hl_device *hdev = hw_sob_group->hdev; + + dev_crit(hdev->dev, + "SOB release shouldn't be called here, base_sob_id: %d\n", + hw_sob_group->base_sob_id); +} + +static void gaudi_collective_mstr_sob_mask_set(struct gaudi_device *gaudi) +{ + struct gaudi_collective_properties *prop; + int i; + + prop = &gaudi->collective_props; + + memset(prop->mstr_sob_mask, 0, sizeof(prop->mstr_sob_mask)); + + for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++) + if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + i)) + prop->mstr_sob_mask[i / HL_MAX_SOBS_PER_MONITOR] |= + BIT(i % HL_MAX_SOBS_PER_MONITOR); + /* Set collective engine bit */ + prop->mstr_sob_mask[i / HL_MAX_SOBS_PER_MONITOR] |= + BIT(i % HL_MAX_SOBS_PER_MONITOR); +} + +static int gaudi_collective_init(struct hl_device *hdev) +{ + u32 i, sob_id, reserved_sobs_per_group; + struct gaudi_collective_properties *prop; + struct gaudi_device *gaudi; + + gaudi = hdev->asic_specific; + prop = &gaudi->collective_props; + sob_id = hdev->asic_prop.collective_first_sob; + + /* First sob in group must be aligned to HL_MAX_SOBS_PER_MONITOR */ + reserved_sobs_per_group = + ALIGN(NUMBER_OF_SOBS_IN_GRP, HL_MAX_SOBS_PER_MONITOR); + + /* Init SOB groups */ + for (i = 0 ; i < NUM_SOB_GROUPS; i++) { + prop->hw_sob_group[i].hdev = hdev; + prop->hw_sob_group[i].base_sob_id = sob_id; + sob_id += reserved_sobs_per_group; + gaudi_sob_group_hw_reset(&prop->hw_sob_group[i].kref); + } + + for (i = 0 ; i < QMAN_STREAMS; i++) { + prop->next_sob_group_val[i] = 1; + prop->curr_sob_group_idx[i] = 0; + gaudi_collective_map_sobs(hdev, i); + } + + gaudi_collective_mstr_sob_mask_set(gaudi); + + return 0; +} + +static void gaudi_reset_sob_group(struct hl_device *hdev, u16 sob_group) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_collective_properties *cprop = &gaudi->collective_props; + + kref_put(&cprop->hw_sob_group[sob_group].kref, + gaudi_sob_group_hw_reset); +} + +static void gaudi_collective_master_init_job(struct hl_device *hdev, + struct hl_cs_job *job, u32 stream, u32 sob_group_offset) +{ + u32 master_sob_base, master_monitor, queue_id, cb_size = 0; + struct gaudi_collective_properties *cprop; + struct hl_gen_wait_properties wait_prop; + struct hl_sync_stream_properties *prop; + struct gaudi_device *gaudi; + + gaudi = hdev->asic_specific; + cprop = &gaudi->collective_props; + queue_id = job->hw_queue_id; + prop = &hdev->kernel_queues[queue_id].sync_stream_prop; + + master_sob_base = + cprop->hw_sob_group[sob_group_offset].base_sob_id; + master_monitor = prop->collective_mstr_mon_id[0]; + + cprop->hw_sob_group[sob_group_offset].queue_id = queue_id; + + dev_dbg(hdev->dev, + "Generate master wait CBs, sob %d (mask %#x), val:0x%x, mon %u, q %d\n", + master_sob_base, cprop->mstr_sob_mask[0], + cprop->next_sob_group_val[stream], + master_monitor, queue_id); + + wait_prop.data = (void *) job->patched_cb; + wait_prop.sob_base = master_sob_base; + wait_prop.sob_mask = cprop->mstr_sob_mask[0]; + wait_prop.sob_val = cprop->next_sob_group_val[stream]; + wait_prop.mon_id = master_monitor; + wait_prop.q_idx = queue_id; + wait_prop.size = cb_size; + cb_size += gaudi_gen_wait_cb(hdev, &wait_prop); + + master_sob_base += HL_MAX_SOBS_PER_MONITOR; + master_monitor = prop->collective_mstr_mon_id[1]; + + dev_dbg(hdev->dev, + "Generate master wait CBs, sob %d (mask %#x), val:0x%x, mon %u, q %d\n", + master_sob_base, cprop->mstr_sob_mask[1], + cprop->next_sob_group_val[stream], + master_monitor, queue_id); + + wait_prop.sob_base = master_sob_base; + wait_prop.sob_mask = cprop->mstr_sob_mask[1]; + wait_prop.mon_id = master_monitor; + wait_prop.size = cb_size; + cb_size += gaudi_gen_wait_cb(hdev, &wait_prop); +} + +static void gaudi_collective_slave_init_job(struct hl_device *hdev, + struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl) +{ + struct hl_gen_wait_properties wait_prop; + struct hl_sync_stream_properties *prop; + u32 queue_id, cb_size = 0; + + queue_id = job->hw_queue_id; + prop = &hdev->kernel_queues[queue_id].sync_stream_prop; + + if (job->cs->encaps_signals) { + /* use the encaps signal handle store earlier in the flow + * and set the SOB information from the encaps + * signals handle + */ + hl_hw_queue_encaps_sig_set_sob_info(hdev, job->cs, job, + cs_cmpl); + + dev_dbg(hdev->dev, "collective wait: Sequence %llu found, sob_id: %u, wait for sob_val: %u\n", + job->cs->sequence, + cs_cmpl->hw_sob->sob_id, + cs_cmpl->sob_val); + } + + /* Add to wait CBs using slave monitor */ + wait_prop.data = (void *) job->user_cb; + wait_prop.sob_base = cs_cmpl->hw_sob->sob_id; + wait_prop.sob_mask = 0x1; + wait_prop.sob_val = cs_cmpl->sob_val; + wait_prop.mon_id = prop->collective_slave_mon_id; + wait_prop.q_idx = queue_id; + wait_prop.size = cb_size; + + dev_dbg(hdev->dev, + "Generate slave wait CB, sob %d, val:%x, mon %d, q %d\n", + cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, + prop->collective_slave_mon_id, queue_id); + + cb_size += gaudi_gen_wait_cb(hdev, &wait_prop); + + dev_dbg(hdev->dev, + "generate signal CB, sob_id: %d, sob val: 1, q_idx: %d\n", + prop->collective_sob_id, queue_id); + + cb_size += gaudi_gen_signal_cb(hdev, job->user_cb, + prop->collective_sob_id, cb_size, false); +} + +static int gaudi_collective_wait_init_cs(struct hl_cs *cs) +{ + struct hl_cs_compl *signal_cs_cmpl = + container_of(cs->signal_fence, struct hl_cs_compl, base_fence); + struct hl_cs_compl *cs_cmpl = + container_of(cs->fence, struct hl_cs_compl, base_fence); + struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl; + struct gaudi_collective_properties *cprop; + u32 stream, queue_id, sob_group_offset; + struct gaudi_device *gaudi; + struct hl_device *hdev; + struct hl_cs_job *job; + struct hl_ctx *ctx; + + ctx = cs->ctx; + hdev = ctx->hdev; + gaudi = hdev->asic_specific; + cprop = &gaudi->collective_props; + + if (cs->encaps_signals) { + cs_cmpl->hw_sob = handle->hw_sob; + /* at this checkpoint we only need the hw_sob pointer + * for the completion check before start going over the jobs + * of the master/slaves, the sob_value will be taken later on + * in gaudi_collective_slave_init_job depends on each + * job wait offset value. + */ + cs_cmpl->sob_val = 0; + } else { + /* copy the SOB id and value of the signal CS */ + cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; + cs_cmpl->sob_val = signal_cs_cmpl->sob_val; + } + + /* check again if the signal cs already completed. + * if yes then don't send any wait cs since the hw_sob + * could be in reset already. if signal is not completed + * then get refcount to hw_sob to prevent resetting the sob + * while wait cs is not submitted. + * note that this check is protected by two locks, + * hw queue lock and completion object lock, + * and the same completion object lock also protects + * the hw_sob reset handler function. + * The hw_queue lock prevent out of sync of hw_sob + * refcount value, changed by signal/wait flows. + */ + spin_lock(&signal_cs_cmpl->lock); + + if (completion_done(&cs->signal_fence->completion)) { + spin_unlock(&signal_cs_cmpl->lock); + return -EINVAL; + } + /* Increment kref since all slave queues are now waiting on it */ + kref_get(&cs_cmpl->hw_sob->kref); + + spin_unlock(&signal_cs_cmpl->lock); + + /* Calculate the stream from collective master queue (1st job) */ + job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node); + stream = job->hw_queue_id % 4; + sob_group_offset = + stream * HL_RSVD_SOBS + cprop->curr_sob_group_idx[stream]; + + list_for_each_entry(job, &cs->job_list, cs_node) { + queue_id = job->hw_queue_id; + + if (hdev->kernel_queues[queue_id].collective_mode == + HL_COLLECTIVE_MASTER) + gaudi_collective_master_init_job(hdev, job, stream, + sob_group_offset); + else + gaudi_collective_slave_init_job(hdev, job, cs_cmpl); + } + + cs_cmpl->sob_group = sob_group_offset; + + /* Handle sob group kref and wraparound */ + kref_get(&cprop->hw_sob_group[sob_group_offset].kref); + cprop->next_sob_group_val[stream]++; + + if (cprop->next_sob_group_val[stream] == HL_MAX_SOB_VAL) { + /* + * Decrement as we reached the max value. + * The release function won't be called here as we've + * just incremented the refcount. + */ + kref_put(&cprop->hw_sob_group[sob_group_offset].kref, + gaudi_sob_group_reset_error); + cprop->next_sob_group_val[stream] = 1; + /* only two SOBs are currently in use */ + cprop->curr_sob_group_idx[stream] = + (cprop->curr_sob_group_idx[stream] + 1) & + (HL_RSVD_SOBS - 1); + + gaudi_collective_map_sobs(hdev, stream); + + dev_dbg(hdev->dev, "switched to SOB group %d, stream: %d\n", + cprop->curr_sob_group_idx[stream], stream); + } + + mb(); + hl_fence_put(cs->signal_fence); + cs->signal_fence = NULL; + + return 0; +} + +static u32 gaudi_get_patched_cb_extra_size(u32 user_cb_size) +{ + u32 cacheline_end, additional_commands; + + cacheline_end = round_up(user_cb_size, DEVICE_CACHE_LINE_SIZE); + additional_commands = sizeof(struct packet_msg_prot) * 2; + + if (user_cb_size + additional_commands > cacheline_end) + return cacheline_end - user_cb_size + additional_commands; + else + return additional_commands; +} + +static int gaudi_collective_wait_create_job(struct hl_device *hdev, + struct hl_ctx *ctx, struct hl_cs *cs, + enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id, + u32 encaps_signal_offset) +{ + struct hw_queue_properties *hw_queue_prop; + struct hl_cs_counters_atomic *cntr; + struct hl_cs_job *job; + struct hl_cb *cb; + u32 cb_size; + bool patched_cb; + + cntr = &hdev->aggregated_cs_counters; + + if (mode == HL_COLLECTIVE_MASTER) { + /* CB size of collective master queue contains + * 4 msg short packets for monitor 1 configuration + * 1 fence packet + * 4 msg short packets for monitor 2 configuration + * 1 fence packet + * 2 msg prot packets for completion and MSI + */ + cb_size = sizeof(struct packet_msg_short) * 8 + + sizeof(struct packet_fence) * 2 + + sizeof(struct packet_msg_prot) * 2; + patched_cb = true; + } else { + /* CB size of collective slave queues contains + * 4 msg short packets for monitor configuration + * 1 fence packet + * 1 additional msg short packet for sob signal + */ + cb_size = sizeof(struct packet_msg_short) * 5 + + sizeof(struct packet_fence); + patched_cb = false; + } + + hw_queue_prop = &hdev->asic_prop.hw_queues_props[queue_id]; + job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true); + if (!job) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + dev_err(hdev->dev, "Failed to allocate a new job\n"); + return -ENOMEM; + } + + /* Allocate internal mapped CB for non patched CBs */ + cb = hl_cb_kernel_create(hdev, cb_size, + hdev->mmu_enable && !patched_cb); + if (!cb) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + kfree(job); + return -EFAULT; + } + + job->id = 0; + job->cs = cs; + job->user_cb = cb; + atomic_inc(&job->user_cb->cs_cnt); + job->user_cb_size = cb_size; + job->hw_queue_id = queue_id; + + /* since its guaranteed to have only one chunk in the collective wait + * cs, we can use this chunk to set the encapsulated signal offset + * in the jobs. + */ + if (cs->encaps_signals) + job->encaps_sig_wait_offset = encaps_signal_offset; + + /* + * No need in parsing, user CB is the patched CB. + * We call hl_cb_destroy() out of two reasons - we don't need + * the CB in the CB idr anymore and to decrement its refcount as + * it was incremented inside hl_cb_kernel_create(). + */ + if (patched_cb) + job->patched_cb = job->user_cb; + else + job->patched_cb = NULL; + + job->job_cb_size = job->user_cb_size; + hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle); + + /* increment refcount as for external queues we get completion */ + if (hw_queue_prop->type == QUEUE_TYPE_EXT) + cs_get(cs); + + cs->jobs_in_queue_cnt[job->hw_queue_id]++; + + list_add_tail(&job->cs_node, &cs->job_list); + + hl_debugfs_add_job(hdev, job); + + return 0; +} + +static int gaudi_collective_wait_create_jobs(struct hl_device *hdev, + struct hl_ctx *ctx, struct hl_cs *cs, + u32 wait_queue_id, u32 collective_engine_id, + u32 encaps_signal_offset) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct hw_queue_properties *hw_queue_prop; + u32 queue_id, collective_queue, num_jobs; + u32 stream, nic_queue, nic_idx = 0; + bool skip; + int i, rc = 0; + + /* Verify wait queue id is configured as master */ + hw_queue_prop = &hdev->asic_prop.hw_queues_props[wait_queue_id]; + if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) { + dev_err(hdev->dev, + "Queue %d is not configured as collective master\n", + wait_queue_id); + return -EINVAL; + } + + /* Verify engine id is supported */ + if (collective_engine_id != GAUDI_ENGINE_ID_DMA_5 && + collective_engine_id != GAUDI_ENGINE_ID_TPC_7) { + dev_err(hdev->dev, + "Collective wait does not support engine %u\n", + collective_engine_id); + return -EINVAL; + } + + stream = wait_queue_id % 4; + + if (collective_engine_id == GAUDI_ENGINE_ID_DMA_5) + collective_queue = GAUDI_QUEUE_ID_DMA_5_0 + stream; + else + collective_queue = GAUDI_QUEUE_ID_TPC_7_0 + stream; + + num_jobs = NUMBER_OF_SOBS_IN_GRP + 1; + nic_queue = GAUDI_QUEUE_ID_NIC_0_0 + stream; + + /* First job goes to the collective master queue, it will wait for + * the collective slave queues to finish execution. + * The synchronization is done using two monitors: + * First monitor for NICs 0-7, second monitor for NICs 8-9 and the + * reduction engine (DMA5/TPC7). + * + * Rest of the jobs goes to the collective slave queues which will + * all wait for the user to signal sob 'cs_cmpl->sob_val'. + */ + for (i = 0 ; i < num_jobs ; i++) { + if (i == 0) { + queue_id = wait_queue_id; + rc = gaudi_collective_wait_create_job(hdev, ctx, cs, + HL_COLLECTIVE_MASTER, queue_id, + wait_queue_id, encaps_signal_offset); + } else { + if (nic_idx < NIC_NUMBER_OF_ENGINES) { + if (gaudi->hw_cap_initialized & + BIT(HW_CAP_NIC_SHIFT + nic_idx)) + skip = false; + else + skip = true; + + queue_id = nic_queue; + nic_queue += 4; + nic_idx++; + + if (skip) + continue; + } else { + queue_id = collective_queue; + } + + rc = gaudi_collective_wait_create_job(hdev, ctx, cs, + HL_COLLECTIVE_SLAVE, queue_id, + wait_queue_id, encaps_signal_offset); + } + + if (rc) + return rc; + } + + return rc; +} + +static int gaudi_late_init(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int rc; + + rc = gaudi->cpucp_info_get(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to get cpucp info\n"); + return rc; + } + + if ((hdev->card_type == cpucp_card_type_pci) && + (hdev->nic_ports_mask & 0x3)) { + dev_info(hdev->dev, + "PCI card detected, only 8 ports are enabled\n"); + hdev->nic_ports_mask &= ~0x3; + + /* Stop and disable unused NIC QMANs */ + WREG32(mmNIC0_QM0_GLBL_CFG1, NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + WREG32(mmNIC0_QM1_GLBL_CFG1, NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + WREG32(mmNIC0_QM0_GLBL_CFG0, 0); + WREG32(mmNIC0_QM1_GLBL_CFG0, 0); + + gaudi->hw_cap_initialized &= ~(HW_CAP_NIC0 | HW_CAP_NIC1); + } + + rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0); + if (rc) { + dev_err(hdev->dev, "Failed to enable PCI access from CPU\n"); + return rc; + } + + /* Scrub both SRAM and DRAM */ + rc = hdev->asic_funcs->scrub_device_mem(hdev); + if (rc) + goto disable_pci_access; + + rc = gaudi_fetch_psoc_frequency(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to fetch psoc frequency\n"); + goto disable_pci_access; + } + + rc = gaudi_mmu_clear_pgt_range(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to clear MMU page tables range\n"); + goto disable_pci_access; + } + + rc = gaudi_init_tpc_mem(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize TPC memories\n"); + goto disable_pci_access; + } + + rc = gaudi_collective_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to init collective\n"); + goto disable_pci_access; + } + + /* We only support a single ASID for the user, so for the sake of optimization, just + * initialize the ASID one time during device initialization with the fixed value of 1 + */ + gaudi_mmu_prepare(hdev, 1); + + hl_fw_set_pll_profile(hdev); + + return 0; + +disable_pci_access: + hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); + + return rc; +} + +static void gaudi_late_fini(struct hl_device *hdev) +{ + hl_hwmon_release_resources(hdev); +} + +static int gaudi_alloc_cpu_accessible_dma_mem(struct hl_device *hdev) +{ + dma_addr_t dma_addr_arr[GAUDI_ALLOC_CPU_MEM_RETRY_CNT] = {}, end_addr; + void *virt_addr_arr[GAUDI_ALLOC_CPU_MEM_RETRY_CNT] = {}; + int i, j, rc = 0; + + /* + * The device CPU works with 40-bits addresses, while bit 39 must be set + * to '1' when accessing the host. + * Bits 49:39 of the full host address are saved for a later + * configuration of the HW to perform extension to 50 bits. + * Because there is a single HW register that holds the extension bits, + * these bits must be identical in all allocated range. + */ + + for (i = 0 ; i < GAUDI_ALLOC_CPU_MEM_RETRY_CNT ; i++) { + virt_addr_arr[i] = hl_asic_dma_alloc_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, + &dma_addr_arr[i], + GFP_KERNEL | __GFP_ZERO); + if (!virt_addr_arr[i]) { + rc = -ENOMEM; + goto free_dma_mem_arr; + } + + end_addr = dma_addr_arr[i] + HL_CPU_ACCESSIBLE_MEM_SIZE - 1; + if (GAUDI_CPU_PCI_MSB_ADDR(dma_addr_arr[i]) == + GAUDI_CPU_PCI_MSB_ADDR(end_addr)) + break; + } + + if (i == GAUDI_ALLOC_CPU_MEM_RETRY_CNT) { + dev_err(hdev->dev, + "MSB of CPU accessible DMA memory are not identical in all range\n"); + rc = -EFAULT; + goto free_dma_mem_arr; + } + + hdev->cpu_accessible_dma_mem = virt_addr_arr[i]; + hdev->cpu_accessible_dma_address = dma_addr_arr[i]; + hdev->cpu_pci_msb_addr = + GAUDI_CPU_PCI_MSB_ADDR(hdev->cpu_accessible_dma_address); + + if (!hdev->asic_prop.fw_security_enabled) + GAUDI_PCI_TO_CPU_ADDR(hdev->cpu_accessible_dma_address); + +free_dma_mem_arr: + for (j = 0 ; j < i ; j++) + hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, virt_addr_arr[j], + dma_addr_arr[j]); + + return rc; +} + +static void gaudi_free_internal_qmans_pq_mem(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + u32 i; + + for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) { + q = &gaudi->internal_qmans[i]; + if (!q->pq_kernel_addr) + continue; + hl_asic_dma_free_coherent(hdev, q->pq_size, q->pq_kernel_addr, q->pq_dma_addr); + } +} + +static int gaudi_alloc_internal_qmans_pq_mem(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + int rc, i; + + for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) { + if (gaudi_queue_type[i] != QUEUE_TYPE_INT) + continue; + + q = &gaudi->internal_qmans[i]; + + switch (i) { + case GAUDI_QUEUE_ID_DMA_2_0 ... GAUDI_QUEUE_ID_DMA_7_3: + q->pq_size = HBM_DMA_QMAN_SIZE_IN_BYTES; + break; + case GAUDI_QUEUE_ID_MME_0_0 ... GAUDI_QUEUE_ID_MME_1_3: + q->pq_size = MME_QMAN_SIZE_IN_BYTES; + break; + case GAUDI_QUEUE_ID_TPC_0_0 ... GAUDI_QUEUE_ID_TPC_7_3: + q->pq_size = TPC_QMAN_SIZE_IN_BYTES; + break; + case GAUDI_QUEUE_ID_NIC_0_0 ... GAUDI_QUEUE_ID_NIC_9_3: + q->pq_size = NIC_QMAN_SIZE_IN_BYTES; + break; + default: + dev_err(hdev->dev, "Bad internal queue index %d", i); + rc = -EINVAL; + goto free_internal_qmans_pq_mem; + } + + q->pq_kernel_addr = hl_asic_dma_alloc_coherent(hdev, q->pq_size, &q->pq_dma_addr, + GFP_KERNEL | __GFP_ZERO); + if (!q->pq_kernel_addr) { + rc = -ENOMEM; + goto free_internal_qmans_pq_mem; + } + } + + return 0; + +free_internal_qmans_pq_mem: + gaudi_free_internal_qmans_pq_mem(hdev); + return rc; +} + +static void gaudi_set_pci_memory_regions(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pci_mem_region *region; + + /* CFG */ + region = &hdev->pci_mem_region[PCI_REGION_CFG]; + region->region_base = CFG_BASE; + region->region_size = CFG_SIZE; + region->offset_in_bar = CFG_BASE - SPI_FLASH_BASE_ADDR; + region->bar_size = CFG_BAR_SIZE; + region->bar_id = CFG_BAR_ID; + region->used = 1; + + /* SRAM */ + region = &hdev->pci_mem_region[PCI_REGION_SRAM]; + region->region_base = SRAM_BASE_ADDR; + region->region_size = SRAM_SIZE; + region->offset_in_bar = 0; + region->bar_size = SRAM_BAR_SIZE; + region->bar_id = SRAM_BAR_ID; + region->used = 1; + + /* DRAM */ + region = &hdev->pci_mem_region[PCI_REGION_DRAM]; + region->region_base = DRAM_PHYS_BASE; + region->region_size = hdev->asic_prop.dram_size; + region->offset_in_bar = 0; + region->bar_size = prop->dram_pci_bar_size; + region->bar_id = HBM_BAR_ID; + region->used = 1; + + /* SP SRAM */ + region = &hdev->pci_mem_region[PCI_REGION_SP_SRAM]; + region->region_base = PSOC_SCRATCHPAD_ADDR; + region->region_size = PSOC_SCRATCHPAD_SIZE; + region->offset_in_bar = PSOC_SCRATCHPAD_ADDR - SPI_FLASH_BASE_ADDR; + region->bar_size = CFG_BAR_SIZE; + region->bar_id = CFG_BAR_ID; + region->used = 1; +} + +static int gaudi_sw_init(struct hl_device *hdev) +{ + struct gaudi_device *gaudi; + u32 i, event_id = 0; + int rc; + + /* Allocate device structure */ + gaudi = kzalloc(sizeof(*gaudi), GFP_KERNEL); + if (!gaudi) + return -ENOMEM; + + for (i = 0 ; i < ARRAY_SIZE(gaudi_irq_map_table) ; i++) { + if (gaudi_irq_map_table[i].valid) { + if (event_id == GAUDI_EVENT_SIZE) { + dev_err(hdev->dev, + "Event array exceeds the limit of %u events\n", + GAUDI_EVENT_SIZE); + rc = -EINVAL; + goto free_gaudi_device; + } + + gaudi->events[event_id++] = + gaudi_irq_map_table[i].fc_id; + } + } + + gaudi->cpucp_info_get = gaudi_cpucp_info_get; + + hdev->asic_specific = gaudi; + + /* Create DMA pool for small allocations */ + hdev->dma_pool = dma_pool_create(dev_name(hdev->dev), + &hdev->pdev->dev, GAUDI_DMA_POOL_BLK_SIZE, 8, 0); + if (!hdev->dma_pool) { + dev_err(hdev->dev, "failed to create DMA pool\n"); + rc = -ENOMEM; + goto free_gaudi_device; + } + + rc = gaudi_alloc_cpu_accessible_dma_mem(hdev); + if (rc) + goto free_dma_pool; + + hdev->cpu_accessible_dma_pool = gen_pool_create(ilog2(32), -1); + if (!hdev->cpu_accessible_dma_pool) { + dev_err(hdev->dev, + "Failed to create CPU accessible DMA pool\n"); + rc = -ENOMEM; + goto free_cpu_dma_mem; + } + + rc = gen_pool_add(hdev->cpu_accessible_dma_pool, + (uintptr_t) hdev->cpu_accessible_dma_mem, + HL_CPU_ACCESSIBLE_MEM_SIZE, -1); + if (rc) { + dev_err(hdev->dev, + "Failed to add memory to CPU accessible DMA pool\n"); + rc = -EFAULT; + goto free_cpu_accessible_dma_pool; + } + + rc = gaudi_alloc_internal_qmans_pq_mem(hdev); + if (rc) + goto free_cpu_accessible_dma_pool; + + spin_lock_init(&gaudi->hw_queues_lock); + + hdev->supports_sync_stream = true; + hdev->supports_coresight = true; + hdev->supports_staged_submission = true; + hdev->supports_wait_for_multi_cs = true; + + hdev->asic_funcs->set_pci_memory_regions(hdev); + hdev->stream_master_qid_arr = + hdev->asic_funcs->get_stream_master_qid_arr(); + hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE; + + return 0; + +free_cpu_accessible_dma_pool: + gen_pool_destroy(hdev->cpu_accessible_dma_pool); +free_cpu_dma_mem: + if (!hdev->asic_prop.fw_security_enabled) + GAUDI_CPU_TO_PCI_ADDR(hdev->cpu_accessible_dma_address, + hdev->cpu_pci_msb_addr); + hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, + hdev->cpu_accessible_dma_address); +free_dma_pool: + dma_pool_destroy(hdev->dma_pool); +free_gaudi_device: + kfree(gaudi); + return rc; +} + +static int gaudi_sw_fini(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + gaudi_free_internal_qmans_pq_mem(hdev); + + gen_pool_destroy(hdev->cpu_accessible_dma_pool); + + if (!hdev->asic_prop.fw_security_enabled) + GAUDI_CPU_TO_PCI_ADDR(hdev->cpu_accessible_dma_address, + hdev->cpu_pci_msb_addr); + + hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, + hdev->cpu_accessible_dma_address); + + dma_pool_destroy(hdev->dma_pool); + + kfree(gaudi); + + return 0; +} + +static irqreturn_t gaudi_irq_handler_single(int irq, void *arg) +{ + struct hl_device *hdev = arg; + int i; + + if (hdev->disabled) + return IRQ_HANDLED; + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + hl_irq_handler_cq(irq, &hdev->completion_queue[i]); + + hl_irq_handler_eq(irq, &hdev->event_queue); + + return IRQ_HANDLED; +} + +/* + * For backward compatibility, new MSI interrupts should be set after the + * existing CPU and NIC interrupts. + */ +static int gaudi_pci_irq_vector(struct hl_device *hdev, unsigned int nr, + bool cpu_eq) +{ + int msi_vec; + + if ((nr != GAUDI_EVENT_QUEUE_MSI_IDX) && (cpu_eq)) + dev_crit(hdev->dev, "CPU EQ must use IRQ %d\n", + GAUDI_EVENT_QUEUE_MSI_IDX); + + msi_vec = ((nr < GAUDI_EVENT_QUEUE_MSI_IDX) || (cpu_eq)) ? nr : + (nr + NIC_NUMBER_OF_ENGINES + 1); + + return pci_irq_vector(hdev->pdev, msi_vec); +} + +static int gaudi_enable_msi_single(struct hl_device *hdev) +{ + int rc, irq; + + dev_dbg(hdev->dev, "Working in single MSI IRQ mode\n"); + + irq = gaudi_pci_irq_vector(hdev, 0, false); + rc = request_irq(irq, gaudi_irq_handler_single, 0, + "gaudi single msi", hdev); + if (rc) + dev_err(hdev->dev, + "Failed to request single MSI IRQ\n"); + + return rc; +} + +static int gaudi_enable_msi_multi(struct hl_device *hdev) +{ + int cq_cnt = hdev->asic_prop.completion_queues_count; + int rc, i, irq_cnt_init, irq; + + for (i = 0, irq_cnt_init = 0 ; i < cq_cnt ; i++, irq_cnt_init++) { + irq = gaudi_pci_irq_vector(hdev, i, false); + rc = request_irq(irq, hl_irq_handler_cq, 0, gaudi_irq_name[i], + &hdev->completion_queue[i]); + if (rc) { + dev_err(hdev->dev, "Failed to request IRQ %d", irq); + goto free_irqs; + } + } + + irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, true); + rc = request_irq(irq, hl_irq_handler_eq, 0, gaudi_irq_name[cq_cnt], + &hdev->event_queue); + if (rc) { + dev_err(hdev->dev, "Failed to request IRQ %d", irq); + goto free_irqs; + } + + return 0; + +free_irqs: + for (i = 0 ; i < irq_cnt_init ; i++) + free_irq(gaudi_pci_irq_vector(hdev, i, false), + &hdev->completion_queue[i]); + return rc; +} + +static int gaudi_enable_msi(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int rc; + + if (gaudi->hw_cap_initialized & HW_CAP_MSI) + return 0; + + rc = pci_alloc_irq_vectors(hdev->pdev, 1, 1, PCI_IRQ_MSI); + if (rc < 0) { + dev_err(hdev->dev, "MSI: Failed to enable support %d\n", rc); + return rc; + } + + if (rc < NUMBER_OF_INTERRUPTS) { + gaudi->multi_msi_mode = false; + rc = gaudi_enable_msi_single(hdev); + } else { + gaudi->multi_msi_mode = true; + rc = gaudi_enable_msi_multi(hdev); + } + + if (rc) + goto free_pci_irq_vectors; + + gaudi->hw_cap_initialized |= HW_CAP_MSI; + + return 0; + +free_pci_irq_vectors: + pci_free_irq_vectors(hdev->pdev); + return rc; +} + +static void gaudi_sync_irqs(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int i, cq_cnt = hdev->asic_prop.completion_queues_count; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) + return; + + /* Wait for all pending IRQs to be finished */ + if (gaudi->multi_msi_mode) { + for (i = 0 ; i < cq_cnt ; i++) + synchronize_irq(gaudi_pci_irq_vector(hdev, i, false)); + + synchronize_irq(gaudi_pci_irq_vector(hdev, + GAUDI_EVENT_QUEUE_MSI_IDX, + true)); + } else { + synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); + } +} + +static void gaudi_disable_msi(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int i, irq, cq_cnt = hdev->asic_prop.completion_queues_count; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) + return; + + gaudi_sync_irqs(hdev); + + if (gaudi->multi_msi_mode) { + irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, + true); + free_irq(irq, &hdev->event_queue); + + for (i = 0 ; i < cq_cnt ; i++) { + irq = gaudi_pci_irq_vector(hdev, i, false); + free_irq(irq, &hdev->completion_queue[i]); + } + } else { + free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); + } + + pci_free_irq_vectors(hdev->pdev); + + gaudi->hw_cap_initialized &= ~HW_CAP_MSI; +} + +static void gaudi_init_scrambler_sram(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (hdev->asic_prop.fw_security_enabled) + return; + + if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_SRAM_SCR_EN) + return; + + if (gaudi->hw_cap_initialized & HW_CAP_SRAM_SCRAMBLER) + return; + + WREG32(mmNIF_RTR_CTRL_0_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_1_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_2_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_3_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_4_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_5_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_6_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_7_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_0_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_1_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_2_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_3_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_4_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_5_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_6_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_7_SCRAM_SRAM_EN, + 1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_E_N_DOWN_CH0_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_N_DOWN_CH1_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_S_DOWN_CH0_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_S_DOWN_CH1_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_N_DOWN_CH0_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_N_DOWN_CH1_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_S_DOWN_CH0_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_S_DOWN_CH1_SCRAM_SRAM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_SRAM_EN_VAL_SHIFT); + + gaudi->hw_cap_initialized |= HW_CAP_SRAM_SCRAMBLER; +} + +static void gaudi_init_scrambler_hbm(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (hdev->asic_prop.fw_security_enabled) + return; + + if (hdev->asic_prop.fw_bootfit_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_DRAM_SCR_EN) + return; + + if (gaudi->hw_cap_initialized & HW_CAP_HBM_SCRAMBLER) + return; + + WREG32(mmNIF_RTR_CTRL_0_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_1_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_2_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_3_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_4_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_5_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_6_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_7_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_0_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_1_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_2_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_3_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_4_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_5_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_6_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_7_SCRAM_HBM_EN, + 1 << IF_RTR_CTRL_SCRAM_HBM_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_E_N_DOWN_CH0_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_N_DOWN_CH1_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_S_DOWN_CH0_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_S_DOWN_CH1_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_N_DOWN_CH0_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_N_DOWN_CH1_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_S_DOWN_CH0_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_S_DOWN_CH1_SCRAM_HBM_EN, + 1 << DMA_IF_DOWN_CHX_SCRAM_HBM_EN_VAL_SHIFT); + + gaudi->hw_cap_initialized |= HW_CAP_HBM_SCRAMBLER; +} + +static void gaudi_init_e2e(struct hl_device *hdev) +{ + if (hdev->asic_prop.fw_security_enabled) + return; + + if (hdev->asic_prop.fw_bootfit_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_E2E_CRED_EN) + return; + + WREG32(mmSIF_RTR_CTRL_0_E2E_HBM_WR_SIZE, 247 >> 3); + WREG32(mmSIF_RTR_CTRL_0_E2E_HBM_RD_SIZE, 785 >> 3); + WREG32(mmSIF_RTR_CTRL_0_E2E_PCI_WR_SIZE, 49); + WREG32(mmSIF_RTR_CTRL_0_E2E_PCI_RD_SIZE, 101); + + WREG32(mmSIF_RTR_CTRL_1_E2E_HBM_WR_SIZE, 275 >> 3); + WREG32(mmSIF_RTR_CTRL_1_E2E_HBM_RD_SIZE, 614 >> 3); + WREG32(mmSIF_RTR_CTRL_1_E2E_PCI_WR_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_1_E2E_PCI_RD_SIZE, 39); + + WREG32(mmSIF_RTR_CTRL_2_E2E_HBM_WR_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_2_E2E_HBM_RD_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_2_E2E_PCI_WR_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_2_E2E_PCI_RD_SIZE, 32); + + WREG32(mmSIF_RTR_CTRL_3_E2E_HBM_WR_SIZE, 176 >> 3); + WREG32(mmSIF_RTR_CTRL_3_E2E_HBM_RD_SIZE, 32 >> 3); + WREG32(mmSIF_RTR_CTRL_3_E2E_PCI_WR_SIZE, 19); + WREG32(mmSIF_RTR_CTRL_3_E2E_PCI_RD_SIZE, 32); + + WREG32(mmSIF_RTR_CTRL_4_E2E_HBM_WR_SIZE, 176 >> 3); + WREG32(mmSIF_RTR_CTRL_4_E2E_HBM_RD_SIZE, 32 >> 3); + WREG32(mmSIF_RTR_CTRL_4_E2E_PCI_WR_SIZE, 19); + WREG32(mmSIF_RTR_CTRL_4_E2E_PCI_RD_SIZE, 32); + + WREG32(mmSIF_RTR_CTRL_5_E2E_HBM_WR_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_5_E2E_HBM_RD_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_5_E2E_PCI_WR_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_5_E2E_PCI_RD_SIZE, 32); + + WREG32(mmSIF_RTR_CTRL_6_E2E_HBM_WR_SIZE, 275 >> 3); + WREG32(mmSIF_RTR_CTRL_6_E2E_HBM_RD_SIZE, 614 >> 3); + WREG32(mmSIF_RTR_CTRL_6_E2E_PCI_WR_SIZE, 1); + WREG32(mmSIF_RTR_CTRL_6_E2E_PCI_RD_SIZE, 39); + + WREG32(mmSIF_RTR_CTRL_7_E2E_HBM_WR_SIZE, 297 >> 3); + WREG32(mmSIF_RTR_CTRL_7_E2E_HBM_RD_SIZE, 908 >> 3); + WREG32(mmSIF_RTR_CTRL_7_E2E_PCI_WR_SIZE, 19); + WREG32(mmSIF_RTR_CTRL_7_E2E_PCI_RD_SIZE, 19); + + WREG32(mmNIF_RTR_CTRL_0_E2E_HBM_WR_SIZE, 318 >> 3); + WREG32(mmNIF_RTR_CTRL_0_E2E_HBM_RD_SIZE, 956 >> 3); + WREG32(mmNIF_RTR_CTRL_0_E2E_PCI_WR_SIZE, 79); + WREG32(mmNIF_RTR_CTRL_0_E2E_PCI_RD_SIZE, 163); + + WREG32(mmNIF_RTR_CTRL_1_E2E_HBM_WR_SIZE, 275 >> 3); + WREG32(mmNIF_RTR_CTRL_1_E2E_HBM_RD_SIZE, 614 >> 3); + WREG32(mmNIF_RTR_CTRL_1_E2E_PCI_WR_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_1_E2E_PCI_RD_SIZE, 39); + + WREG32(mmNIF_RTR_CTRL_2_E2E_HBM_WR_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_2_E2E_HBM_RD_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_2_E2E_PCI_WR_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_2_E2E_PCI_RD_SIZE, 32); + + WREG32(mmNIF_RTR_CTRL_3_E2E_HBM_WR_SIZE, 176 >> 3); + WREG32(mmNIF_RTR_CTRL_3_E2E_HBM_RD_SIZE, 32 >> 3); + WREG32(mmNIF_RTR_CTRL_3_E2E_PCI_WR_SIZE, 19); + WREG32(mmNIF_RTR_CTRL_3_E2E_PCI_RD_SIZE, 32); + + WREG32(mmNIF_RTR_CTRL_4_E2E_HBM_WR_SIZE, 176 >> 3); + WREG32(mmNIF_RTR_CTRL_4_E2E_HBM_RD_SIZE, 32 >> 3); + WREG32(mmNIF_RTR_CTRL_4_E2E_PCI_WR_SIZE, 19); + WREG32(mmNIF_RTR_CTRL_4_E2E_PCI_RD_SIZE, 32); + + WREG32(mmNIF_RTR_CTRL_5_E2E_HBM_WR_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_5_E2E_HBM_RD_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_5_E2E_PCI_WR_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_5_E2E_PCI_RD_SIZE, 32); + + WREG32(mmNIF_RTR_CTRL_6_E2E_HBM_WR_SIZE, 275 >> 3); + WREG32(mmNIF_RTR_CTRL_6_E2E_HBM_RD_SIZE, 614 >> 3); + WREG32(mmNIF_RTR_CTRL_6_E2E_PCI_WR_SIZE, 1); + WREG32(mmNIF_RTR_CTRL_6_E2E_PCI_RD_SIZE, 39); + + WREG32(mmNIF_RTR_CTRL_7_E2E_HBM_WR_SIZE, 318 >> 3); + WREG32(mmNIF_RTR_CTRL_7_E2E_HBM_RD_SIZE, 956 >> 3); + WREG32(mmNIF_RTR_CTRL_7_E2E_PCI_WR_SIZE, 79); + WREG32(mmNIF_RTR_CTRL_7_E2E_PCI_RD_SIZE, 79); + + WREG32(mmDMA_IF_E_N_DOWN_CH0_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_E_N_DOWN_CH0_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_E_N_DOWN_CH0_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_E_N_DOWN_CH0_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_E_N_DOWN_CH1_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_E_N_DOWN_CH1_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_E_N_DOWN_CH1_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_E_N_DOWN_CH1_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_E_S_DOWN_CH0_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_E_S_DOWN_CH0_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_E_S_DOWN_CH0_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_E_S_DOWN_CH0_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_E_S_DOWN_CH1_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_E_S_DOWN_CH1_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_E_S_DOWN_CH1_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_E_S_DOWN_CH1_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_W_N_DOWN_CH0_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_W_N_DOWN_CH0_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_W_N_DOWN_CH0_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_W_N_DOWN_CH0_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_W_N_DOWN_CH1_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_W_N_DOWN_CH1_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_W_N_DOWN_CH1_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_W_N_DOWN_CH1_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_W_S_DOWN_CH0_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_W_S_DOWN_CH0_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_W_S_DOWN_CH0_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_W_S_DOWN_CH0_E2E_PCI_RD_SIZE, 338); + + WREG32(mmDMA_IF_W_S_DOWN_CH1_E2E_HBM_WR_SIZE, 344 >> 3); + WREG32(mmDMA_IF_W_S_DOWN_CH1_E2E_HBM_RD_SIZE, 1000 >> 3); + WREG32(mmDMA_IF_W_S_DOWN_CH1_E2E_PCI_WR_SIZE, 162); + WREG32(mmDMA_IF_W_S_DOWN_CH1_E2E_PCI_RD_SIZE, 338); + + WREG32(mmSIF_RTR_CTRL_0_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_0_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_1_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_1_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_2_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_2_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_3_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_3_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_4_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_4_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_5_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_5_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_6_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_6_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmSIF_RTR_CTRL_7_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmSIF_RTR_CTRL_7_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_0_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_0_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_1_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_1_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_2_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_2_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_3_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_3_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_4_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_4_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_5_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_5_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_6_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_6_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmNIF_RTR_CTRL_7_E2E_HBM_EN, + 1 << IF_RTR_CTRL_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmNIF_RTR_CTRL_7_E2E_PCI_EN, + 1 << IF_RTR_CTRL_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_E_N_DOWN_CH0_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_N_DOWN_CH0_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_E_N_DOWN_CH1_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_N_DOWN_CH1_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_E_S_DOWN_CH0_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_S_DOWN_CH0_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_E_S_DOWN_CH1_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_E_S_DOWN_CH1_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_W_N_DOWN_CH0_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_N_DOWN_CH0_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_W_N_DOWN_CH1_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_N_DOWN_CH1_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_W_S_DOWN_CH0_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_S_DOWN_CH0_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); + + WREG32(mmDMA_IF_W_S_DOWN_CH1_E2E_HBM_EN, + 1 << DMA_IF_DOWN_CHX_E2E_HBM_EN_VAL_SHIFT); + WREG32(mmDMA_IF_W_S_DOWN_CH1_E2E_PCI_EN, + 1 << DMA_IF_DOWN_CHX_E2E_PCI_EN_VAL_SHIFT); +} + +static void gaudi_init_hbm_cred(struct hl_device *hdev) +{ + u32 hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd; + + if (hdev->asic_prop.fw_security_enabled) + return; + + if (hdev->asic_prop.fw_bootfit_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_HBM_CRED_EN) + return; + + hbm0_wr = 0x33333333; + hbm0_rd = 0x77777777; + hbm1_wr = 0x55555555; + hbm1_rd = 0xDDDDDDDD; + + WREG32(mmDMA_IF_E_N_HBM0_WR_CRED_CNT, hbm0_wr); + WREG32(mmDMA_IF_E_N_HBM1_WR_CRED_CNT, hbm1_wr); + WREG32(mmDMA_IF_E_N_HBM0_RD_CRED_CNT, hbm0_rd); + WREG32(mmDMA_IF_E_N_HBM1_RD_CRED_CNT, hbm1_rd); + + WREG32(mmDMA_IF_E_S_HBM0_WR_CRED_CNT, hbm0_wr); + WREG32(mmDMA_IF_E_S_HBM1_WR_CRED_CNT, hbm1_wr); + WREG32(mmDMA_IF_E_S_HBM0_RD_CRED_CNT, hbm0_rd); + WREG32(mmDMA_IF_E_S_HBM1_RD_CRED_CNT, hbm1_rd); + + WREG32(mmDMA_IF_W_N_HBM0_WR_CRED_CNT, hbm0_wr); + WREG32(mmDMA_IF_W_N_HBM1_WR_CRED_CNT, hbm1_wr); + WREG32(mmDMA_IF_W_N_HBM0_RD_CRED_CNT, hbm0_rd); + WREG32(mmDMA_IF_W_N_HBM1_RD_CRED_CNT, hbm1_rd); + + WREG32(mmDMA_IF_W_S_HBM0_WR_CRED_CNT, hbm0_wr); + WREG32(mmDMA_IF_W_S_HBM1_WR_CRED_CNT, hbm1_wr); + WREG32(mmDMA_IF_W_S_HBM0_RD_CRED_CNT, hbm0_rd); + WREG32(mmDMA_IF_W_S_HBM1_RD_CRED_CNT, hbm1_rd); + + WREG32(mmDMA_IF_E_N_HBM_CRED_EN_0, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + WREG32(mmDMA_IF_E_S_HBM_CRED_EN_0, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + WREG32(mmDMA_IF_W_N_HBM_CRED_EN_0, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + WREG32(mmDMA_IF_W_S_HBM_CRED_EN_0, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + + WREG32(mmDMA_IF_E_N_HBM_CRED_EN_1, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + WREG32(mmDMA_IF_E_S_HBM_CRED_EN_1, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + WREG32(mmDMA_IF_W_N_HBM_CRED_EN_1, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); + WREG32(mmDMA_IF_W_S_HBM_CRED_EN_1, + (1 << DMA_IF_HBM_CRED_EN_READ_CREDIT_EN_SHIFT) | + (1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT)); +} + +static void gaudi_init_golden_registers(struct hl_device *hdev) +{ + u32 tpc_offset; + int tpc_id, i; + + gaudi_init_e2e(hdev); + gaudi_init_hbm_cred(hdev); + + for (tpc_id = 0, tpc_offset = 0; + tpc_id < TPC_NUMBER_OF_ENGINES; + tpc_id++, tpc_offset += TPC_CFG_OFFSET) { + /* Mask all arithmetic interrupts from TPC */ + WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFE); + /* Set 16 cache lines */ + WREG32_FIELD(TPC0_CFG_MSS_CONFIG, tpc_offset, + ICACHE_FETCH_LINE_NUM, 2); + } + + /* Make sure 1st 128 bytes in SRAM are 0 for Tensor DMA */ + for (i = 0 ; i < 128 ; i += 8) + writeq(0, hdev->pcie_bar[SRAM_BAR_ID] + i); + + WREG32(mmMME0_CTRL_EUS_ROLLUP_CNT_ADD, 3); + WREG32(mmMME1_CTRL_EUS_ROLLUP_CNT_ADD, 3); + WREG32(mmMME2_CTRL_EUS_ROLLUP_CNT_ADD, 3); + WREG32(mmMME3_CTRL_EUS_ROLLUP_CNT_ADD, 3); +} + +static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id, + int qman_id, dma_addr_t qman_pq_addr) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi; + u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi; + u32 q_off, dma_qm_offset; + u32 dma_qm_err_cfg, irq_handler_offset; + + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + + mtr_base_en_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_en_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + mtr_base_ws_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_ws_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + + q_off = dma_qm_offset + qman_id * 4; + + WREG32(mmDMA0_QM_PQ_BASE_LO_0 + q_off, lower_32_bits(qman_pq_addr)); + WREG32(mmDMA0_QM_PQ_BASE_HI_0 + q_off, upper_32_bits(qman_pq_addr)); + + WREG32(mmDMA0_QM_PQ_SIZE_0 + q_off, ilog2(HL_QUEUE_LENGTH)); + WREG32(mmDMA0_QM_PQ_PI_0 + q_off, 0); + WREG32(mmDMA0_QM_PQ_CI_0 + q_off, 0); + + WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, QMAN_LDMA_SIZE_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); + + WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi); + WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi); + WREG32(mmDMA0_QM_CP_MSG_BASE2_ADDR_LO_0 + q_off, mtr_base_ws_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE2_ADDR_HI_0 + q_off, mtr_base_ws_hi); + WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off, so_base_ws_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off, so_base_ws_hi); + + WREG32(mmDMA0_QM_CP_BARRIER_CFG_0 + q_off, 0x100); + + /* The following configuration is needed only once per QMAN */ + if (qman_id == 0) { + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_dma_qm_irq_ctrl); + + /* Configure RAZWI IRQ */ + dma_qm_err_cfg = PCI_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK; + if (hdev->stop_on_err) + dma_qm_err_cfg |= + PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK; + + WREG32(mmDMA0_QM_GLBL_ERR_CFG + dma_qm_offset, dma_qm_err_cfg); + + WREG32(mmDMA0_QM_GLBL_ERR_ADDR_LO + dma_qm_offset, + lower_32_bits(CFG_BASE + irq_handler_offset)); + WREG32(mmDMA0_QM_GLBL_ERR_ADDR_HI + dma_qm_offset, + upper_32_bits(CFG_BASE + irq_handler_offset)); + + WREG32(mmDMA0_QM_GLBL_ERR_WDATA + dma_qm_offset, + gaudi_irq_map_table[GAUDI_EVENT_DMA0_QM].cpu_id + + dma_id); + + WREG32(mmDMA0_QM_ARB_ERR_MSG_EN + dma_qm_offset, + QM_ARB_ERR_MSG_EN_MASK); + + /* Set timeout to maximum */ + WREG32(mmDMA0_QM_ARB_SLV_CHOISE_WDT + dma_qm_offset, GAUDI_ARB_WDT_TIMEOUT); + + WREG32(mmDMA0_QM_GLBL_PROT + dma_qm_offset, + QMAN_EXTERNAL_MAKE_TRUSTED); + + WREG32(mmDMA0_QM_GLBL_CFG1 + dma_qm_offset, 0); + } +} + +static void gaudi_init_dma_core(struct hl_device *hdev, int dma_id) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 dma_err_cfg = 1 << DMA0_CORE_ERR_CFG_ERR_MSG_EN_SHIFT; + u32 dma_offset = dma_id * DMA_CORE_OFFSET; + u32 irq_handler_offset; + + /* Set to maximum possible according to physical size */ + WREG32(mmDMA0_CORE_RD_MAX_OUTSTAND + dma_offset, 0); + WREG32(mmDMA0_CORE_RD_MAX_SIZE + dma_offset, 0); + + /* WA for H/W bug H3-2116 */ + WREG32(mmDMA0_CORE_LBW_MAX_OUTSTAND + dma_offset, 15); + + /* STOP_ON bit implies no completion to operation in case of RAZWI */ + if (hdev->stop_on_err) + dma_err_cfg |= 1 << DMA0_CORE_ERR_CFG_STOP_ON_ERR_SHIFT; + + WREG32(mmDMA0_CORE_ERR_CFG + dma_offset, dma_err_cfg); + + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_dma_core_irq_ctrl); + + WREG32(mmDMA0_CORE_ERRMSG_ADDR_LO + dma_offset, + lower_32_bits(CFG_BASE + irq_handler_offset)); + WREG32(mmDMA0_CORE_ERRMSG_ADDR_HI + dma_offset, + upper_32_bits(CFG_BASE + irq_handler_offset)); + + WREG32(mmDMA0_CORE_ERRMSG_WDATA + dma_offset, + gaudi_irq_map_table[GAUDI_EVENT_DMA0_CORE].cpu_id + dma_id); + WREG32(mmDMA0_CORE_PROT + dma_offset, + 1 << DMA0_CORE_PROT_ERR_VAL_SHIFT); + /* If the channel is secured, it should be in MMU bypass mode */ + WREG32(mmDMA0_CORE_SECURE_PROPS + dma_offset, + 1 << DMA0_CORE_SECURE_PROPS_MMBP_SHIFT); + WREG32(mmDMA0_CORE_CFG_0 + dma_offset, 1 << DMA0_CORE_CFG_0_EN_SHIFT); +} + +static void gaudi_enable_qman(struct hl_device *hdev, int dma_id, + u32 enable_mask) +{ + u32 dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + + WREG32(mmDMA0_QM_GLBL_CFG0 + dma_qm_offset, enable_mask); +} + +static void gaudi_init_pci_dma_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct hl_hw_queue *q; + int i, j, dma_id, cpu_skip, nic_skip, cq_id = 0, q_idx, msi_vec = 0; + + if (gaudi->hw_cap_initialized & HW_CAP_PCI_DMA) + return; + + for (i = 0 ; i < PCI_DMA_NUMBER_OF_CHNLS ; i++) { + dma_id = gaudi_dma_assignment[i]; + /* + * For queues after the CPU Q need to add 1 to get the correct + * queue. In addition, need to add the CPU EQ and NIC IRQs in + * order to get the correct MSI register. + */ + if (dma_id > 1) { + cpu_skip = 1; + nic_skip = NIC_NUMBER_OF_ENGINES; + } else { + cpu_skip = 0; + nic_skip = 0; + } + + for (j = 0 ; j < QMAN_STREAMS ; j++) { + q_idx = 4 * dma_id + j + cpu_skip; + q = &hdev->kernel_queues[q_idx]; + q->cq_id = cq_id++; + q->msi_vec = nic_skip + cpu_skip + msi_vec++; + gaudi_init_pci_dma_qman(hdev, dma_id, j, + q->bus_address); + } + + gaudi_init_dma_core(hdev, dma_id); + + gaudi_enable_qman(hdev, dma_id, PCI_DMA_QMAN_ENABLE); + } + + gaudi->hw_cap_initialized |= HW_CAP_PCI_DMA; +} + +static void gaudi_init_hbm_dma_qman(struct hl_device *hdev, int dma_id, + int qman_id, u64 qman_base_addr) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi; + u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi; + u32 dma_qm_err_cfg, irq_handler_offset; + u32 q_off, dma_qm_offset; + + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + + mtr_base_en_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_en_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + mtr_base_ws_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_ws_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + + q_off = dma_qm_offset + qman_id * 4; + + if (qman_id < 4) { + WREG32(mmDMA0_QM_PQ_BASE_LO_0 + q_off, + lower_32_bits(qman_base_addr)); + WREG32(mmDMA0_QM_PQ_BASE_HI_0 + q_off, + upper_32_bits(qman_base_addr)); + + WREG32(mmDMA0_QM_PQ_SIZE_0 + q_off, ilog2(HBM_DMA_QMAN_LENGTH)); + WREG32(mmDMA0_QM_PQ_PI_0 + q_off, 0); + WREG32(mmDMA0_QM_PQ_CI_0 + q_off, 0); + + WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_CPDMA_SIZE_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_SRC_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_DST_OFFSET); + } else { + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_dma_qm_irq_ctrl); + + WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); + + /* Configure RAZWI IRQ */ + dma_qm_err_cfg = HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK; + if (hdev->stop_on_err) + dma_qm_err_cfg |= + HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK; + + WREG32(mmDMA0_QM_GLBL_ERR_CFG + dma_qm_offset, dma_qm_err_cfg); + + WREG32(mmDMA0_QM_GLBL_ERR_ADDR_LO + dma_qm_offset, + lower_32_bits(CFG_BASE + irq_handler_offset)); + WREG32(mmDMA0_QM_GLBL_ERR_ADDR_HI + dma_qm_offset, + upper_32_bits(CFG_BASE + irq_handler_offset)); + + WREG32(mmDMA0_QM_GLBL_ERR_WDATA + dma_qm_offset, + gaudi_irq_map_table[GAUDI_EVENT_DMA0_QM].cpu_id + + dma_id); + + WREG32(mmDMA0_QM_ARB_ERR_MSG_EN + dma_qm_offset, + QM_ARB_ERR_MSG_EN_MASK); + + /* Set timeout to maximum */ + WREG32(mmDMA0_QM_ARB_SLV_CHOISE_WDT + dma_qm_offset, GAUDI_ARB_WDT_TIMEOUT); + + WREG32(mmDMA0_QM_GLBL_CFG1 + dma_qm_offset, 0); + WREG32(mmDMA0_QM_GLBL_PROT + dma_qm_offset, + QMAN_INTERNAL_MAKE_TRUSTED); + } + + WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi); + WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi); + + /* Configure DMA5 CP_MSG_BASE 2/3 for sync stream collective */ + if (gaudi_dma_assignment[dma_id] == GAUDI_ENGINE_ID_DMA_5) { + WREG32(mmDMA0_QM_CP_MSG_BASE2_ADDR_LO_0 + q_off, + mtr_base_ws_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE2_ADDR_HI_0 + q_off, + mtr_base_ws_hi); + WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off, + so_base_ws_lo); + WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off, + so_base_ws_hi); + } +} + +static void gaudi_init_hbm_dma_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + u64 qman_base_addr; + int i, j, dma_id, internal_q_index; + + if (gaudi->hw_cap_initialized & HW_CAP_HBM_DMA) + return; + + for (i = 0 ; i < HBM_DMA_NUMBER_OF_CHNLS ; i++) { + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_1 + i]; + + for (j = 0 ; j < QMAN_STREAMS ; j++) { + /* + * Add the CPU queue in order to get the correct queue + * number as all internal queue are placed after it + */ + internal_q_index = dma_id * QMAN_STREAMS + j + 1; + + q = &gaudi->internal_qmans[internal_q_index]; + qman_base_addr = (u64) q->pq_dma_addr; + gaudi_init_hbm_dma_qman(hdev, dma_id, j, + qman_base_addr); + } + + /* Initializing lower CP for HBM DMA QMAN */ + gaudi_init_hbm_dma_qman(hdev, dma_id, 4, 0); + + gaudi_init_dma_core(hdev, dma_id); + + gaudi_enable_qman(hdev, dma_id, HBM_DMA_QMAN_ENABLE); + } + + gaudi->hw_cap_initialized |= HW_CAP_HBM_DMA; +} + +static void gaudi_init_mme_qman(struct hl_device *hdev, u32 mme_offset, + int qman_id, u64 qman_base_addr) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 mtr_base_lo, mtr_base_hi; + u32 so_base_lo, so_base_hi; + u32 irq_handler_offset; + u32 q_off, mme_id; + u32 mme_qm_err_cfg; + + mtr_base_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + + q_off = mme_offset + qman_id * 4; + + if (qman_id < 4) { + WREG32(mmMME0_QM_PQ_BASE_LO_0 + q_off, + lower_32_bits(qman_base_addr)); + WREG32(mmMME0_QM_PQ_BASE_HI_0 + q_off, + upper_32_bits(qman_base_addr)); + + WREG32(mmMME0_QM_PQ_SIZE_0 + q_off, ilog2(MME_QMAN_LENGTH)); + WREG32(mmMME0_QM_PQ_PI_0 + q_off, 0); + WREG32(mmMME0_QM_PQ_CI_0 + q_off, 0); + + WREG32(mmMME0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_CPDMA_SIZE_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_SRC_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_DST_OFFSET); + } else { + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_mme_qm_irq_ctrl); + + WREG32(mmMME0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); + + /* Configure RAZWI IRQ */ + mme_id = mme_offset / + (mmMME1_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0) / 2; + + mme_qm_err_cfg = MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK; + if (hdev->stop_on_err) + mme_qm_err_cfg |= + MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK; + + WREG32(mmMME0_QM_GLBL_ERR_CFG + mme_offset, mme_qm_err_cfg); + + WREG32(mmMME0_QM_GLBL_ERR_ADDR_LO + mme_offset, + lower_32_bits(CFG_BASE + irq_handler_offset)); + WREG32(mmMME0_QM_GLBL_ERR_ADDR_HI + mme_offset, + upper_32_bits(CFG_BASE + irq_handler_offset)); + + WREG32(mmMME0_QM_GLBL_ERR_WDATA + mme_offset, + gaudi_irq_map_table[GAUDI_EVENT_MME0_QM].cpu_id + + mme_id); + + WREG32(mmMME0_QM_ARB_ERR_MSG_EN + mme_offset, + QM_ARB_ERR_MSG_EN_MASK); + + /* Set timeout to maximum */ + WREG32(mmMME0_QM_ARB_SLV_CHOISE_WDT + mme_offset, GAUDI_ARB_WDT_TIMEOUT); + + WREG32(mmMME0_QM_GLBL_CFG1 + mme_offset, 0); + WREG32(mmMME0_QM_GLBL_PROT + mme_offset, + QMAN_INTERNAL_MAKE_TRUSTED); + } + + WREG32(mmMME0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_lo); + WREG32(mmMME0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_hi); + WREG32(mmMME0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_lo); + WREG32(mmMME0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_hi); +} + +static void gaudi_init_mme_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + u64 qman_base_addr; + u32 mme_offset; + int i, internal_q_index; + + if (gaudi->hw_cap_initialized & HW_CAP_MME) + return; + + /* + * map GAUDI_QUEUE_ID_MME_0_X to the N_W_MME (mmMME2_QM_BASE) + * and GAUDI_QUEUE_ID_MME_1_X to the S_W_MME (mmMME0_QM_BASE) + */ + + mme_offset = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0; + + for (i = 0 ; i < MME_NUMBER_OF_QMANS ; i++) { + internal_q_index = GAUDI_QUEUE_ID_MME_0_0 + i; + q = &gaudi->internal_qmans[internal_q_index]; + qman_base_addr = (u64) q->pq_dma_addr; + gaudi_init_mme_qman(hdev, mme_offset, (i & 0x3), + qman_base_addr); + if (i == 3) + mme_offset = 0; + } + + /* Initializing lower CP for MME QMANs */ + mme_offset = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0; + gaudi_init_mme_qman(hdev, mme_offset, 4, 0); + gaudi_init_mme_qman(hdev, 0, 4, 0); + + WREG32(mmMME2_QM_GLBL_CFG0, QMAN_MME_ENABLE); + WREG32(mmMME0_QM_GLBL_CFG0, QMAN_MME_ENABLE); + + gaudi->hw_cap_initialized |= HW_CAP_MME; +} + +static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset, + int qman_id, u64 qman_base_addr) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi; + u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi; + u32 tpc_qm_err_cfg, irq_handler_offset; + u32 q_off, tpc_id; + + mtr_base_en_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_en_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + mtr_base_ws_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_ws_lo = lower_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + + q_off = tpc_offset + qman_id * 4; + + tpc_id = tpc_offset / + (mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0); + + if (qman_id < 4) { + WREG32(mmTPC0_QM_PQ_BASE_LO_0 + q_off, + lower_32_bits(qman_base_addr)); + WREG32(mmTPC0_QM_PQ_BASE_HI_0 + q_off, + upper_32_bits(qman_base_addr)); + + WREG32(mmTPC0_QM_PQ_SIZE_0 + q_off, ilog2(TPC_QMAN_LENGTH)); + WREG32(mmTPC0_QM_PQ_PI_0 + q_off, 0); + WREG32(mmTPC0_QM_PQ_CI_0 + q_off, 0); + + WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_CPDMA_SIZE_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_SRC_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_DST_OFFSET); + } else { + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_tpc_qm_irq_ctrl); + + WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); + + /* Configure RAZWI IRQ */ + tpc_qm_err_cfg = TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK; + if (hdev->stop_on_err) + tpc_qm_err_cfg |= + TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK; + + WREG32(mmTPC0_QM_GLBL_ERR_CFG + tpc_offset, tpc_qm_err_cfg); + + WREG32(mmTPC0_QM_GLBL_ERR_ADDR_LO + tpc_offset, + lower_32_bits(CFG_BASE + irq_handler_offset)); + WREG32(mmTPC0_QM_GLBL_ERR_ADDR_HI + tpc_offset, + upper_32_bits(CFG_BASE + irq_handler_offset)); + + WREG32(mmTPC0_QM_GLBL_ERR_WDATA + tpc_offset, + gaudi_irq_map_table[GAUDI_EVENT_TPC0_QM].cpu_id + + tpc_id); + + WREG32(mmTPC0_QM_ARB_ERR_MSG_EN + tpc_offset, + QM_ARB_ERR_MSG_EN_MASK); + + /* Set timeout to maximum */ + WREG32(mmTPC0_QM_ARB_SLV_CHOISE_WDT + tpc_offset, GAUDI_ARB_WDT_TIMEOUT); + + WREG32(mmTPC0_QM_GLBL_CFG1 + tpc_offset, 0); + WREG32(mmTPC0_QM_GLBL_PROT + tpc_offset, + QMAN_INTERNAL_MAKE_TRUSTED); + } + + WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi); + WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi); + + /* Configure TPC7 CP_MSG_BASE 2/3 for sync stream collective */ + if (tpc_id == 6) { + WREG32(mmTPC0_QM_CP_MSG_BASE2_ADDR_LO_0 + q_off, + mtr_base_ws_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE2_ADDR_HI_0 + q_off, + mtr_base_ws_hi); + WREG32(mmTPC0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off, + so_base_ws_lo); + WREG32(mmTPC0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off, + so_base_ws_hi); + } +} + +static void gaudi_init_tpc_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + u64 qman_base_addr; + u32 so_base_hi, tpc_offset = 0; + u32 tpc_delta = mmTPC1_CFG_SM_BASE_ADDRESS_HIGH - + mmTPC0_CFG_SM_BASE_ADDRESS_HIGH; + int i, tpc_id, internal_q_index; + + if (gaudi->hw_cap_initialized & HW_CAP_TPC_MASK) + return; + + so_base_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + + for (tpc_id = 0 ; tpc_id < TPC_NUMBER_OF_ENGINES ; tpc_id++) { + for (i = 0 ; i < QMAN_STREAMS ; i++) { + internal_q_index = GAUDI_QUEUE_ID_TPC_0_0 + + tpc_id * QMAN_STREAMS + i; + q = &gaudi->internal_qmans[internal_q_index]; + qman_base_addr = (u64) q->pq_dma_addr; + gaudi_init_tpc_qman(hdev, tpc_offset, i, + qman_base_addr); + + if (i == 3) { + /* Initializing lower CP for TPC QMAN */ + gaudi_init_tpc_qman(hdev, tpc_offset, 4, 0); + + /* Enable the QMAN and TPC channel */ + WREG32(mmTPC0_QM_GLBL_CFG0 + tpc_offset, + QMAN_TPC_ENABLE); + } + } + + WREG32(mmTPC0_CFG_SM_BASE_ADDRESS_HIGH + tpc_id * tpc_delta, + so_base_hi); + + tpc_offset += mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0; + + gaudi->hw_cap_initialized |= + FIELD_PREP(HW_CAP_TPC_MASK, 1 << tpc_id); + } +} + +static void gaudi_init_nic_qman(struct hl_device *hdev, u32 nic_offset, + int qman_id, u64 qman_base_addr, int nic_id) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 mtr_base_en_lo, mtr_base_en_hi, mtr_base_ws_lo, mtr_base_ws_hi; + u32 so_base_en_lo, so_base_en_hi, so_base_ws_lo, so_base_ws_hi; + u32 nic_qm_err_cfg, irq_handler_offset; + u32 q_off; + + mtr_base_en_lo = lower_32_bits((CFG_BASE & U32_MAX) + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_en_lo = lower_32_bits((CFG_BASE & U32_MAX) + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_en_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0); + mtr_base_ws_lo = lower_32_bits((CFG_BASE & U32_MAX) + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + mtr_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0); + so_base_ws_lo = lower_32_bits((CFG_BASE & U32_MAX) + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + so_base_ws_hi = upper_32_bits(CFG_BASE + + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0); + + q_off = nic_offset + qman_id * 4; + + WREG32(mmNIC0_QM0_PQ_BASE_LO_0 + q_off, lower_32_bits(qman_base_addr)); + WREG32(mmNIC0_QM0_PQ_BASE_HI_0 + q_off, upper_32_bits(qman_base_addr)); + + WREG32(mmNIC0_QM0_PQ_SIZE_0 + q_off, ilog2(NIC_QMAN_LENGTH)); + WREG32(mmNIC0_QM0_PQ_PI_0 + q_off, 0); + WREG32(mmNIC0_QM0_PQ_CI_0 + q_off, 0); + + WREG32(mmNIC0_QM0_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmNIC0_QM0_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmNIC0_QM0_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); + + WREG32(mmNIC0_QM0_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo); + WREG32(mmNIC0_QM0_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi); + WREG32(mmNIC0_QM0_CP_MSG_BASE1_ADDR_LO_0 + q_off, so_base_en_lo); + WREG32(mmNIC0_QM0_CP_MSG_BASE1_ADDR_HI_0 + q_off, so_base_en_hi); + + /* Configure NIC CP_MSG_BASE 2/3 for sync stream collective */ + WREG32(mmNIC0_QM0_CP_MSG_BASE2_ADDR_LO_0 + q_off, mtr_base_ws_lo); + WREG32(mmNIC0_QM0_CP_MSG_BASE2_ADDR_HI_0 + q_off, mtr_base_ws_hi); + WREG32(mmNIC0_QM0_CP_MSG_BASE3_ADDR_LO_0 + q_off, so_base_ws_lo); + WREG32(mmNIC0_QM0_CP_MSG_BASE3_ADDR_HI_0 + q_off, so_base_ws_hi); + + if (qman_id == 0) { + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_nic_qm_irq_ctrl); + + /* Configure RAZWI IRQ */ + nic_qm_err_cfg = NIC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK; + if (hdev->stop_on_err) + nic_qm_err_cfg |= + NIC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK; + + WREG32(mmNIC0_QM0_GLBL_ERR_CFG + nic_offset, nic_qm_err_cfg); + + WREG32(mmNIC0_QM0_GLBL_ERR_ADDR_LO + nic_offset, + lower_32_bits(CFG_BASE + irq_handler_offset)); + WREG32(mmNIC0_QM0_GLBL_ERR_ADDR_HI + nic_offset, + upper_32_bits(CFG_BASE + irq_handler_offset)); + + WREG32(mmNIC0_QM0_GLBL_ERR_WDATA + nic_offset, + gaudi_irq_map_table[GAUDI_EVENT_NIC0_QM0].cpu_id + + nic_id); + + WREG32(mmNIC0_QM0_ARB_ERR_MSG_EN + nic_offset, + QM_ARB_ERR_MSG_EN_MASK); + + /* Set timeout to maximum */ + WREG32(mmNIC0_QM0_ARB_SLV_CHOISE_WDT + nic_offset, GAUDI_ARB_WDT_TIMEOUT); + + WREG32(mmNIC0_QM0_GLBL_CFG1 + nic_offset, 0); + WREG32(mmNIC0_QM0_GLBL_PROT + nic_offset, + QMAN_INTERNAL_MAKE_TRUSTED); + } +} + +static void gaudi_init_nic_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + u64 qman_base_addr; + u32 nic_offset = 0; + u32 nic_delta_between_qmans = + mmNIC0_QM1_GLBL_CFG0 - mmNIC0_QM0_GLBL_CFG0; + u32 nic_delta_between_nics = + mmNIC1_QM0_GLBL_CFG0 - mmNIC0_QM0_GLBL_CFG0; + int i, nic_id, internal_q_index; + + if (!hdev->nic_ports_mask) + return; + + if (gaudi->hw_cap_initialized & HW_CAP_NIC_MASK) + return; + + dev_dbg(hdev->dev, "Initializing NIC QMANs\n"); + + for (nic_id = 0 ; nic_id < NIC_NUMBER_OF_ENGINES ; nic_id++) { + if (!(hdev->nic_ports_mask & (1 << nic_id))) { + nic_offset += nic_delta_between_qmans; + if (nic_id & 1) { + nic_offset -= (nic_delta_between_qmans * 2); + nic_offset += nic_delta_between_nics; + } + continue; + } + + for (i = 0 ; i < QMAN_STREAMS ; i++) { + internal_q_index = GAUDI_QUEUE_ID_NIC_0_0 + + nic_id * QMAN_STREAMS + i; + q = &gaudi->internal_qmans[internal_q_index]; + qman_base_addr = (u64) q->pq_dma_addr; + gaudi_init_nic_qman(hdev, nic_offset, (i & 0x3), + qman_base_addr, nic_id); + } + + /* Enable the QMAN */ + WREG32(mmNIC0_QM0_GLBL_CFG0 + nic_offset, NIC_QMAN_ENABLE); + + nic_offset += nic_delta_between_qmans; + if (nic_id & 1) { + nic_offset -= (nic_delta_between_qmans * 2); + nic_offset += nic_delta_between_nics; + } + + gaudi->hw_cap_initialized |= 1 << (HW_CAP_NIC_SHIFT + nic_id); + } +} + +static void gaudi_disable_pci_dma_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_PCI_DMA)) + return; + + WREG32(mmDMA0_QM_GLBL_CFG0, 0); + WREG32(mmDMA1_QM_GLBL_CFG0, 0); + WREG32(mmDMA5_QM_GLBL_CFG0, 0); +} + +static void gaudi_disable_hbm_dma_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_HBM_DMA)) + return; + + WREG32(mmDMA2_QM_GLBL_CFG0, 0); + WREG32(mmDMA3_QM_GLBL_CFG0, 0); + WREG32(mmDMA4_QM_GLBL_CFG0, 0); + WREG32(mmDMA6_QM_GLBL_CFG0, 0); + WREG32(mmDMA7_QM_GLBL_CFG0, 0); +} + +static void gaudi_disable_mme_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MME)) + return; + + WREG32(mmMME2_QM_GLBL_CFG0, 0); + WREG32(mmMME0_QM_GLBL_CFG0, 0); +} + +static void gaudi_disable_tpc_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + u32 tpc_offset = 0; + int tpc_id; + + if (!(gaudi->hw_cap_initialized & HW_CAP_TPC_MASK)) + return; + + for (tpc_id = 0 ; tpc_id < TPC_NUMBER_OF_ENGINES ; tpc_id++) { + WREG32(mmTPC0_QM_GLBL_CFG0 + tpc_offset, 0); + tpc_offset += mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0; + } +} + +static void gaudi_disable_nic_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + u32 nic_mask, nic_offset = 0; + u32 nic_delta_between_qmans = + mmNIC0_QM1_GLBL_CFG0 - mmNIC0_QM0_GLBL_CFG0; + u32 nic_delta_between_nics = + mmNIC1_QM0_GLBL_CFG0 - mmNIC0_QM0_GLBL_CFG0; + int nic_id; + + for (nic_id = 0 ; nic_id < NIC_NUMBER_OF_ENGINES ; nic_id++) { + nic_mask = 1 << (HW_CAP_NIC_SHIFT + nic_id); + + if (gaudi->hw_cap_initialized & nic_mask) + WREG32(mmNIC0_QM0_GLBL_CFG0 + nic_offset, 0); + + nic_offset += nic_delta_between_qmans; + if (nic_id & 1) { + nic_offset -= (nic_delta_between_qmans * 2); + nic_offset += nic_delta_between_nics; + } + } +} + +static void gaudi_stop_pci_dma_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_PCI_DMA)) + return; + + /* Stop upper CPs of QMANs 0.0 to 1.3 and 5.0 to 5.3 */ + WREG32(mmDMA0_QM_GLBL_CFG1, 0xF << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmDMA1_QM_GLBL_CFG1, 0xF << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmDMA5_QM_GLBL_CFG1, 0xF << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); +} + +static void gaudi_stop_hbm_dma_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_HBM_DMA)) + return; + + /* Stop CPs of HBM DMA QMANs */ + + WREG32(mmDMA2_QM_GLBL_CFG1, 0x1F << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmDMA3_QM_GLBL_CFG1, 0x1F << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmDMA4_QM_GLBL_CFG1, 0x1F << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmDMA6_QM_GLBL_CFG1, 0x1F << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmDMA7_QM_GLBL_CFG1, 0x1F << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); +} + +static void gaudi_stop_mme_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MME)) + return; + + /* Stop CPs of MME QMANs */ + WREG32(mmMME2_QM_GLBL_CFG1, 0x1F << MME0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmMME0_QM_GLBL_CFG1, 0x1F << MME0_QM_GLBL_CFG1_CP_STOP_SHIFT); +} + +static void gaudi_stop_tpc_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_TPC_MASK)) + return; + + WREG32(mmTPC0_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC1_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC2_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC3_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC4_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC5_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC6_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); + WREG32(mmTPC7_QM_GLBL_CFG1, 0x1F << TPC0_QM_GLBL_CFG1_CP_STOP_SHIFT); +} + +static void gaudi_stop_nic_qmans(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + /* Stop upper CPs of QMANs */ + + if (gaudi->hw_cap_initialized & HW_CAP_NIC0) + WREG32(mmNIC0_QM0_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC1) + WREG32(mmNIC0_QM1_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC2) + WREG32(mmNIC1_QM0_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC3) + WREG32(mmNIC1_QM1_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC4) + WREG32(mmNIC2_QM0_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC5) + WREG32(mmNIC2_QM1_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC6) + WREG32(mmNIC3_QM0_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC7) + WREG32(mmNIC3_QM1_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC8) + WREG32(mmNIC4_QM0_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC9) + WREG32(mmNIC4_QM1_GLBL_CFG1, + NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK | + NIC0_QM0_GLBL_CFG1_CP_STOP_MASK); +} + +static void gaudi_pci_dma_stall(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_PCI_DMA)) + return; + + WREG32(mmDMA0_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); + WREG32(mmDMA1_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); + WREG32(mmDMA5_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); +} + +static void gaudi_hbm_dma_stall(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_HBM_DMA)) + return; + + WREG32(mmDMA2_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); + WREG32(mmDMA3_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); + WREG32(mmDMA4_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); + WREG32(mmDMA6_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); + WREG32(mmDMA7_CORE_CFG_1, 1 << DMA0_CORE_CFG_1_HALT_SHIFT); +} + +static void gaudi_mme_stall(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MME)) + return; + + /* WA for H3-1800 bug: do ACC and SBAB writes twice */ + WREG32(mmMME0_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME0_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME0_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME0_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME1_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME1_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME1_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME1_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME2_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME2_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME2_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME2_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME3_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME3_ACC_ACC_STALL, 1 << MME_ACC_ACC_STALL_R_SHIFT); + WREG32(mmMME3_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); + WREG32(mmMME3_SBAB_SB_STALL, 1 << MME_SBAB_SB_STALL_R_SHIFT); +} + +static void gaudi_tpc_stall(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_TPC_MASK)) + return; + + WREG32(mmTPC0_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC1_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC2_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC3_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC4_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC5_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC6_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); + WREG32(mmTPC7_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); +} + +static void gaudi_disable_clock_gating(struct hl_device *hdev) +{ + u32 qman_offset; + int i; + + if (hdev->asic_prop.fw_security_enabled) + return; + + for (i = 0, qman_offset = 0 ; i < DMA_NUMBER_OF_CHANNELS ; i++) { + WREG32(mmDMA0_QM_CGM_CFG + qman_offset, 0); + WREG32(mmDMA0_QM_CGM_CFG1 + qman_offset, 0); + + qman_offset += (mmDMA1_QM_CGM_CFG - mmDMA0_QM_CGM_CFG); + } + + WREG32(mmMME0_QM_CGM_CFG, 0); + WREG32(mmMME0_QM_CGM_CFG1, 0); + WREG32(mmMME2_QM_CGM_CFG, 0); + WREG32(mmMME2_QM_CGM_CFG1, 0); + + for (i = 0, qman_offset = 0 ; i < TPC_NUMBER_OF_ENGINES ; i++) { + WREG32(mmTPC0_QM_CGM_CFG + qman_offset, 0); + WREG32(mmTPC0_QM_CGM_CFG1 + qman_offset, 0); + + qman_offset += (mmTPC1_QM_CGM_CFG - mmTPC0_QM_CGM_CFG); + } +} + +static void gaudi_enable_timestamp(struct hl_device *hdev) +{ + /* Disable the timestamp counter */ + WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0); + + /* Zero the lower/upper parts of the 64-bit counter */ + WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE + 0xC, 0); + WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE + 0x8, 0); + + /* Enable the counter */ + WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 1); +} + +static void gaudi_disable_timestamp(struct hl_device *hdev) +{ + /* Disable the timestamp counter */ + WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0); +} + +static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset) +{ + u32 wait_timeout_ms; + + if (hdev->pldm) + wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; + else + wait_timeout_ms = GAUDI_RESET_WAIT_MSEC; + + if (fw_reset) + goto skip_engines; + + gaudi_stop_nic_qmans(hdev); + gaudi_stop_mme_qmans(hdev); + gaudi_stop_tpc_qmans(hdev); + gaudi_stop_hbm_dma_qmans(hdev); + gaudi_stop_pci_dma_qmans(hdev); + + msleep(wait_timeout_ms); + + gaudi_pci_dma_stall(hdev); + gaudi_hbm_dma_stall(hdev); + gaudi_tpc_stall(hdev); + gaudi_mme_stall(hdev); + + msleep(wait_timeout_ms); + + gaudi_disable_nic_qmans(hdev); + gaudi_disable_mme_qmans(hdev); + gaudi_disable_tpc_qmans(hdev); + gaudi_disable_hbm_dma_qmans(hdev); + gaudi_disable_pci_dma_qmans(hdev); + + gaudi_disable_timestamp(hdev); + +skip_engines: + gaudi_disable_msi(hdev); +} + +static int gaudi_mmu_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct gaudi_device *gaudi = hdev->asic_specific; + u64 hop0_addr; + int rc, i; + + if (!hdev->mmu_enable) + return 0; + + if (gaudi->hw_cap_initialized & HW_CAP_MMU) + return 0; + + for (i = 0 ; i < prop->max_asid ; i++) { + hop0_addr = prop->mmu_pgt_addr + + (i * prop->mmu_hop_table_size); + + rc = gaudi_mmu_update_asid_hop0_addr(hdev, i, hop0_addr); + if (rc) { + dev_err(hdev->dev, + "failed to set hop0 addr for asid %d\n", i); + goto err; + } + } + + /* init MMU cache manage page */ + WREG32(mmSTLB_CACHE_INV_BASE_39_8, prop->mmu_cache_mng_addr >> 8); + WREG32(mmSTLB_CACHE_INV_BASE_49_40, prop->mmu_cache_mng_addr >> 40); + + /* mem cache invalidation */ + WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1); + + hl_mmu_invalidate_cache(hdev, true, 0); + + WREG32(mmMMU_UP_MMU_ENABLE, 1); + WREG32(mmMMU_UP_SPI_MASK, 0xF); + + WREG32(mmSTLB_HOP_CONFIGURATION, 0x30440); + + /* + * The H/W expects the first PI after init to be 1. After wraparound + * we'll write 0. + */ + gaudi->mmu_cache_inv_pi = 1; + + gaudi->hw_cap_initialized |= HW_CAP_MMU; + + return 0; + +err: + return rc; +} + +static int gaudi_load_firmware_to_device(struct hl_device *hdev) +{ + void __iomem *dst; + + dst = hdev->pcie_bar[HBM_BAR_ID] + LINUX_FW_OFFSET; + + return hl_fw_load_fw_to_device(hdev, GAUDI_LINUX_FW_FILE, dst, 0, 0); +} + +static int gaudi_load_boot_fit_to_device(struct hl_device *hdev) +{ + void __iomem *dst; + + dst = hdev->pcie_bar[SRAM_BAR_ID] + BOOT_FIT_SRAM_OFFSET; + + return hl_fw_load_fw_to_device(hdev, GAUDI_BOOT_FIT_FILE, dst, 0, 0); +} + +static void gaudi_init_dynamic_firmware_loader(struct hl_device *hdev) +{ + struct dynamic_fw_load_mgr *dynamic_loader; + struct cpu_dyn_regs *dyn_regs; + + dynamic_loader = &hdev->fw_loader.dynamic_loader; + + /* + * here we update initial values for few specific dynamic regs (as + * before reading the first descriptor from FW those value has to be + * hard-coded) in later stages of the protocol those values will be + * updated automatically by reading the FW descriptor so data there + * will always be up-to-date + */ + dyn_regs = &dynamic_loader->comm_desc.cpu_dyn_regs; + dyn_regs->kmd_msg_to_cpu = + cpu_to_le32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU); + dyn_regs->cpu_cmd_status_to_host = + cpu_to_le32(mmCPU_CMD_STATUS_TO_HOST); + + dynamic_loader->wait_for_bl_timeout = GAUDI_WAIT_FOR_BL_TIMEOUT_USEC; +} + +static void gaudi_init_static_firmware_loader(struct hl_device *hdev) +{ + struct static_fw_load_mgr *static_loader; + + static_loader = &hdev->fw_loader.static_loader; + + static_loader->preboot_version_max_off = SRAM_SIZE - VERSION_MAX_LEN; + static_loader->boot_fit_version_max_off = SRAM_SIZE - VERSION_MAX_LEN; + static_loader->kmd_msg_to_cpu_reg = mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU; + static_loader->cpu_cmd_status_to_host_reg = mmCPU_CMD_STATUS_TO_HOST; + static_loader->cpu_boot_status_reg = mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS; + static_loader->cpu_boot_dev_status0_reg = mmCPU_BOOT_DEV_STS0; + static_loader->cpu_boot_dev_status1_reg = mmCPU_BOOT_DEV_STS1; + static_loader->boot_err0_reg = mmCPU_BOOT_ERR0; + static_loader->boot_err1_reg = mmCPU_BOOT_ERR1; + static_loader->preboot_version_offset_reg = mmPREBOOT_VER_OFFSET; + static_loader->boot_fit_version_offset_reg = mmUBOOT_VER_OFFSET; + static_loader->sram_offset_mask = ~(lower_32_bits(SRAM_BASE_ADDR)); + static_loader->cpu_reset_wait_msec = hdev->pldm ? + GAUDI_PLDM_RESET_WAIT_MSEC : + GAUDI_CPU_RESET_WAIT_MSEC; +} + +static void gaudi_init_firmware_preload_params(struct hl_device *hdev) +{ + struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load; + + pre_fw_load->cpu_boot_status_reg = mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS; + pre_fw_load->sts_boot_dev_sts0_reg = mmCPU_BOOT_DEV_STS0; + pre_fw_load->sts_boot_dev_sts1_reg = mmCPU_BOOT_DEV_STS1; + pre_fw_load->boot_err0_reg = mmCPU_BOOT_ERR0; + pre_fw_load->boot_err1_reg = mmCPU_BOOT_ERR1; + pre_fw_load->wait_for_preboot_timeout = GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC; +} + +static void gaudi_init_firmware_loader(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct fw_load_mgr *fw_loader = &hdev->fw_loader; + + /* fill common fields */ + fw_loader->fw_comp_loaded = FW_TYPE_NONE; + fw_loader->boot_fit_img.image_name = GAUDI_BOOT_FIT_FILE; + fw_loader->linux_img.image_name = GAUDI_LINUX_FW_FILE; + fw_loader->cpu_timeout = GAUDI_CPU_TIMEOUT_USEC; + fw_loader->boot_fit_timeout = GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC; + fw_loader->skip_bmc = !hdev->bmc_enable; + fw_loader->sram_bar_id = SRAM_BAR_ID; + fw_loader->dram_bar_id = HBM_BAR_ID; + + if (prop->dynamic_fw_load) + gaudi_init_dynamic_firmware_loader(hdev); + else + gaudi_init_static_firmware_loader(hdev); +} + +static int gaudi_init_cpu(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int rc; + + if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU)) + return 0; + + if (gaudi->hw_cap_initialized & HW_CAP_CPU) + return 0; + + /* + * The device CPU works with 40 bits addresses. + * This register sets the extension to 50 bits. + */ + if (!hdev->asic_prop.fw_security_enabled) + WREG32(mmCPU_IF_CPU_MSB_ADDR, hdev->cpu_pci_msb_addr); + + rc = hl_fw_init_cpu(hdev); + + if (rc) + return rc; + + gaudi->hw_cap_initialized |= HW_CAP_CPU; + + return 0; +} + +static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct gaudi_device *gaudi = hdev->asic_specific; + u32 status, irq_handler_offset; + struct hl_eq *eq; + struct hl_hw_queue *cpu_pq = + &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ]; + int err; + + if (!hdev->cpu_queues_enable) + return 0; + + if (gaudi->hw_cap_initialized & HW_CAP_CPU_Q) + return 0; + + eq = &hdev->event_queue; + + WREG32(mmCPU_IF_PQ_BASE_ADDR_LOW, lower_32_bits(cpu_pq->bus_address)); + WREG32(mmCPU_IF_PQ_BASE_ADDR_HIGH, upper_32_bits(cpu_pq->bus_address)); + + WREG32(mmCPU_IF_EQ_BASE_ADDR_LOW, lower_32_bits(eq->bus_address)); + WREG32(mmCPU_IF_EQ_BASE_ADDR_HIGH, upper_32_bits(eq->bus_address)); + + WREG32(mmCPU_IF_CQ_BASE_ADDR_LOW, + lower_32_bits(hdev->cpu_accessible_dma_address)); + WREG32(mmCPU_IF_CQ_BASE_ADDR_HIGH, + upper_32_bits(hdev->cpu_accessible_dma_address)); + + WREG32(mmCPU_IF_PQ_LENGTH, HL_QUEUE_SIZE_IN_BYTES); + WREG32(mmCPU_IF_EQ_LENGTH, HL_EQ_SIZE_IN_BYTES); + WREG32(mmCPU_IF_CQ_LENGTH, HL_CPU_ACCESSIBLE_MEM_SIZE); + + /* Used for EQ CI */ + WREG32(mmCPU_IF_EQ_RD_OFFS, 0); + + WREG32(mmCPU_IF_PF_PQ_PI, 0); + + if (gaudi->multi_msi_mode) + WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP); + else + WREG32(mmCPU_IF_QUEUE_INIT, + PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); + + irq_handler_offset = prop->gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_host_pi_upd_irq); + + WREG32(irq_handler_offset, + gaudi_irq_map_table[GAUDI_EVENT_PI_UPDATE].cpu_id); + + err = hl_poll_timeout( + hdev, + mmCPU_IF_QUEUE_INIT, + status, + (status == PQ_INIT_STATUS_READY_FOR_HOST), + 1000, + cpu_timeout); + + if (err) { + dev_err(hdev->dev, + "Failed to communicate with Device CPU (CPU-CP timeout)\n"); + return -EIO; + } + + /* update FW application security bits */ + if (prop->fw_cpu_boot_dev_sts0_valid) + prop->fw_app_cpu_boot_dev_sts0 = RREG32(mmCPU_BOOT_DEV_STS0); + if (prop->fw_cpu_boot_dev_sts1_valid) + prop->fw_app_cpu_boot_dev_sts1 = RREG32(mmCPU_BOOT_DEV_STS1); + + gaudi->hw_cap_initialized |= HW_CAP_CPU_Q; + return 0; +} + +static void gaudi_pre_hw_init(struct hl_device *hdev) +{ + /* Perform read from the device to make sure device is up */ + RREG32(mmHW_STATE); + + if (!hdev->asic_prop.fw_security_enabled) { + /* Set the access through PCI bars (Linux driver only) as + * secured + */ + WREG32(mmPCIE_WRAP_LBW_PROT_OVR, + (PCIE_WRAP_LBW_PROT_OVR_RD_EN_MASK | + PCIE_WRAP_LBW_PROT_OVR_WR_EN_MASK)); + + /* Perform read to flush the waiting writes to ensure + * configuration was set in the device + */ + RREG32(mmPCIE_WRAP_LBW_PROT_OVR); + } + + /* + * Let's mark in the H/W that we have reached this point. We check + * this value in the reset_before_init function to understand whether + * we need to reset the chip before doing H/W init. This register is + * cleared by the H/W upon H/W reset + */ + WREG32(mmHW_STATE, HL_DEVICE_HW_STATE_DIRTY); +} + +static int gaudi_hw_init(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int rc; + + gaudi_pre_hw_init(hdev); + + /* If iATU is done by FW, the HBM bar ALWAYS points to DRAM_PHYS_BASE. + * So we set it here and if anyone tries to move it later to + * a different address, there will be an error + */ + if (hdev->asic_prop.iatu_done_by_fw) + gaudi->hbm_bar_cur_addr = DRAM_PHYS_BASE; + + /* + * Before pushing u-boot/linux to device, need to set the hbm bar to + * base address of dram + */ + if (gaudi_set_hbm_bar_base(hdev, DRAM_PHYS_BASE) == U64_MAX) { + dev_err(hdev->dev, + "failed to map HBM bar to DRAM base address\n"); + return -EIO; + } + + rc = gaudi_init_cpu(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize CPU\n"); + return rc; + } + + /* In case the clock gating was enabled in preboot we need to disable + * it here before touching the MME/TPC registers. + */ + gaudi_disable_clock_gating(hdev); + + /* SRAM scrambler must be initialized after CPU is running from HBM */ + gaudi_init_scrambler_sram(hdev); + + /* This is here just in case we are working without CPU */ + gaudi_init_scrambler_hbm(hdev); + + gaudi_init_golden_registers(hdev); + + rc = gaudi_mmu_init(hdev); + if (rc) + return rc; + + gaudi_init_security(hdev); + + gaudi_init_pci_dma_qmans(hdev); + + gaudi_init_hbm_dma_qmans(hdev); + + gaudi_init_mme_qmans(hdev); + + gaudi_init_tpc_qmans(hdev); + + gaudi_init_nic_qmans(hdev); + + gaudi_enable_timestamp(hdev); + + /* MSI must be enabled before CPU queues and NIC are initialized */ + rc = gaudi_enable_msi(hdev); + if (rc) + goto disable_queues; + + /* must be called after MSI was enabled */ + rc = gaudi_init_cpu_queues(hdev, GAUDI_CPU_TIMEOUT_USEC); + if (rc) { + dev_err(hdev->dev, "failed to initialize CPU H/W queues %d\n", + rc); + goto disable_msi; + } + + /* Perform read from the device to flush all configuration */ + RREG32(mmHW_STATE); + + return 0; + +disable_msi: + gaudi_disable_msi(hdev); +disable_queues: + gaudi_disable_mme_qmans(hdev); + gaudi_disable_pci_dma_qmans(hdev); + + return rc; +} + +static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 status, reset_timeout_ms, cpu_timeout_ms, irq_handler_offset; + struct gaudi_device *gaudi = hdev->asic_specific; + bool driver_performs_reset; + + if (!hard_reset) { + dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n"); + return; + } + + if (hdev->pldm) { + reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; + cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; + } else { + reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC; + cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; + } + + if (fw_reset) { + dev_dbg(hdev->dev, + "Firmware performs HARD reset, going to wait %dms\n", + reset_timeout_ms); + + goto skip_reset; + } + + driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled && + !hdev->asic_prop.hard_reset_done_by_fw); + + /* Set device to handle FLR by H/W as we will put the device CPU to + * halt mode + */ + if (driver_performs_reset) + WREG32(mmPCIE_AUX_FLR_CTRL, (PCIE_AUX_FLR_CTRL_HW_CTRL_MASK | + PCIE_AUX_FLR_CTRL_INT_MASK_MASK)); + + /* If linux is loaded in the device CPU we need to communicate with it + * via the GIC. Otherwise, we need to use COMMS or the MSG_TO_CPU + * registers in case of old F/Ws + */ + if (hdev->fw_loader.fw_comp_loaded & FW_TYPE_LINUX) { + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_host_halt_irq); + + WREG32(irq_handler_offset, + gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id); + + /* This is a hail-mary attempt to revive the card in the small chance that the + * f/w has experienced a watchdog event, which caused it to return back to preboot. + * In that case, triggering reset through GIC won't help. We need to trigger the + * reset as if Linux wasn't loaded. + * + * We do it only if the reset cause was HB, because that would be the indication + * of such an event. + * + * In case watchdog hasn't expired but we still got HB, then this won't do any + * damage. + */ + if (hdev->reset_info.curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) { + if (hdev->asic_prop.hard_reset_done_by_fw) + hl_fw_ask_hard_reset_without_linux(hdev); + else + hl_fw_ask_halt_machine_without_linux(hdev); + } + } else { + if (hdev->asic_prop.hard_reset_done_by_fw) + hl_fw_ask_hard_reset_without_linux(hdev); + else + hl_fw_ask_halt_machine_without_linux(hdev); + } + + if (driver_performs_reset) { + + /* Configure the reset registers. Must be done as early as + * possible in case we fail during H/W initialization + */ + WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST_CFG_H, + (CFG_RST_H_DMA_MASK | + CFG_RST_H_MME_MASK | + CFG_RST_H_SM_MASK | + CFG_RST_H_TPC_7_MASK)); + + WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST_CFG_L, CFG_RST_L_TPC_MASK); + + WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST_CFG_H, + (CFG_RST_H_HBM_MASK | + CFG_RST_H_TPC_7_MASK | + CFG_RST_H_NIC_MASK | + CFG_RST_H_SM_MASK | + CFG_RST_H_DMA_MASK | + CFG_RST_H_MME_MASK | + CFG_RST_H_CPU_MASK | + CFG_RST_H_MMU_MASK)); + + WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST_CFG_L, + (CFG_RST_L_IF_MASK | + CFG_RST_L_PSOC_MASK | + CFG_RST_L_TPC_MASK)); + + msleep(cpu_timeout_ms); + + /* Tell ASIC not to re-initialize PCIe */ + WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); + + /* Restart BTL/BLR upon hard-reset */ + WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1); + + WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST, + 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT); + + dev_dbg(hdev->dev, + "Issued HARD reset command, going to wait %dms\n", + reset_timeout_ms); + } else { + dev_dbg(hdev->dev, + "Firmware performs HARD reset, going to wait %dms\n", + reset_timeout_ms); + } + +skip_reset: + /* + * After hard reset, we can't poll the BTM_FSM register because the PSOC + * itself is in reset. Need to wait until the reset is deasserted + */ + msleep(reset_timeout_ms); + + status = RREG32(mmPSOC_GLOBAL_CONF_BTM_FSM); + if (status & PSOC_GLOBAL_CONF_BTM_FSM_STATE_MASK) + dev_err(hdev->dev, + "Timeout while waiting for device to reset 0x%x\n", + status); + + if (gaudi) { + gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q | HW_CAP_HBM | + HW_CAP_PCI_DMA | HW_CAP_MME | HW_CAP_TPC_MASK | + HW_CAP_HBM_DMA | HW_CAP_PLL | HW_CAP_NIC_MASK | + HW_CAP_MMU | HW_CAP_SRAM_SCRAMBLER | + HW_CAP_HBM_SCRAMBLER); + + memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat)); + + hdev->device_cpu_is_halted = false; + } +} + +static int gaudi_suspend(struct hl_device *hdev) +{ + int rc; + + rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); + if (rc) + dev_err(hdev->dev, "Failed to disable PCI access from CPU\n"); + + return rc; +} + +static int gaudi_resume(struct hl_device *hdev) +{ + return gaudi_init_iatu(hdev); +} + +static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + int rc; + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE; + + rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, + (dma_addr - HOST_PHYS_BASE), size); + if (rc) + dev_err(hdev->dev, "dma_mmap_coherent error %d", rc); + + return rc; +} + +static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 db_reg_offset, db_value, dma_qm_offset, q_off, irq_handler_offset; + struct gaudi_device *gaudi = hdev->asic_specific; + bool invalid_queue = false; + int dma_id; + + switch (hw_queue_id) { + case GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3: + dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_1]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + (hw_queue_id & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3: + dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + (hw_queue_id & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3: + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_1]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3: + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_2]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3: + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_3]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3: + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_4]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3: + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_5]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3: + dma_id = gaudi_dma_assignment[GAUDI_HBM_DMA_6]; + dma_qm_offset = dma_id * DMA_QMAN_OFFSET; + q_off = dma_qm_offset + ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmDMA0_QM_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_CPU_PQ: + if (gaudi->hw_cap_initialized & HW_CAP_CPU_Q) + db_reg_offset = mmCPU_IF_PF_PQ_PI; + else + invalid_queue = true; + break; + + case GAUDI_QUEUE_ID_MME_0_0: + db_reg_offset = mmMME2_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_MME_0_1: + db_reg_offset = mmMME2_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_MME_0_2: + db_reg_offset = mmMME2_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_MME_0_3: + db_reg_offset = mmMME2_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_MME_1_0: + db_reg_offset = mmMME0_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_MME_1_1: + db_reg_offset = mmMME0_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_MME_1_2: + db_reg_offset = mmMME0_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_MME_1_3: + db_reg_offset = mmMME0_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_0_0: + db_reg_offset = mmTPC0_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_0_1: + db_reg_offset = mmTPC0_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_0_2: + db_reg_offset = mmTPC0_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_0_3: + db_reg_offset = mmTPC0_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_1_0: + db_reg_offset = mmTPC1_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_1_1: + db_reg_offset = mmTPC1_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_1_2: + db_reg_offset = mmTPC1_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_1_3: + db_reg_offset = mmTPC1_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_2_0: + db_reg_offset = mmTPC2_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_2_1: + db_reg_offset = mmTPC2_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_2_2: + db_reg_offset = mmTPC2_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_2_3: + db_reg_offset = mmTPC2_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_3_0: + db_reg_offset = mmTPC3_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_3_1: + db_reg_offset = mmTPC3_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_3_2: + db_reg_offset = mmTPC3_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_3_3: + db_reg_offset = mmTPC3_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_4_0: + db_reg_offset = mmTPC4_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_4_1: + db_reg_offset = mmTPC4_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_4_2: + db_reg_offset = mmTPC4_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_4_3: + db_reg_offset = mmTPC4_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_5_0: + db_reg_offset = mmTPC5_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_5_1: + db_reg_offset = mmTPC5_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_5_2: + db_reg_offset = mmTPC5_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_5_3: + db_reg_offset = mmTPC5_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_6_0: + db_reg_offset = mmTPC6_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_6_1: + db_reg_offset = mmTPC6_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_6_2: + db_reg_offset = mmTPC6_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_6_3: + db_reg_offset = mmTPC6_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_TPC_7_0: + db_reg_offset = mmTPC7_QM_PQ_PI_0; + break; + + case GAUDI_QUEUE_ID_TPC_7_1: + db_reg_offset = mmTPC7_QM_PQ_PI_1; + break; + + case GAUDI_QUEUE_ID_TPC_7_2: + db_reg_offset = mmTPC7_QM_PQ_PI_2; + break; + + case GAUDI_QUEUE_ID_TPC_7_3: + db_reg_offset = mmTPC7_QM_PQ_PI_3; + break; + + case GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC0)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC0_QM0_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC1)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC0_QM1_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC2)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC1_QM0_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC3)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC1_QM1_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC4)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC2_QM0_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC5)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC2_QM1_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC6)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC3_QM0_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC7)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC3_QM1_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC8)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC4_QM0_PQ_PI_0 + q_off; + break; + + case GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3: + if (!(gaudi->hw_cap_initialized & HW_CAP_NIC9)) + invalid_queue = true; + + q_off = ((hw_queue_id - 1) & 0x3) * 4; + db_reg_offset = mmNIC4_QM1_PQ_PI_0 + q_off; + break; + + default: + invalid_queue = true; + } + + if (invalid_queue) { + /* Should never get here */ + dev_err(hdev->dev, "h/w queue %d is invalid. Can't set pi\n", + hw_queue_id); + return; + } + + db_value = pi; + + /* ring the doorbell */ + WREG32(db_reg_offset, db_value); + + if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) { + /* make sure device CPU will read latest data from host */ + mb(); + + irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_host_pi_upd_irq); + + WREG32(irq_handler_offset, + gaudi_irq_map_table[GAUDI_EVENT_PI_UPDATE].cpu_id); + } +} + +static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe, + struct hl_bd *bd) +{ + __le64 *pbd = (__le64 *) bd; + + /* The QMANs are on the host memory so a simple copy suffice */ + pqe[0] = pbd[0]; + pqe[1] = pbd[1]; +} + +static void *gaudi_dma_alloc_coherent(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +{ + void *kernel_addr = dma_alloc_coherent(&hdev->pdev->dev, size, + dma_handle, flags); + + /* Shift to the device's base physical address of host memory */ + if (kernel_addr) + *dma_handle += HOST_PHYS_BASE; + + return kernel_addr; +} + +static void gaudi_dma_free_coherent(struct hl_device *hdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + /* Cancel the device's base physical address of host memory */ + dma_addr_t fixed_dma_handle = dma_handle - HOST_PHYS_BASE; + + dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle); +} + +static int gaudi_scrub_device_dram(struct hl_device *hdev, u64 val) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 cur_addr = prop->dram_user_base_address; + u32 chunk_size, busy; + int rc, dma_id; + + while (cur_addr < prop->dram_end_address) { + for (dma_id = 0 ; dma_id < DMA_NUMBER_OF_CHANNELS ; dma_id++) { + u32 dma_offset = dma_id * DMA_CORE_OFFSET; + + chunk_size = + min((u64)SZ_2G, prop->dram_end_address - cur_addr); + + dev_dbg(hdev->dev, + "Doing HBM scrubbing for 0x%09llx - 0x%09llx\n", + cur_addr, cur_addr + chunk_size); + + WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, + lower_32_bits(val)); + WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, + upper_32_bits(val)); + WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset, + lower_32_bits(cur_addr)); + WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset, + upper_32_bits(cur_addr)); + WREG32(mmDMA0_CORE_DST_TSIZE_0 + dma_offset, + chunk_size); + WREG32(mmDMA0_CORE_COMMIT + dma_offset, + ((1 << DMA0_CORE_COMMIT_LIN_SHIFT) | + (1 << DMA0_CORE_COMMIT_MEM_SET_SHIFT))); + + cur_addr += chunk_size; + + if (cur_addr == prop->dram_end_address) + break; + } + + for (dma_id = 0 ; dma_id < DMA_NUMBER_OF_CHANNELS ; dma_id++) { + u32 dma_offset = dma_id * DMA_CORE_OFFSET; + + rc = hl_poll_timeout( + hdev, + mmDMA0_CORE_STS0 + dma_offset, + busy, + ((busy & DMA0_CORE_STS0_BUSY_MASK) == 0), + 1000, + HBM_SCRUBBING_TIMEOUT_US); + + if (rc) { + dev_err(hdev->dev, + "DMA Timeout during HBM scrubbing of DMA #%d\n", + dma_id); + return -EIO; + } + } + } + + return 0; +} + +static int gaudi_scrub_device_mem(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 wait_to_idle_time = hdev->pdev ? HBM_SCRUBBING_TIMEOUT_US : + min_t(u64, HBM_SCRUBBING_TIMEOUT_US * 10, HL_SIM_MAX_TIMEOUT_US); + u64 addr, size, val = hdev->memory_scrub_val; + ktime_t timeout; + int rc = 0; + + if (!hdev->memory_scrub) + return 0; + + timeout = ktime_add_us(ktime_get(), wait_to_idle_time); + while (!hdev->asic_funcs->is_device_idle(hdev, NULL, 0, NULL)) { + if (ktime_compare(ktime_get(), timeout) > 0) { + dev_err(hdev->dev, "waiting for idle timeout\n"); + return -ETIMEDOUT; + } + usleep_range((1000 >> 2) + 1, 1000); + } + + /* Scrub SRAM */ + addr = prop->sram_user_base_address; + size = hdev->pldm ? 0x10000 : prop->sram_size - SRAM_USER_BASE_OFFSET; + + dev_dbg(hdev->dev, "Scrubbing SRAM: 0x%09llx - 0x%09llx val: 0x%llx\n", + addr, addr + size, val); + rc = gaudi_memset_device_memory(hdev, addr, size, val); + if (rc) { + dev_err(hdev->dev, "Failed to clear SRAM (%d)\n", rc); + return rc; + } + + /* Scrub HBM using all DMA channels in parallel */ + rc = gaudi_scrub_device_dram(hdev, val); + if (rc) { + dev_err(hdev->dev, "Failed to clear HBM (%d)\n", rc); + return rc; + } + + return 0; +} + +static void *gaudi_get_int_queue_base(struct hl_device *hdev, + u32 queue_id, dma_addr_t *dma_handle, + u16 *queue_len) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct gaudi_internal_qman_info *q; + + if (queue_id >= GAUDI_QUEUE_ID_SIZE || + gaudi_queue_type[queue_id] != QUEUE_TYPE_INT) { + dev_err(hdev->dev, "Got invalid queue id %d\n", queue_id); + return NULL; + } + + q = &gaudi->internal_qmans[queue_id]; + *dma_handle = q->pq_dma_addr; + *queue_len = q->pq_size / QMAN_PQ_ENTRY_SIZE; + + return q->pq_kernel_addr; +} + +static int gaudi_send_cpu_message(struct hl_device *hdev, u32 *msg, + u16 len, u32 timeout, u64 *result) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) { + if (result) + *result = 0; + return 0; + } + + if (!timeout) + timeout = GAUDI_MSG_TO_CPU_TIMEOUT_USEC; + + return hl_fw_send_cpu_message(hdev, GAUDI_QUEUE_ID_CPU_PQ, msg, len, + timeout, result); +} + +static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id) +{ + struct packet_msg_prot *fence_pkt; + dma_addr_t pkt_dma_addr; + u32 fence_val, tmp, timeout_usec; + dma_addr_t fence_dma_addr; + u32 *fence_ptr; + int rc; + + if (hdev->pldm) + timeout_usec = GAUDI_PLDM_TEST_QUEUE_WAIT_USEC; + else + timeout_usec = GAUDI_TEST_QUEUE_WAIT_USEC; + + fence_val = GAUDI_QMAN0_FENCE_VAL; + + fence_ptr = hl_asic_dma_pool_zalloc(hdev, 4, GFP_KERNEL, &fence_dma_addr); + if (!fence_ptr) { + dev_err(hdev->dev, + "Failed to allocate memory for H/W queue %d testing\n", + hw_queue_id); + return -ENOMEM; + } + + *fence_ptr = 0; + + fence_pkt = hl_asic_dma_pool_zalloc(hdev, sizeof(struct packet_msg_prot), GFP_KERNEL, + &pkt_dma_addr); + if (!fence_pkt) { + dev_err(hdev->dev, + "Failed to allocate packet for H/W queue %d testing\n", + hw_queue_id); + rc = -ENOMEM; + goto free_fence_ptr; + } + + tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT); + tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1); + tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + fence_pkt->ctl = cpu_to_le32(tmp); + fence_pkt->value = cpu_to_le32(fence_val); + fence_pkt->addr = cpu_to_le64(fence_dma_addr); + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, + sizeof(struct packet_msg_prot), + pkt_dma_addr); + if (rc) { + dev_err(hdev->dev, + "Failed to send fence packet to H/W queue %d\n", + hw_queue_id); + goto free_pkt; + } + + rc = hl_poll_timeout_memory(hdev, fence_ptr, tmp, (tmp == fence_val), + 1000, timeout_usec, true); + + hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); + + if (rc == -ETIMEDOUT) { + dev_err(hdev->dev, + "H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n", + hw_queue_id, (unsigned long long) fence_dma_addr, tmp); + rc = -EIO; + } + +free_pkt: + hl_asic_dma_pool_free(hdev, (void *) fence_pkt, pkt_dma_addr); +free_fence_ptr: + hl_asic_dma_pool_free(hdev, (void *) fence_ptr, fence_dma_addr); + return rc; +} + +static int gaudi_test_cpu_queue(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + /* + * check capability here as send_cpu_message() won't update the result + * value if no capability + */ + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + return hl_fw_test_cpu_queue(hdev); +} + +static int gaudi_test_queues(struct hl_device *hdev) +{ + int i, rc, ret_val = 0; + + for (i = 0 ; i < hdev->asic_prop.max_queues ; i++) { + if (hdev->asic_prop.hw_queues_props[i].type == QUEUE_TYPE_EXT) { + rc = gaudi_test_queue(hdev, i); + if (rc) + ret_val = -EINVAL; + } + } + + rc = gaudi_test_cpu_queue(hdev); + if (rc) + ret_val = -EINVAL; + + return ret_val; +} + +static void *gaudi_dma_pool_zalloc(struct hl_device *hdev, size_t size, + gfp_t mem_flags, dma_addr_t *dma_handle) +{ + void *kernel_addr; + + if (size > GAUDI_DMA_POOL_BLK_SIZE) + return NULL; + + kernel_addr = dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle); + + /* Shift to the device's base physical address of host memory */ + if (kernel_addr) + *dma_handle += HOST_PHYS_BASE; + + return kernel_addr; +} + +static void gaudi_dma_pool_free(struct hl_device *hdev, void *vaddr, + dma_addr_t dma_addr) +{ + /* Cancel the device's base physical address of host memory */ + dma_addr_t fixed_dma_addr = dma_addr - HOST_PHYS_BASE; + + dma_pool_free(hdev->dma_pool, vaddr, fixed_dma_addr); +} + +static void *gaudi_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, + size_t size, dma_addr_t *dma_handle) +{ + return hl_fw_cpu_accessible_dma_pool_alloc(hdev, size, dma_handle); +} + +static void gaudi_cpu_accessible_dma_pool_free(struct hl_device *hdev, + size_t size, void *vaddr) +{ + hl_fw_cpu_accessible_dma_pool_free(hdev, size, vaddr); +} + +static u32 gaudi_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt) +{ + struct scatterlist *sg, *sg_next_iter; + u32 count, dma_desc_cnt; + u64 len, len_next; + dma_addr_t addr, addr_next; + + dma_desc_cnt = 0; + + for_each_sgtable_dma_sg(sgt, sg, count) { + len = sg_dma_len(sg); + addr = sg_dma_address(sg); + + if (len == 0) + break; + + while ((count + 1) < sgt->nents) { + sg_next_iter = sg_next(sg); + len_next = sg_dma_len(sg_next_iter); + addr_next = sg_dma_address(sg_next_iter); + + if (len_next == 0) + break; + + if ((addr + len == addr_next) && + (len + len_next <= DMA_MAX_TRANSFER_SIZE)) { + len += len_next; + count++; + sg = sg_next_iter; + } else { + break; + } + } + + dma_desc_cnt++; + } + + return dma_desc_cnt * sizeof(struct packet_lin_dma); +} + +static int gaudi_pin_memory_before_cs(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + u64 addr, enum dma_data_direction dir) +{ + struct hl_userptr *userptr; + int rc; + + if (hl_userptr_is_pinned(hdev, addr, le32_to_cpu(user_dma_pkt->tsize), + parser->job_userptr_list, &userptr)) + goto already_pinned; + + userptr = kzalloc(sizeof(*userptr), GFP_KERNEL); + if (!userptr) + return -ENOMEM; + + rc = hl_pin_host_memory(hdev, addr, le32_to_cpu(user_dma_pkt->tsize), + userptr); + if (rc) + goto free_userptr; + + list_add_tail(&userptr->job_node, parser->job_userptr_list); + + rc = hdev->asic_funcs->asic_dma_map_sgtable(hdev, userptr->sgt, dir); + if (rc) { + dev_err(hdev->dev, "failed to map sgt with DMA region\n"); + goto unpin_memory; + } + + userptr->dma_mapped = true; + userptr->dir = dir; + +already_pinned: + parser->patched_cb_size += + gaudi_get_dma_desc_list_size(hdev, userptr->sgt); + + return 0; + +unpin_memory: + list_del(&userptr->job_node); + hl_unpin_host_memory(hdev, userptr); +free_userptr: + kfree(userptr); + return rc; +} + +static int gaudi_validate_dma_pkt_host(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + bool src_in_host) +{ + enum dma_data_direction dir; + bool skip_host_mem_pin = false, user_memset; + u64 addr; + int rc = 0; + + user_memset = (le32_to_cpu(user_dma_pkt->ctl) & + GAUDI_PKT_LIN_DMA_CTL_MEMSET_MASK) >> + GAUDI_PKT_LIN_DMA_CTL_MEMSET_SHIFT; + + if (src_in_host) { + if (user_memset) + skip_host_mem_pin = true; + + dev_dbg(hdev->dev, "DMA direction is HOST --> DEVICE\n"); + dir = DMA_TO_DEVICE; + addr = le64_to_cpu(user_dma_pkt->src_addr); + } else { + dev_dbg(hdev->dev, "DMA direction is DEVICE --> HOST\n"); + dir = DMA_FROM_DEVICE; + addr = (le64_to_cpu(user_dma_pkt->dst_addr) & + GAUDI_PKT_LIN_DMA_DST_ADDR_MASK) >> + GAUDI_PKT_LIN_DMA_DST_ADDR_SHIFT; + } + + if (skip_host_mem_pin) + parser->patched_cb_size += sizeof(*user_dma_pkt); + else + rc = gaudi_pin_memory_before_cs(hdev, parser, user_dma_pkt, + addr, dir); + + return rc; +} + +static int gaudi_validate_dma_pkt_no_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + bool src_in_host = false; + u64 dst_addr = (le64_to_cpu(user_dma_pkt->dst_addr) & + GAUDI_PKT_LIN_DMA_DST_ADDR_MASK) >> + GAUDI_PKT_LIN_DMA_DST_ADDR_SHIFT; + + dev_dbg(hdev->dev, "DMA packet details:\n"); + dev_dbg(hdev->dev, "source == 0x%llx\n", + le64_to_cpu(user_dma_pkt->src_addr)); + dev_dbg(hdev->dev, "destination == 0x%llx\n", dst_addr); + dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize)); + + /* + * Special handling for DMA with size 0. Bypass all validations + * because no transactions will be done except for WR_COMP, which + * is not a security issue + */ + if (!le32_to_cpu(user_dma_pkt->tsize)) { + parser->patched_cb_size += sizeof(*user_dma_pkt); + return 0; + } + + if (parser->hw_queue_id <= GAUDI_QUEUE_ID_DMA_0_3) + src_in_host = true; + + return gaudi_validate_dma_pkt_host(hdev, parser, user_dma_pkt, + src_in_host); +} + +static int gaudi_validate_load_and_exe_pkt(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_load_and_exe *user_pkt) +{ + u32 cfg; + + cfg = le32_to_cpu(user_pkt->cfg); + + if (cfg & GAUDI_PKT_LOAD_AND_EXE_CFG_DST_MASK) { + dev_err(hdev->dev, + "User not allowed to use Load and Execute\n"); + return -EPERM; + } + + parser->patched_cb_size += sizeof(struct packet_load_and_exe); + + return 0; +} + +static int gaudi_validate_cb(struct hl_device *hdev, + struct hl_cs_parser *parser, bool is_mmu) +{ + u32 cb_parsed_length = 0; + int rc = 0; + + parser->patched_cb_size = 0; + + /* cb_user_size is more than 0 so loop will always be executed */ + while (cb_parsed_length < parser->user_cb_size) { + enum packet_id pkt_id; + u16 pkt_size; + struct gaudi_packet *user_pkt; + + user_pkt = parser->user_cb->kernel_address + cb_parsed_length; + + pkt_id = (enum packet_id) ( + (le64_to_cpu(user_pkt->header) & + PACKET_HEADER_PACKET_ID_MASK) >> + PACKET_HEADER_PACKET_ID_SHIFT); + + if (!validate_packet_id(pkt_id)) { + dev_err(hdev->dev, "Invalid packet id %u\n", pkt_id); + rc = -EINVAL; + break; + } + + pkt_size = gaudi_packet_sizes[pkt_id]; + cb_parsed_length += pkt_size; + if (cb_parsed_length > parser->user_cb_size) { + dev_err(hdev->dev, + "packet 0x%x is out of CB boundary\n", pkt_id); + rc = -EINVAL; + break; + } + + switch (pkt_id) { + case PACKET_MSG_PROT: + dev_err(hdev->dev, + "User not allowed to use MSG_PROT\n"); + rc = -EPERM; + break; + + case PACKET_CP_DMA: + dev_err(hdev->dev, "User not allowed to use CP_DMA\n"); + rc = -EPERM; + break; + + case PACKET_STOP: + dev_err(hdev->dev, "User not allowed to use STOP\n"); + rc = -EPERM; + break; + + case PACKET_WREG_BULK: + dev_err(hdev->dev, + "User not allowed to use WREG_BULK\n"); + rc = -EPERM; + break; + + case PACKET_LOAD_AND_EXE: + rc = gaudi_validate_load_and_exe_pkt(hdev, parser, + (struct packet_load_and_exe *) user_pkt); + break; + + case PACKET_LIN_DMA: + parser->contains_dma_pkt = true; + if (is_mmu) + parser->patched_cb_size += pkt_size; + else + rc = gaudi_validate_dma_pkt_no_mmu(hdev, parser, + (struct packet_lin_dma *) user_pkt); + break; + + case PACKET_WREG_32: + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_REPEAT: + case PACKET_FENCE: + case PACKET_NOP: + case PACKET_ARB_POINT: + parser->patched_cb_size += pkt_size; + break; + + default: + dev_err(hdev->dev, "Invalid packet header 0x%x\n", + pkt_id); + rc = -EINVAL; + break; + } + + if (rc) + break; + } + + /* + * The new CB should have space at the end for two MSG_PROT packets: + * 1. Optional NOP padding for cacheline alignment + * 2. A packet that will act as a completion packet + * 3. A packet that will generate MSI interrupt + */ + if (parser->completion) + parser->patched_cb_size += gaudi_get_patched_cb_extra_size( + parser->patched_cb_size); + + return rc; +} + +static int gaudi_patch_dma_packet(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + struct packet_lin_dma *new_dma_pkt, + u32 *new_dma_pkt_size) +{ + struct hl_userptr *userptr; + struct scatterlist *sg, *sg_next_iter; + u32 count, dma_desc_cnt, user_wrcomp_en_mask, ctl; + u64 len, len_next; + dma_addr_t dma_addr, dma_addr_next; + u64 device_memory_addr, addr; + enum dma_data_direction dir; + struct sg_table *sgt; + bool src_in_host = false; + bool skip_host_mem_pin = false; + bool user_memset; + + ctl = le32_to_cpu(user_dma_pkt->ctl); + + if (parser->hw_queue_id <= GAUDI_QUEUE_ID_DMA_0_3) + src_in_host = true; + + user_memset = (ctl & GAUDI_PKT_LIN_DMA_CTL_MEMSET_MASK) >> + GAUDI_PKT_LIN_DMA_CTL_MEMSET_SHIFT; + + if (src_in_host) { + addr = le64_to_cpu(user_dma_pkt->src_addr); + device_memory_addr = le64_to_cpu(user_dma_pkt->dst_addr); + dir = DMA_TO_DEVICE; + if (user_memset) + skip_host_mem_pin = true; + } else { + addr = le64_to_cpu(user_dma_pkt->dst_addr); + device_memory_addr = le64_to_cpu(user_dma_pkt->src_addr); + dir = DMA_FROM_DEVICE; + } + + if ((!skip_host_mem_pin) && + (!hl_userptr_is_pinned(hdev, addr, + le32_to_cpu(user_dma_pkt->tsize), + parser->job_userptr_list, &userptr))) { + dev_err(hdev->dev, "Userptr 0x%llx + 0x%x NOT mapped\n", + addr, user_dma_pkt->tsize); + return -EFAULT; + } + + if ((user_memset) && (dir == DMA_TO_DEVICE)) { + memcpy(new_dma_pkt, user_dma_pkt, sizeof(*user_dma_pkt)); + *new_dma_pkt_size = sizeof(*user_dma_pkt); + return 0; + } + + user_wrcomp_en_mask = ctl & GAUDI_PKT_LIN_DMA_CTL_WRCOMP_EN_MASK; + + sgt = userptr->sgt; + dma_desc_cnt = 0; + + for_each_sgtable_dma_sg(sgt, sg, count) { + len = sg_dma_len(sg); + dma_addr = sg_dma_address(sg); + + if (len == 0) + break; + + while ((count + 1) < sgt->nents) { + sg_next_iter = sg_next(sg); + len_next = sg_dma_len(sg_next_iter); + dma_addr_next = sg_dma_address(sg_next_iter); + + if (len_next == 0) + break; + + if ((dma_addr + len == dma_addr_next) && + (len + len_next <= DMA_MAX_TRANSFER_SIZE)) { + len += len_next; + count++; + sg = sg_next_iter; + } else { + break; + } + } + + ctl = le32_to_cpu(user_dma_pkt->ctl); + if (likely(dma_desc_cnt)) + ctl &= ~GAUDI_PKT_CTL_EB_MASK; + ctl &= ~GAUDI_PKT_LIN_DMA_CTL_WRCOMP_EN_MASK; + new_dma_pkt->ctl = cpu_to_le32(ctl); + new_dma_pkt->tsize = cpu_to_le32(len); + + if (dir == DMA_TO_DEVICE) { + new_dma_pkt->src_addr = cpu_to_le64(dma_addr); + new_dma_pkt->dst_addr = cpu_to_le64(device_memory_addr); + } else { + new_dma_pkt->src_addr = cpu_to_le64(device_memory_addr); + new_dma_pkt->dst_addr = cpu_to_le64(dma_addr); + } + + if (!user_memset) + device_memory_addr += len; + dma_desc_cnt++; + new_dma_pkt++; + } + + if (!dma_desc_cnt) { + dev_err(hdev->dev, + "Error of 0 SG entries when patching DMA packet\n"); + return -EFAULT; + } + + /* Fix the last dma packet - wrcomp must be as user set it */ + new_dma_pkt--; + new_dma_pkt->ctl |= cpu_to_le32(user_wrcomp_en_mask); + + *new_dma_pkt_size = dma_desc_cnt * sizeof(struct packet_lin_dma); + + return 0; +} + +static int gaudi_patch_cb(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u32 cb_parsed_length = 0; + u32 cb_patched_cur_length = 0; + int rc = 0; + + /* cb_user_size is more than 0 so loop will always be executed */ + while (cb_parsed_length < parser->user_cb_size) { + enum packet_id pkt_id; + u16 pkt_size; + u32 new_pkt_size = 0; + struct gaudi_packet *user_pkt, *kernel_pkt; + + user_pkt = parser->user_cb->kernel_address + cb_parsed_length; + kernel_pkt = parser->patched_cb->kernel_address + + cb_patched_cur_length; + + pkt_id = (enum packet_id) ( + (le64_to_cpu(user_pkt->header) & + PACKET_HEADER_PACKET_ID_MASK) >> + PACKET_HEADER_PACKET_ID_SHIFT); + + if (!validate_packet_id(pkt_id)) { + dev_err(hdev->dev, "Invalid packet id %u\n", pkt_id); + rc = -EINVAL; + break; + } + + pkt_size = gaudi_packet_sizes[pkt_id]; + cb_parsed_length += pkt_size; + if (cb_parsed_length > parser->user_cb_size) { + dev_err(hdev->dev, + "packet 0x%x is out of CB boundary\n", pkt_id); + rc = -EINVAL; + break; + } + + switch (pkt_id) { + case PACKET_LIN_DMA: + rc = gaudi_patch_dma_packet(hdev, parser, + (struct packet_lin_dma *) user_pkt, + (struct packet_lin_dma *) kernel_pkt, + &new_pkt_size); + cb_patched_cur_length += new_pkt_size; + break; + + case PACKET_MSG_PROT: + dev_err(hdev->dev, + "User not allowed to use MSG_PROT\n"); + rc = -EPERM; + break; + + case PACKET_CP_DMA: + dev_err(hdev->dev, "User not allowed to use CP_DMA\n"); + rc = -EPERM; + break; + + case PACKET_STOP: + dev_err(hdev->dev, "User not allowed to use STOP\n"); + rc = -EPERM; + break; + + case PACKET_WREG_32: + case PACKET_WREG_BULK: + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_REPEAT: + case PACKET_FENCE: + case PACKET_NOP: + case PACKET_ARB_POINT: + case PACKET_LOAD_AND_EXE: + memcpy(kernel_pkt, user_pkt, pkt_size); + cb_patched_cur_length += pkt_size; + break; + + default: + dev_err(hdev->dev, "Invalid packet header 0x%x\n", + pkt_id); + rc = -EINVAL; + break; + } + + if (rc) + break; + } + + return rc; +} + +static int gaudi_parse_cb_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u64 handle; + u32 patched_cb_size; + struct hl_cb *user_cb; + int rc; + + /* + * The new CB should have space at the end for two MSG_PROT packets: + * 1. Optional NOP padding for cacheline alignment + * 2. A packet that will act as a completion packet + * 3. A packet that will generate MSI interrupt + */ + if (parser->completion) + parser->patched_cb_size = parser->user_cb_size + + gaudi_get_patched_cb_extra_size(parser->user_cb_size); + else + parser->patched_cb_size = parser->user_cb_size; + + rc = hl_cb_create(hdev, &hdev->kernel_mem_mgr, hdev->kernel_ctx, + parser->patched_cb_size, false, false, + &handle); + + if (rc) { + dev_err(hdev->dev, + "Failed to allocate patched CB for DMA CS %d\n", + rc); + return rc; + } + + parser->patched_cb = hl_cb_get(&hdev->kernel_mem_mgr, handle); + /* hl_cb_get should never fail */ + if (!parser->patched_cb) { + dev_crit(hdev->dev, "DMA CB handle invalid 0x%llx\n", handle); + rc = -EFAULT; + goto out; + } + + /* + * We are protected from overflow because the check + * "parser->user_cb_size <= parser->user_cb->size" was done in get_cb_from_cs_chunk() + * in the common code. That check is done only if is_kernel_allocated_cb is true. + * + * There is no option to reach here without going through that check because: + * 1. validate_queue_index() assigns true to is_kernel_allocated_cb for any submission to + * an external queue. + * 2. For Gaudi, we only parse CBs that were submitted to the external queues. + */ + memcpy(parser->patched_cb->kernel_address, + parser->user_cb->kernel_address, + parser->user_cb_size); + + patched_cb_size = parser->patched_cb_size; + + /* Validate patched CB instead of user CB */ + user_cb = parser->user_cb; + parser->user_cb = parser->patched_cb; + rc = gaudi_validate_cb(hdev, parser, true); + parser->user_cb = user_cb; + + if (rc) { + hl_cb_put(parser->patched_cb); + goto out; + } + + if (patched_cb_size != parser->patched_cb_size) { + dev_err(hdev->dev, "user CB size mismatch\n"); + hl_cb_put(parser->patched_cb); + rc = -EINVAL; + goto out; + } + +out: + /* + * Always call cb destroy here because we still have 1 reference + * to it by calling cb_get earlier. After the job will be completed, + * cb_put will release it, but here we want to remove it from the + * idr + */ + hl_cb_destroy(&hdev->kernel_mem_mgr, handle); + + return rc; +} + +static int gaudi_parse_cb_no_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u64 handle; + int rc; + + rc = gaudi_validate_cb(hdev, parser, false); + + if (rc) + goto free_userptr; + + rc = hl_cb_create(hdev, &hdev->kernel_mem_mgr, hdev->kernel_ctx, + parser->patched_cb_size, false, false, + &handle); + if (rc) { + dev_err(hdev->dev, + "Failed to allocate patched CB for DMA CS %d\n", rc); + goto free_userptr; + } + + parser->patched_cb = hl_cb_get(&hdev->kernel_mem_mgr, handle); + /* hl_cb_get should never fail here */ + if (!parser->patched_cb) { + dev_crit(hdev->dev, "DMA CB handle invalid 0x%llx\n", handle); + rc = -EFAULT; + goto out; + } + + rc = gaudi_patch_cb(hdev, parser); + + if (rc) + hl_cb_put(parser->patched_cb); + +out: + /* + * Always call cb destroy here because we still have 1 reference + * to it by calling cb_get earlier. After the job will be completed, + * cb_put will release it, but here we want to remove it from the + * idr + */ + hl_cb_destroy(&hdev->kernel_mem_mgr, handle); + +free_userptr: + if (rc) + hl_userptr_delete_list(hdev, parser->job_userptr_list); + return rc; +} + +static int gaudi_parse_cb_no_ext_queue(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + struct asic_fixed_properties *asic_prop = &hdev->asic_prop; + struct gaudi_device *gaudi = hdev->asic_specific; + u32 nic_queue_offset, nic_mask_q_id; + + if ((parser->hw_queue_id >= GAUDI_QUEUE_ID_NIC_0_0) && + (parser->hw_queue_id <= GAUDI_QUEUE_ID_NIC_9_3)) { + nic_queue_offset = parser->hw_queue_id - GAUDI_QUEUE_ID_NIC_0_0; + nic_mask_q_id = 1 << (HW_CAP_NIC_SHIFT + (nic_queue_offset >> 2)); + + if (!(gaudi->hw_cap_initialized & nic_mask_q_id)) { + dev_err(hdev->dev, "h/w queue %d is disabled\n", parser->hw_queue_id); + return -EINVAL; + } + } + + /* For internal queue jobs just check if CB address is valid */ + if (hl_mem_area_inside_range((u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->sram_user_base_address, + asic_prop->sram_end_address)) + return 0; + + if (hl_mem_area_inside_range((u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->dram_user_base_address, + asic_prop->dram_end_address)) + return 0; + + /* PMMU and HPMMU addresses are equal, check only one of them */ + if (hl_mem_area_inside_range((u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->pmmu.start_addr, + asic_prop->pmmu.end_addr)) + return 0; + + dev_err(hdev->dev, + "CB address 0x%px + 0x%x for internal QMAN is not valid\n", + parser->user_cb, parser->user_cb_size); + + return -EFAULT; +} + +static int gaudi_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (parser->queue_type == QUEUE_TYPE_INT) + return gaudi_parse_cb_no_ext_queue(hdev, parser); + + if (gaudi->hw_cap_initialized & HW_CAP_MMU) + return gaudi_parse_cb_mmu(hdev, parser); + else + return gaudi_parse_cb_no_mmu(hdev, parser); +} + +static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_address, + u32 len, u32 original_len, u64 cq_addr, u32 cq_val, + u32 msi_vec, bool eb) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct packet_msg_prot *cq_pkt; + struct packet_nop *cq_padding; + u64 msi_addr; + u32 tmp; + + cq_padding = kernel_address + original_len; + cq_pkt = kernel_address + len - (sizeof(struct packet_msg_prot) * 2); + + while ((void *)cq_padding < (void *)cq_pkt) { + cq_padding->ctl = cpu_to_le32(FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_NOP)); + cq_padding++; + } + + tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT); + tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + if (eb) + tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1); + + cq_pkt->ctl = cpu_to_le32(tmp); + cq_pkt->value = cpu_to_le32(cq_val); + cq_pkt->addr = cpu_to_le64(cq_addr); + + cq_pkt++; + + tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT); + tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + cq_pkt->ctl = cpu_to_le32(tmp); + cq_pkt->value = cpu_to_le32(1); + + if (gaudi->multi_msi_mode) + msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4; + else + msi_addr = mmPCIE_CORE_MSI_REQ; + + cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr); +} + +static void gaudi_update_eq_ci(struct hl_device *hdev, u32 val) +{ + WREG32(mmCPU_IF_EQ_RD_OFFS, val); +} + +static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr, + u32 size, u64 val) +{ + struct packet_lin_dma *lin_dma_pkt; + struct hl_cs_job *job; + u32 cb_size, ctl, err_cause; + struct hl_cb *cb; + int rc; + + cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false); + if (!cb) + return -EFAULT; + + lin_dma_pkt = cb->kernel_address; + memset(lin_dma_pkt, 0, sizeof(*lin_dma_pkt)); + cb_size = sizeof(*lin_dma_pkt); + + ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); + ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_MEMSET_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + + lin_dma_pkt->ctl = cpu_to_le32(ctl); + lin_dma_pkt->src_addr = cpu_to_le64(val); + lin_dma_pkt->dst_addr |= cpu_to_le64(addr); + lin_dma_pkt->tsize = cpu_to_le32(size); + + job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto release_cb; + } + + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE); + if (err_cause && !hdev->init_done) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause); + } + + job->id = 0; + job->user_cb = cb; + atomic_inc(&job->user_cb->cs_cnt); + job->user_cb_size = cb_size; + job->hw_queue_id = GAUDI_QUEUE_ID_DMA_0_0; + job->patched_cb = job->user_cb; + job->job_cb_size = job->user_cb_size + sizeof(struct packet_msg_prot); + + hl_debugfs_add_job(hdev, job); + + rc = gaudi_send_job_on_qman0(hdev, job); + hl_debugfs_remove_job(hdev, job); + kfree(job); + atomic_dec(&cb->cs_cnt); + + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE); + if (err_cause) { + dev_err(hdev->dev, "DMA Failed, cause 0x%x\n", err_cause); + rc = -EIO; + if (!hdev->init_done) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause); + } + } + +release_cb: + hl_cb_put(cb); + hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle); + + return rc; +} + +static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base, + u32 num_regs, u32 val) +{ + struct packet_msg_long *pkt; + struct hl_cs_job *job; + u32 cb_size, ctl; + struct hl_cb *cb; + int i, rc; + + cb_size = (sizeof(*pkt) * num_regs) + sizeof(struct packet_msg_prot); + + if (cb_size > SZ_2M) { + dev_err(hdev->dev, "CB size must be smaller than %uMB", SZ_2M); + return -ENOMEM; + } + + cb = hl_cb_kernel_create(hdev, cb_size, false); + if (!cb) + return -EFAULT; + + pkt = cb->kernel_address; + + ctl = FIELD_PREP(GAUDI_PKT_LONG_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_LONG); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + for (i = 0; i < num_regs ; i++, pkt++) { + pkt->ctl = cpu_to_le32(ctl); + pkt->value = cpu_to_le32(val); + pkt->addr = cpu_to_le64(reg_base + (i * 4)); + } + + job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto release_cb; + } + + job->id = 0; + job->user_cb = cb; + atomic_inc(&job->user_cb->cs_cnt); + job->user_cb_size = cb_size; + job->hw_queue_id = GAUDI_QUEUE_ID_DMA_0_0; + job->patched_cb = job->user_cb; + job->job_cb_size = cb_size; + + hl_debugfs_add_job(hdev, job); + + rc = gaudi_send_job_on_qman0(hdev, job); + hl_debugfs_remove_job(hdev, job); + kfree(job); + atomic_dec(&cb->cs_cnt); + +release_cb: + hl_cb_put(cb); + hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle); + + return rc; +} + +static int gaudi_restore_sm_registers(struct hl_device *hdev) +{ + u64 base_addr; + u32 num_regs; + int rc; + + base_addr = CFG_BASE + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0; + num_regs = NUM_OF_SOB_IN_BLOCK; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_E_S_SYNC_MNGR_OBJS_SOB_OBJ_0; + num_regs = NUM_OF_SOB_IN_BLOCK; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_W_N_SYNC_MNGR_OBJS_SOB_OBJ_0; + num_regs = NUM_OF_SOB_IN_BLOCK; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0; + num_regs = NUM_OF_MONITORS_IN_BLOCK; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_E_S_SYNC_MNGR_OBJS_MON_STATUS_0; + num_regs = NUM_OF_MONITORS_IN_BLOCK; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_W_N_SYNC_MNGR_OBJS_MON_STATUS_0; + num_regs = NUM_OF_MONITORS_IN_BLOCK; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + + (GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT * 4); + num_regs = NUM_OF_SOB_IN_BLOCK - GAUDI_FIRST_AVAILABLE_W_S_SYNC_OBJECT; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + base_addr = CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0 + + (GAUDI_FIRST_AVAILABLE_W_S_MONITOR * 4); + num_regs = NUM_OF_MONITORS_IN_BLOCK - GAUDI_FIRST_AVAILABLE_W_S_MONITOR; + rc = gaudi_memset_registers(hdev, base_addr, num_regs, 0); + if (rc) { + dev_err(hdev->dev, "failed resetting SM registers"); + return -ENOMEM; + } + + return 0; +} + +static void gaudi_restore_dma_registers(struct hl_device *hdev) +{ + u32 sob_delta = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_1 - + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0; + int i; + + for (i = 0 ; i < DMA_NUMBER_OF_CHANNELS ; i++) { + u64 sob_addr = CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0 + + (i * sob_delta); + u32 dma_offset = i * DMA_CORE_OFFSET; + + WREG32(mmDMA0_CORE_WR_COMP_ADDR_LO + dma_offset, + lower_32_bits(sob_addr)); + WREG32(mmDMA0_CORE_WR_COMP_ADDR_HI + dma_offset, + upper_32_bits(sob_addr)); + WREG32(mmDMA0_CORE_WR_COMP_WDATA + dma_offset, 0x80000001); + + /* For DMAs 2-7, need to restore WR_AWUSER_31_11 as it can be + * modified by the user for SRAM reduction + */ + if (i > 1) + WREG32(mmDMA0_CORE_WR_AWUSER_31_11 + dma_offset, + 0x00000001); + } +} + +static void gaudi_restore_qm_registers(struct hl_device *hdev) +{ + u32 qman_offset; + int i; + + for (i = 0 ; i < DMA_NUMBER_OF_CHANNELS ; i++) { + qman_offset = i * DMA_QMAN_OFFSET; + WREG32(mmDMA0_QM_ARB_CFG_0 + qman_offset, 0); + } + + for (i = 0 ; i < MME_NUMBER_OF_MASTER_ENGINES ; i++) { + qman_offset = i * (mmMME2_QM_BASE - mmMME0_QM_BASE); + WREG32(mmMME0_QM_ARB_CFG_0 + qman_offset, 0); + } + + for (i = 0 ; i < TPC_NUMBER_OF_ENGINES ; i++) { + qman_offset = i * TPC_QMAN_OFFSET; + WREG32(mmTPC0_QM_ARB_CFG_0 + qman_offset, 0); + } + + for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++) { + qman_offset = (i >> 1) * NIC_MACRO_QMAN_OFFSET + + (i & 0x1) * NIC_ENGINE_QMAN_OFFSET; + WREG32(mmNIC0_QM0_ARB_CFG_0 + qman_offset, 0); + } +} + +static int gaudi_restore_user_registers(struct hl_device *hdev) +{ + int rc; + + rc = gaudi_restore_sm_registers(hdev); + if (rc) + return rc; + + gaudi_restore_dma_registers(hdev); + gaudi_restore_qm_registers(hdev); + + return 0; +} + +static int gaudi_context_switch(struct hl_device *hdev, u32 asid) +{ + return 0; +} + +static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev) +{ + u32 size = hdev->asic_prop.mmu_pgt_size + + hdev->asic_prop.mmu_cache_mng_size; + struct gaudi_device *gaudi = hdev->asic_specific; + u64 addr = hdev->asic_prop.mmu_pgt_addr; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) + return 0; + + return gaudi_memset_device_memory(hdev, addr, size, 0); +} + +static void gaudi_restore_phase_topology(struct hl_device *hdev) +{ + +} + +static int gaudi_dma_core_transfer(struct hl_device *hdev, int dma_id, u64 addr, + u32 size_to_dma, dma_addr_t dma_addr) +{ + u32 err_cause, val; + u64 dma_offset; + int rc; + + dma_offset = dma_id * DMA_CORE_OFFSET; + + WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, lower_32_bits(addr)); + WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, upper_32_bits(addr)); + WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset, lower_32_bits(dma_addr)); + WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset, upper_32_bits(dma_addr)); + WREG32(mmDMA0_CORE_DST_TSIZE_0 + dma_offset, size_to_dma); + WREG32(mmDMA0_CORE_COMMIT + dma_offset, + (1 << DMA0_CORE_COMMIT_LIN_SHIFT)); + + rc = hl_poll_timeout( + hdev, + mmDMA0_CORE_STS0 + dma_offset, + val, + ((val & DMA0_CORE_STS0_BUSY_MASK) == 0), + 0, + 1000000); + + if (rc) { + dev_err(hdev->dev, + "DMA %d timed-out during reading of 0x%llx\n", + dma_id, addr); + return -EIO; + } + + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset); + if (err_cause) { + dev_err(hdev->dev, "DMA Failed, cause 0x%x\n", err_cause); + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset, err_cause); + + return -EIO; + } + + return 0; +} + +static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size, + void *blob_addr) +{ + u32 dma_core_sts0, err_cause, cfg1, size_left, pos, size_to_dma; + u32 qm_glbl_sts0, qm_cgm_sts; + u64 dma_offset, qm_offset; + dma_addr_t dma_addr; + void *kernel_addr; + bool is_eng_idle; + int rc = 0, dma_id; + + kernel_addr = hl_asic_dma_alloc_coherent(hdev, SZ_2M, &dma_addr, GFP_KERNEL | __GFP_ZERO); + + if (!kernel_addr) + return -ENOMEM; + + hdev->asic_funcs->hw_queues_lock(hdev); + + dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_1]; + dma_offset = dma_id * DMA_CORE_OFFSET; + qm_offset = dma_id * DMA_QMAN_OFFSET; + dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset); + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); + + if (!is_eng_idle) { + dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2]; + dma_offset = dma_id * DMA_CORE_OFFSET; + qm_offset = dma_id * DMA_QMAN_OFFSET; + dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset); + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); + + if (!is_eng_idle) { + dev_err_ratelimited(hdev->dev, + "Can't read via DMA because it is BUSY\n"); + rc = -EAGAIN; + goto out; + } + } + + cfg1 = RREG32(mmDMA0_QM_GLBL_CFG1 + qm_offset); + WREG32(mmDMA0_QM_GLBL_CFG1 + qm_offset, + 0xF << DMA0_QM_GLBL_CFG1_CP_STOP_SHIFT); + + /* TODO: remove this by mapping the DMA temporary buffer to the MMU + * using the compute ctx ASID, if exists. If not, use the kernel ctx + * ASID + */ + WREG32_OR(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_VAL_SHIFT)); + + /* Verify DMA is OK */ + err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset); + if (err_cause) { + dev_dbg(hdev->dev, + "Clearing DMA0 engine from errors (cause 0x%x)\n", + err_cause); + WREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset, err_cause); + } + + pos = 0; + size_left = size; + size_to_dma = SZ_2M; + + while (size_left > 0) { + + if (size_left < SZ_2M) + size_to_dma = size_left; + + rc = gaudi_dma_core_transfer(hdev, dma_id, addr, size_to_dma, + dma_addr); + if (rc) + break; + + memcpy(blob_addr + pos, kernel_addr, size_to_dma); + + if (size_left <= SZ_2M) + break; + + pos += SZ_2M; + addr += SZ_2M; + size_left -= SZ_2M; + } + + /* TODO: remove this by mapping the DMA temporary buffer to the MMU + * using the compute ctx ASID, if exists. If not, use the kernel ctx + * ASID + */ + WREG32_AND(mmDMA0_CORE_PROT + dma_offset, + ~BIT(DMA0_CORE_PROT_VAL_SHIFT)); + + WREG32(mmDMA0_QM_GLBL_CFG1 + qm_offset, cfg1); + +out: + hdev->asic_funcs->hw_queues_unlock(hdev); + + hl_asic_dma_free_coherent(hdev, SZ_2M, kernel_addr, dma_addr); + + return rc; +} + +static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (hdev->reset_info.hard_reset_pending) + return U64_MAX; + + return readq(hdev->pcie_bar[HBM_BAR_ID] + + (addr - gaudi->hbm_bar_cur_addr)); +} + +static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (hdev->reset_info.hard_reset_pending) + return; + + writeq(val, hdev->pcie_bar[HBM_BAR_ID] + + (addr - gaudi->hbm_bar_cur_addr)); +} + +void gaudi_mmu_prepare_reg(struct hl_device *hdev, u64 reg, u32 asid) +{ + /* mask to zero the MMBP and ASID bits */ + WREG32_AND(reg, ~0x7FF); + WREG32_OR(reg, asid); +} + +static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) + return; + + if (asid & ~DMA0_QM_GLBL_NON_SECURE_PROPS_0_ASID_MASK) { + dev_crit(hdev->dev, "asid %u is too big\n", asid); + return; + } + + gaudi_mmu_prepare_reg(hdev, mmDMA0_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA0_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA0_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA0_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA0_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA1_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA1_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA1_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA1_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA1_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA2_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA2_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA2_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA2_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA2_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA3_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA3_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA3_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA3_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA3_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA4_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA4_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA4_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA4_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA4_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA5_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA5_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA5_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA5_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA5_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA6_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA6_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA6_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA6_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA6_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA7_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA7_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA7_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA7_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA7_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmDMA0_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA1_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA2_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA3_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA4_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA5_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA6_CORE_NON_SECURE_PROPS, asid); + gaudi_mmu_prepare_reg(hdev, mmDMA7_CORE_NON_SECURE_PROPS, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC0_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC0_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC0_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC0_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC0_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC0_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC0_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC1_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC1_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC1_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC1_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC1_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC1_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC1_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC2_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC2_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC2_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC2_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC2_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC2_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC2_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC3_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC3_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC3_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC3_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC3_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC3_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC3_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC4_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC4_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC4_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC4_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC4_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC4_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC4_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC5_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC5_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC5_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC5_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC5_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC5_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC5_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC6_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC6_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC6_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC6_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC6_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC6_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC6_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmTPC7_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC7_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC7_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC7_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC7_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC7_CFG_ARUSER_LO, asid); + gaudi_mmu_prepare_reg(hdev, mmTPC7_CFG_AWUSER_LO, asid); + + gaudi_mmu_prepare_reg(hdev, mmMME0_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmMME0_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmMME0_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmMME0_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmMME0_QM_GLBL_NON_SECURE_PROPS_4, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_QM_GLBL_NON_SECURE_PROPS_0, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_QM_GLBL_NON_SECURE_PROPS_1, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_QM_GLBL_NON_SECURE_PROPS_2, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_QM_GLBL_NON_SECURE_PROPS_3, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_QM_GLBL_NON_SECURE_PROPS_4, asid); + + gaudi_mmu_prepare_reg(hdev, mmMME0_SBAB_ARUSER0, asid); + gaudi_mmu_prepare_reg(hdev, mmMME0_SBAB_ARUSER1, asid); + gaudi_mmu_prepare_reg(hdev, mmMME1_SBAB_ARUSER0, asid); + gaudi_mmu_prepare_reg(hdev, mmMME1_SBAB_ARUSER1, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_SBAB_ARUSER0, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_SBAB_ARUSER1, asid); + gaudi_mmu_prepare_reg(hdev, mmMME3_SBAB_ARUSER0, asid); + gaudi_mmu_prepare_reg(hdev, mmMME3_SBAB_ARUSER1, asid); + gaudi_mmu_prepare_reg(hdev, mmMME0_ACC_WBC, asid); + gaudi_mmu_prepare_reg(hdev, mmMME1_ACC_WBC, asid); + gaudi_mmu_prepare_reg(hdev, mmMME2_ACC_WBC, asid); + gaudi_mmu_prepare_reg(hdev, mmMME3_ACC_WBC, asid); + + if (gaudi->hw_cap_initialized & HW_CAP_NIC0) { + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM0_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC1) { + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC0_QM1_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC2) { + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM0_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC3) { + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC1_QM1_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC4) { + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM0_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC5) { + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC2_QM1_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC6) { + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM0_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC7) { + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC3_QM1_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC8) { + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM0_GLBL_NON_SECURE_PROPS_4, + asid); + } + + if (gaudi->hw_cap_initialized & HW_CAP_NIC9) { + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_0, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_1, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_2, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_3, + asid); + gaudi_mmu_prepare_reg(hdev, mmNIC4_QM1_GLBL_NON_SECURE_PROPS_4, + asid); + } + + gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid); + gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid); +} + +static int gaudi_send_job_on_qman0(struct hl_device *hdev, + struct hl_cs_job *job) +{ + struct packet_msg_prot *fence_pkt; + u32 *fence_ptr; + dma_addr_t fence_dma_addr; + struct hl_cb *cb; + u32 tmp, timeout, dma_offset; + int rc; + + if (hdev->pldm) + timeout = GAUDI_PLDM_QMAN0_TIMEOUT_USEC; + else + timeout = HL_DEVICE_TIMEOUT_USEC; + + if (!hdev->asic_funcs->is_device_idle(hdev, NULL, 0, NULL)) { + dev_err_ratelimited(hdev->dev, + "Can't send driver job on QMAN0 because the device is not idle\n"); + return -EBUSY; + } + + fence_ptr = hl_asic_dma_pool_zalloc(hdev, 4, GFP_KERNEL, &fence_dma_addr); + if (!fence_ptr) { + dev_err(hdev->dev, + "Failed to allocate fence memory for QMAN0\n"); + return -ENOMEM; + } + + cb = job->patched_cb; + + fence_pkt = cb->kernel_address + + job->job_cb_size - sizeof(struct packet_msg_prot); + + tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT); + tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1); + tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + fence_pkt->ctl = cpu_to_le32(tmp); + fence_pkt->value = cpu_to_le32(GAUDI_QMAN0_FENCE_VAL); + fence_pkt->addr = cpu_to_le64(fence_dma_addr); + + dma_offset = gaudi_dma_assignment[GAUDI_PCI_DMA_1] * DMA_CORE_OFFSET; + + WREG32(mmDMA0_CORE_PROT + dma_offset, + BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT) | BIT(DMA0_CORE_PROT_VAL_SHIFT)); + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, GAUDI_QUEUE_ID_DMA_0_0, + job->job_cb_size, cb->bus_address); + if (rc) { + dev_err(hdev->dev, "Failed to send CB on QMAN0, %d\n", rc); + goto free_fence_ptr; + } + + rc = hl_poll_timeout_memory(hdev, fence_ptr, tmp, + (tmp == GAUDI_QMAN0_FENCE_VAL), 1000, + timeout, true); + + hl_hw_queue_inc_ci_kernel(hdev, GAUDI_QUEUE_ID_DMA_0_0); + + if (rc == -ETIMEDOUT) { + dev_err(hdev->dev, "QMAN0 Job timeout (0x%x)\n", tmp); + goto free_fence_ptr; + } + +free_fence_ptr: + WREG32(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT)); + + hl_asic_dma_pool_free(hdev, (void *) fence_ptr, fence_dma_addr); + return rc; +} + +static void gaudi_get_event_desc(u16 event_type, char *desc, size_t size) +{ + if (event_type >= GAUDI_EVENT_SIZE) + goto event_not_supported; + + if (!gaudi_irq_map_table[event_type].valid) + goto event_not_supported; + + snprintf(desc, size, gaudi_irq_map_table[event_type].name); + + return; + +event_not_supported: + snprintf(desc, size, "N/A"); +} + +static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y, + bool is_write, s32 *engine_id_1, + s32 *engine_id_2) +{ + u32 dma_id[2], dma_offset, err_cause[2], mask, i; + + mask = is_write ? DMA0_CORE_ERR_CAUSE_HBW_WR_ERR_MASK : + DMA0_CORE_ERR_CAUSE_HBW_RD_ERR_MASK; + + switch (x_y) { + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1: + dma_id[0] = 0; + dma_id[1] = 2; + break; + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1: + dma_id[0] = 1; + dma_id[1] = 3; + break; + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: + dma_id[0] = 4; + dma_id[1] = 6; + break; + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: + dma_id[0] = 5; + dma_id[1] = 7; + break; + default: + goto unknown_initiator; + } + + for (i = 0 ; i < 2 ; i++) { + dma_offset = dma_id[i] * DMA_CORE_OFFSET; + err_cause[i] = RREG32(mmDMA0_CORE_ERR_CAUSE + dma_offset); + } + + switch (x_y) { + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1: + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_0; + return "DMA0"; + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_2; + return "DMA2"; + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_0; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_2; + return "DMA0 or DMA2"; + } + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1: + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_1; + return "DMA1"; + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_3; + return "DMA3"; + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_1; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_3; + return "DMA1 or DMA3"; + } + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_4; + return "DMA4"; + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_6; + return "DMA6"; + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_4; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_6; + return "DMA4 or DMA6"; + } + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_5; + return "DMA5"; + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_7; + return "DMA7"; + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_5; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_7; + return "DMA5 or DMA7"; + } + } + +unknown_initiator: + return "unknown initiator"; +} + +static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write, + u32 *engine_id_1, u32 *engine_id_2) +{ + u32 val, x_y, axi_id; + + val = is_write ? RREG32(mmMMU_UP_RAZWI_WRITE_ID) : + RREG32(mmMMU_UP_RAZWI_READ_ID); + x_y = val & ((RAZWI_INITIATOR_Y_MASK << RAZWI_INITIATOR_Y_SHIFT) | + (RAZWI_INITIATOR_X_MASK << RAZWI_INITIATOR_X_SHIFT)); + axi_id = val & (RAZWI_INITIATOR_AXI_ID_MASK << + RAZWI_INITIATOR_AXI_ID_SHIFT); + + switch (x_y) { + case RAZWI_INITIATOR_ID_X_Y_TPC0_NIC0: + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_0; + return "TPC0"; + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_0; + return "NIC0"; + } + break; + case RAZWI_INITIATOR_ID_X_Y_TPC1: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_1; + return "TPC1"; + case RAZWI_INITIATOR_ID_X_Y_MME0_0: + case RAZWI_INITIATOR_ID_X_Y_MME0_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_0; + return "MME0"; + case RAZWI_INITIATOR_ID_X_Y_MME1_0: + case RAZWI_INITIATOR_ID_X_Y_MME1_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_1; + return "MME1"; + case RAZWI_INITIATOR_ID_X_Y_TPC2: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_2; + return "TPC2"; + case RAZWI_INITIATOR_ID_X_Y_TPC3_PCI_CPU_PSOC: + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_3; + return "TPC3"; + } + /* PCI, CPU or PSOC does not have engine id*/ + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PCI)) + return "PCI"; + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_CPU)) + return "CPU"; + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PSOC)) + return "PSOC"; + break; + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: + case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: + return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write, + engine_id_1, engine_id_2); + case RAZWI_INITIATOR_ID_X_Y_TPC4_NIC1_NIC2: + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_4; + return "TPC4"; + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_1; + return "NIC1"; + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_2; + return "NIC2"; + } + break; + case RAZWI_INITIATOR_ID_X_Y_TPC5: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_5; + return "TPC5"; + case RAZWI_INITIATOR_ID_X_Y_MME2_0: + case RAZWI_INITIATOR_ID_X_Y_MME2_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_2; + return "MME2"; + case RAZWI_INITIATOR_ID_X_Y_MME3_0: + case RAZWI_INITIATOR_ID_X_Y_MME3_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_3; + return "MME3"; + case RAZWI_INITIATOR_ID_X_Y_TPC6: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_6; + return "TPC6"; + case RAZWI_INITIATOR_ID_X_Y_TPC7_NIC4_NIC5: + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_7; + return "TPC7"; + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_4; + return "NIC4"; + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_5; + return "NIC5"; + } + break; + default: + break; + } + + dev_err(hdev->dev, + "Unknown RAZWI initiator ID 0x%x [Y=%d, X=%d, AXI_ID=%d]\n", + val, + (val >> RAZWI_INITIATOR_Y_SHIFT) & RAZWI_INITIATOR_Y_MASK, + (val >> RAZWI_INITIATOR_X_SHIFT) & RAZWI_INITIATOR_X_MASK, + (val >> RAZWI_INITIATOR_AXI_ID_SHIFT) & + RAZWI_INITIATOR_AXI_ID_MASK); + + return "unknown initiator"; +} + +static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1, + u32 *engine_id_2) +{ + + if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) { + dev_err_ratelimited(hdev->dev, + "RAZWI event caused by illegal write of %s\n", + gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2)); + WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0); + } + + if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) { + dev_err_ratelimited(hdev->dev, + "RAZWI event caused by illegal read of %s\n", + gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2)); + WREG32(mmMMU_UP_RAZWI_READ_VLD, 0); + } +} + +static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + u32 val; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) + return; + + val = RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE); + if (val & MMU_UP_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) { + *addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK; + *addr <<= 32; + *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); + + dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); + *type = HL_RAZWI_PAGE_FAULT; + + WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); + } + + val = RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE); + if (val & MMU_UP_ACCESS_ERROR_CAPTURE_ENTRY_VALID_MASK) { + *addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK; + *addr <<= 32; + *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); + + dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr); + *type = HL_RAZWI_MMU_ACCESS_ERROR; + + WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0); + } +} + +/* + * +-------------------+------------------------------------------------------+ + * | Configuration Reg | Description | + * | Address | | + * +-------------------+------------------------------------------------------+ + * | 0xF30 - 0xF3F |ECC single error indication (1 bit per memory wrapper)| + * | |0xF30 memory wrappers 31:0 (MSB to LSB) | + * | |0xF34 memory wrappers 63:32 | + * | |0xF38 memory wrappers 95:64 | + * | |0xF3C memory wrappers 127:96 | + * +-------------------+------------------------------------------------------+ + * | 0xF40 - 0xF4F |ECC double error indication (1 bit per memory wrapper)| + * | |0xF40 memory wrappers 31:0 (MSB to LSB) | + * | |0xF44 memory wrappers 63:32 | + * | |0xF48 memory wrappers 95:64 | + * | |0xF4C memory wrappers 127:96 | + * +-------------------+------------------------------------------------------+ + */ +static int gaudi_extract_ecc_info(struct hl_device *hdev, + struct ecc_info_extract_params *params, u64 *ecc_address, + u64 *ecc_syndrom, u8 *memory_wrapper_idx) +{ + u32 i, num_mem_regs, reg, err_bit; + u64 err_addr, err_word = 0; + + num_mem_regs = params->num_memories / 32 + + ((params->num_memories % 32) ? 1 : 0); + + if (params->block_address >= CFG_BASE) + params->block_address -= CFG_BASE; + + if (params->derr) + err_addr = params->block_address + GAUDI_ECC_DERR0_OFFSET; + else + err_addr = params->block_address + GAUDI_ECC_SERR0_OFFSET; + + /* Set invalid wrapper index */ + *memory_wrapper_idx = 0xFF; + + /* Iterate through memory wrappers, a single bit must be set */ + for (i = 0 ; i < num_mem_regs ; i++) { + err_addr += i * 4; + err_word = RREG32(err_addr); + if (err_word) { + err_bit = __ffs(err_word); + *memory_wrapper_idx = err_bit + (32 * i); + break; + } + } + + if (*memory_wrapper_idx == 0xFF) { + dev_err(hdev->dev, "ECC error information cannot be found\n"); + return -EINVAL; + } + + WREG32(params->block_address + GAUDI_ECC_MEM_SEL_OFFSET, + *memory_wrapper_idx); + + *ecc_address = + RREG32(params->block_address + GAUDI_ECC_ADDRESS_OFFSET); + *ecc_syndrom = + RREG32(params->block_address + GAUDI_ECC_SYNDROME_OFFSET); + + /* Clear error indication */ + reg = RREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET); + if (params->derr) + reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_DERR_MASK, 1); + else + reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_SERR_MASK, 1); + + WREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET, reg); + + return 0; +} + +/* + * gaudi_queue_idx_dec - decrement queue index (pi/ci) and handle wrap + * + * @idx: the current pi/ci value + * @q_len: the queue length (power of 2) + * + * @return the cyclically decremented index + */ +static inline u32 gaudi_queue_idx_dec(u32 idx, u32 q_len) +{ + u32 mask = q_len - 1; + + /* + * modular decrement is equivalent to adding (queue_size -1) + * later we take LSBs to make sure the value is in the + * range [0, queue_len - 1] + */ + return (idx + q_len - 1) & mask; +} + +/** + * gaudi_handle_sw_config_stream_data - print SW config stream data + * + * @hdev: pointer to the habanalabs device structure + * @stream: the QMAN's stream + * @qman_base: base address of QMAN registers block + * @event_mask: mask of the last events occurred + */ +static void gaudi_handle_sw_config_stream_data(struct hl_device *hdev, u32 stream, + u64 qman_base, u64 event_mask) +{ + u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr; + u32 cq_ptr_lo_off, size; + + cq_ptr_lo_off = mmTPC0_QM_CQ_PTR_LO_1 - mmTPC0_QM_CQ_PTR_LO_0; + + cq_ptr_lo = qman_base + (mmTPC0_QM_CQ_PTR_LO_0 - mmTPC0_QM_BASE) + + stream * cq_ptr_lo_off; + cq_ptr_hi = cq_ptr_lo + + (mmTPC0_QM_CQ_PTR_HI_0 - mmTPC0_QM_CQ_PTR_LO_0); + cq_tsize = cq_ptr_lo + + (mmTPC0_QM_CQ_TSIZE_0 - mmTPC0_QM_CQ_PTR_LO_0); + + cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo); + size = RREG32(cq_tsize); + dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n", + stream, cq_ptr, size); + + if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { + hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr; + hdev->captured_err_info.undef_opcode.cq_size = size; + hdev->captured_err_info.undef_opcode.stream_id = stream; + } +} + +/** + * gaudi_handle_last_pqes_on_err - print last PQEs on error + * + * @hdev: pointer to the habanalabs device structure + * @qid_base: first QID of the QMAN (out of 4 streams) + * @stream: the QMAN's stream + * @qman_base: base address of QMAN registers block + * @event_mask: mask of the last events occurred + * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE) + */ +static void gaudi_handle_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, + u32 stream, u64 qman_base, + u64 event_mask, + bool pr_sw_conf) +{ + u32 ci, qm_ci_stream_off, queue_len; + struct hl_hw_queue *q; + u64 pq_ci, addr[PQ_FETCHER_CACHE_SIZE]; + int i; + + q = &hdev->kernel_queues[qid_base + stream]; + + qm_ci_stream_off = mmTPC0_QM_PQ_CI_1 - mmTPC0_QM_PQ_CI_0; + pq_ci = qman_base + (mmTPC0_QM_PQ_CI_0 - mmTPC0_QM_BASE) + + stream * qm_ci_stream_off; + + queue_len = (q->queue_type == QUEUE_TYPE_INT) ? + q->int_queue_len : HL_QUEUE_LENGTH; + + hdev->asic_funcs->hw_queues_lock(hdev); + + if (pr_sw_conf) + gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask); + + ci = RREG32(pq_ci); + + /* we should start printing form ci -1 */ + ci = gaudi_queue_idx_dec(ci, queue_len); + memset(addr, 0, sizeof(addr)); + + for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) { + struct hl_bd *bd; + u32 len; + + bd = q->kernel_address; + bd += ci; + + len = le32_to_cpu(bd->len); + /* len 0 means uninitialized entry- break */ + if (!len) + break; + + addr[i] = le64_to_cpu(bd->ptr); + + dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n", + stream, ci, addr[i], len); + + /* get previous ci, wrap if needed */ + ci = gaudi_queue_idx_dec(ci, queue_len); + } + + if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { + struct undefined_opcode_info *undef_opcode = &hdev->captured_err_info.undef_opcode; + u32 arr_idx = undef_opcode->cb_addr_streams_len; + + if (arr_idx == 0) { + undef_opcode->timestamp = ktime_get(); + undef_opcode->engine_id = gaudi_queue_id_to_engine_id[qid_base]; + } + + memcpy(undef_opcode->cb_addr_streams[arr_idx], addr, sizeof(addr)); + undef_opcode->cb_addr_streams_len++; + } + + hdev->asic_funcs->hw_queues_unlock(hdev); +} + +/** + * handle_qman_data_on_err - extract QMAN data on error + * + * @hdev: pointer to the habanalabs device structure + * @qid_base: first QID of the QMAN (out of 4 streams) + * @stream: the QMAN's stream + * @qman_base: base address of QMAN registers block + * @event_mask: mask of the last events occurred + * + * This function attempt to exatract as much data as possible on QMAN error. + * On upper CP print the SW config stream data and last 8 PQEs. + * On lower CP print SW config data and last PQEs of ALL 4 upper CPs + */ +static void handle_qman_data_on_err(struct hl_device *hdev, u32 qid_base, + u32 stream, u64 qman_base, u64 event_mask) +{ + u32 i; + + if (stream != QMAN_STREAMS) { + gaudi_handle_last_pqes_on_err(hdev, qid_base, stream, + qman_base, event_mask, true); + return; + } + + /* handle Lower-CP */ + gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask); + + for (i = 0; i < QMAN_STREAMS; i++) + gaudi_handle_last_pqes_on_err(hdev, qid_base, i, + qman_base, event_mask, false); +} + +static void gaudi_handle_qman_err_generic(struct hl_device *hdev, + const char *qm_name, + u64 qman_base, + u32 qid_base, + u64 *event_mask) +{ + u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val; + u64 glbl_sts_addr, arb_err_addr; + char reg_desc[32]; + + glbl_sts_addr = qman_base + (mmTPC0_QM_GLBL_STS1_0 - mmTPC0_QM_BASE); + arb_err_addr = qman_base + (mmTPC0_QM_ARB_ERR_CAUSE - mmTPC0_QM_BASE); + + /* Iterate through all stream GLBL_STS1 registers + Lower CP */ + for (i = 0 ; i < QMAN_STREAMS + 1 ; i++) { + glbl_sts_clr_val = 0; + glbl_sts_val = RREG32(glbl_sts_addr + 4 * i); + + if (!glbl_sts_val) + continue; + + if (i == QMAN_STREAMS) + snprintf(reg_desc, ARRAY_SIZE(reg_desc), "LowerCP"); + else + snprintf(reg_desc, ARRAY_SIZE(reg_desc), "stream%u", i); + + for (j = 0 ; j < GAUDI_NUM_OF_QM_ERR_CAUSE ; j++) { + if (glbl_sts_val & BIT(j)) { + dev_err_ratelimited(hdev->dev, + "%s %s. err cause: %s\n", + qm_name, reg_desc, + gaudi_qman_error_cause[j]); + glbl_sts_clr_val |= BIT(j); + } + } + /* check for undefined opcode */ + if (glbl_sts_val & TPC0_QM_GLBL_STS1_CP_UNDEF_CMD_ERR_MASK && + hdev->captured_err_info.undef_opcode.write_enable) { + memset(&hdev->captured_err_info.undef_opcode, 0, + sizeof(hdev->captured_err_info.undef_opcode)); + + hdev->captured_err_info.undef_opcode.write_enable = false; + *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; + } + + /* Write 1 clear errors */ + if (!hdev->stop_on_err) + WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val); + else + handle_qman_data_on_err(hdev, qid_base, i, qman_base, *event_mask); + } + + arb_err_val = RREG32(arb_err_addr); + + if (!arb_err_val) + return; + + for (j = 0 ; j < GAUDI_NUM_OF_QM_ARB_ERR_CAUSE ; j++) { + if (arb_err_val & BIT(j)) { + dev_err_ratelimited(hdev->dev, + "%s ARB_ERR. err cause: %s\n", + qm_name, + gaudi_qman_arb_error_cause[j]); + } + } +} + +static void gaudi_print_sm_sei_info(struct hl_device *hdev, u16 event_type, + struct hl_eq_sm_sei_data *sei_data) +{ + u32 index = event_type - GAUDI_EVENT_DMA_IF_SEI_0; + + /* Flip the bits as the enum is ordered in the opposite way */ + index = (index ^ 0x3) & 0x3; + + switch (sei_data->sei_cause) { + case SM_SEI_SO_OVERFLOW: + dev_err_ratelimited(hdev->dev, + "%s SEI Error: SOB Group %u overflow/underflow", + gaudi_sync_manager_names[index], + le32_to_cpu(sei_data->sei_log)); + break; + case SM_SEI_LBW_4B_UNALIGNED: + dev_err_ratelimited(hdev->dev, + "%s SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x", + gaudi_sync_manager_names[index], + le32_to_cpu(sei_data->sei_log)); + break; + case SM_SEI_AXI_RESPONSE_ERR: + dev_err_ratelimited(hdev->dev, + "%s SEI Error: AXI ID %u response error", + gaudi_sync_manager_names[index], + le32_to_cpu(sei_data->sei_log)); + break; + default: + dev_err_ratelimited(hdev->dev, "Unknown SM SEI cause %u", + le32_to_cpu(sei_data->sei_log)); + break; + } +} + +static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type, + struct hl_eq_ecc_data *ecc_data) +{ + struct ecc_info_extract_params params; + u64 ecc_address = 0, ecc_syndrom = 0; + u8 index, memory_wrapper_idx = 0; + bool extract_info_from_fw; + int rc; + + if (hdev->asic_prop.fw_security_enabled) { + extract_info_from_fw = true; + goto extract_ecc_info; + } + + switch (event_type) { + case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR: + case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR: + extract_info_from_fw = true; + break; + case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR: + index = event_type - GAUDI_EVENT_TPC0_SERR; + params.block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; + params.num_memories = 90; + params.derr = false; + extract_info_from_fw = false; + break; + case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR: + index = event_type - GAUDI_EVENT_TPC0_DERR; + params.block_address = + mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET; + params.num_memories = 90; + params.derr = true; + extract_info_from_fw = false; + break; + case GAUDI_EVENT_MME0_ACC_SERR: + case GAUDI_EVENT_MME1_ACC_SERR: + case GAUDI_EVENT_MME2_ACC_SERR: + case GAUDI_EVENT_MME3_ACC_SERR: + index = (event_type - GAUDI_EVENT_MME0_ACC_SERR) / 4; + params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; + params.num_memories = 128; + params.derr = false; + extract_info_from_fw = false; + break; + case GAUDI_EVENT_MME0_ACC_DERR: + case GAUDI_EVENT_MME1_ACC_DERR: + case GAUDI_EVENT_MME2_ACC_DERR: + case GAUDI_EVENT_MME3_ACC_DERR: + index = (event_type - GAUDI_EVENT_MME0_ACC_DERR) / 4; + params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET; + params.num_memories = 128; + params.derr = true; + extract_info_from_fw = false; + break; + case GAUDI_EVENT_MME0_SBAB_SERR: + case GAUDI_EVENT_MME1_SBAB_SERR: + case GAUDI_EVENT_MME2_SBAB_SERR: + case GAUDI_EVENT_MME3_SBAB_SERR: + index = (event_type - GAUDI_EVENT_MME0_SBAB_SERR) / 4; + params.block_address = + mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; + params.num_memories = 33; + params.derr = false; + extract_info_from_fw = false; + break; + case GAUDI_EVENT_MME0_SBAB_DERR: + case GAUDI_EVENT_MME1_SBAB_DERR: + case GAUDI_EVENT_MME2_SBAB_DERR: + case GAUDI_EVENT_MME3_SBAB_DERR: + index = (event_type - GAUDI_EVENT_MME0_SBAB_DERR) / 4; + params.block_address = + mmMME0_SBAB_BASE + index * MME_ACC_OFFSET; + params.num_memories = 33; + params.derr = true; + extract_info_from_fw = false; + break; + default: + return; + } + +extract_ecc_info: + if (extract_info_from_fw) { + ecc_address = le64_to_cpu(ecc_data->ecc_address); + ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom); + memory_wrapper_idx = ecc_data->memory_wrapper_idx; + } else { + rc = gaudi_extract_ecc_info(hdev, ¶ms, &ecc_address, + &ecc_syndrom, &memory_wrapper_idx); + if (rc) + return; + } + + dev_err(hdev->dev, + "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n", + ecc_address, ecc_syndrom, memory_wrapper_idx); +} + +static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) +{ + u64 qman_base; + char desc[32]; + u32 qid_base; + u8 index; + + switch (event_type) { + case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: + index = event_type - GAUDI_EVENT_TPC0_QM; + qid_base = GAUDI_QUEUE_ID_TPC_0_0 + index * QMAN_STREAMS; + qman_base = mmTPC0_QM_BASE + index * TPC_QMAN_OFFSET; + snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC_QM", index); + break; + case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM: + if (event_type == GAUDI_EVENT_MME0_QM) { + index = 0; + qid_base = GAUDI_QUEUE_ID_MME_0_0; + } else { /* event_type == GAUDI_EVENT_MME2_QM */ + index = 2; + qid_base = GAUDI_QUEUE_ID_MME_1_0; + } + qman_base = mmMME0_QM_BASE + index * MME_QMAN_OFFSET; + snprintf(desc, ARRAY_SIZE(desc), "%s%d", "MME_QM", index); + break; + case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM: + index = event_type - GAUDI_EVENT_DMA0_QM; + qid_base = GAUDI_QUEUE_ID_DMA_0_0 + index * QMAN_STREAMS; + /* skip GAUDI_QUEUE_ID_CPU_PQ if necessary */ + if (index > 1) + qid_base++; + qman_base = mmDMA0_QM_BASE + index * DMA_QMAN_OFFSET; + snprintf(desc, ARRAY_SIZE(desc), "%s%d", "DMA_QM", index); + break; + case GAUDI_EVENT_NIC0_QM0: + qid_base = GAUDI_QUEUE_ID_NIC_0_0; + qman_base = mmNIC0_QM0_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC0_QM0"); + break; + case GAUDI_EVENT_NIC0_QM1: + qid_base = GAUDI_QUEUE_ID_NIC_1_0; + qman_base = mmNIC0_QM1_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC0_QM1"); + break; + case GAUDI_EVENT_NIC1_QM0: + qid_base = GAUDI_QUEUE_ID_NIC_2_0; + qman_base = mmNIC1_QM0_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC1_QM0"); + break; + case GAUDI_EVENT_NIC1_QM1: + qid_base = GAUDI_QUEUE_ID_NIC_3_0; + qman_base = mmNIC1_QM1_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC1_QM1"); + break; + case GAUDI_EVENT_NIC2_QM0: + qid_base = GAUDI_QUEUE_ID_NIC_4_0; + qman_base = mmNIC2_QM0_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC2_QM0"); + break; + case GAUDI_EVENT_NIC2_QM1: + qid_base = GAUDI_QUEUE_ID_NIC_5_0; + qman_base = mmNIC2_QM1_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC2_QM1"); + break; + case GAUDI_EVENT_NIC3_QM0: + qid_base = GAUDI_QUEUE_ID_NIC_6_0; + qman_base = mmNIC3_QM0_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC3_QM0"); + break; + case GAUDI_EVENT_NIC3_QM1: + qid_base = GAUDI_QUEUE_ID_NIC_7_0; + qman_base = mmNIC3_QM1_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC3_QM1"); + break; + case GAUDI_EVENT_NIC4_QM0: + qid_base = GAUDI_QUEUE_ID_NIC_8_0; + qman_base = mmNIC4_QM0_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC4_QM0"); + break; + case GAUDI_EVENT_NIC4_QM1: + qid_base = GAUDI_QUEUE_ID_NIC_9_0; + qman_base = mmNIC4_QM1_BASE; + snprintf(desc, ARRAY_SIZE(desc), "NIC4_QM1"); + break; + default: + return; + } + + gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base, event_mask); +} + +static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, + bool razwi) +{ + u32 engine_id_1, engine_id_2; + char desc[64] = ""; + u64 razwi_addr = 0; + u8 razwi_type; + int rc; + + /* + * Init engine id by default as not valid and only if razwi initiated from engine with + * engine id it will get valid value. + * Init razwi type to default, will be changed only if razwi caused by page fault of + * MMU access error + */ + engine_id_1 = U16_MAX; + engine_id_2 = U16_MAX; + razwi_type = U8_MAX; + + gaudi_get_event_desc(event_type, desc, sizeof(desc)); + dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", + event_type, desc); + + if (razwi) { + gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type); + + /* In case it's the first razwi, save its parameters*/ + rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0); + if (rc) { + hdev->captured_err_info.razwi.timestamp = ktime_get(); + hdev->captured_err_info.razwi.addr = razwi_addr; + hdev->captured_err_info.razwi.engine_id_1 = engine_id_1; + hdev->captured_err_info.razwi.engine_id_2 = engine_id_2; + /* + * If first engine id holds non valid value the razwi initiator + * does not have engine id + */ + hdev->captured_err_info.razwi.non_engine_initiator = + (engine_id_1 == U16_MAX); + hdev->captured_err_info.razwi.type = razwi_type; + + } + } +} + +static void gaudi_print_out_of_sync_info(struct hl_device *hdev, + struct cpucp_pkt_sync_err *sync_err) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ]; + + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", + sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); +} + +static void gaudi_print_fw_alive_info(struct hl_device *hdev, + struct hl_eq_fw_alive *fw_alive) +{ + dev_err(hdev->dev, + "FW alive report: severity=%s, process_id=%u, thread_id=%u, uptime=%llu seconds\n", + (fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ? + "Minor" : "Critical", fw_alive->process_id, + fw_alive->thread_id, fw_alive->uptime_seconds); +} + +static void gaudi_print_nic_axi_irq_info(struct hl_device *hdev, u16 event_type, + void *data) +{ + char desc[64] = "", *type; + struct eq_nic_sei_event *eq_nic_sei = data; + u16 nic_id = event_type - GAUDI_EVENT_NIC_SEI_0; + + switch (eq_nic_sei->axi_error_cause) { + case RXB: + type = "RXB"; + break; + case RXE: + type = "RXE"; + break; + case TXS: + type = "TXS"; + break; + case TXE: + type = "TXE"; + break; + case QPC_RESP: + type = "QPC_RESP"; + break; + case NON_AXI_ERR: + type = "NON_AXI_ERR"; + break; + case TMR: + type = "TMR"; + break; + default: + dev_err(hdev->dev, "unknown NIC AXI cause %d\n", + eq_nic_sei->axi_error_cause); + type = "N/A"; + break; + } + + snprintf(desc, sizeof(desc), "NIC%d_%s%d", nic_id, type, + eq_nic_sei->id); + dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", + event_type, desc); +} + +static int gaudi_compute_reset_late_init(struct hl_device *hdev) +{ + /* GAUDI doesn't support any reset except hard-reset */ + return -EPERM; +} + +static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device, + struct hl_eq_hbm_ecc_data *hbm_ecc_data) +{ + u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch; + int rc = 0; + + if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_HBM_ECC_EN) { + if (!hbm_ecc_data) { + dev_err(hdev->dev, "No FW ECC data"); + return 0; + } + + wr_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + rd_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + ca_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + derr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_DERR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + serr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_SERR_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + type = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + ch = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK, + le32_to_cpu(hbm_ecc_data->hbm_ecc_info)); + + dev_err(hdev->dev, + "HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n", + device, ch, wr_par, rd_par, ca_par, serr, derr); + dev_err(hdev->dev, + "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%u, SEC_CNT=%d, DEC_CNT=%d\n", + device, ch, hbm_ecc_data->first_addr, type, + hbm_ecc_data->sec_cont_cnt, hbm_ecc_data->sec_cnt, + hbm_ecc_data->dec_cnt); + return 0; + } + + if (hdev->asic_prop.fw_security_enabled) { + dev_info(hdev->dev, "Cannot access MC regs for ECC data while security is enabled\n"); + return 0; + } + + base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET; + for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) { + val = RREG32_MASK(base + ch * 0x1000 + 0x06C, 0x0000FFFF); + val = (val & 0xFF) | ((val >> 8) & 0xFF); + if (val) { + rc = -EIO; + dev_err(hdev->dev, + "HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n", + device, ch * 2, val & 0x1, (val >> 1) & 0x1, + (val >> 2) & 0x1, (val >> 3) & 0x1, + (val >> 4) & 0x1); + + val2 = RREG32(base + ch * 0x1000 + 0x060); + dev_err(hdev->dev, + "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n", + device, ch * 2, + RREG32(base + ch * 0x1000 + 0x064), + (val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10, + (val2 & 0xFF0000) >> 16, + (val2 & 0xFF000000) >> 24); + } + + val = RREG32_MASK(base + ch * 0x1000 + 0x07C, 0x0000FFFF); + val = (val & 0xFF) | ((val >> 8) & 0xFF); + if (val) { + rc = -EIO; + dev_err(hdev->dev, + "HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n", + device, ch * 2 + 1, val & 0x1, (val >> 1) & 0x1, + (val >> 2) & 0x1, (val >> 3) & 0x1, + (val >> 4) & 0x1); + + val2 = RREG32(base + ch * 0x1000 + 0x070); + dev_err(hdev->dev, + "HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n", + device, ch * 2 + 1, + RREG32(base + ch * 0x1000 + 0x074), + (val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10, + (val2 & 0xFF0000) >> 16, + (val2 & 0xFF000000) >> 24); + } + + /* Clear interrupts */ + RMWREG32(base + (ch * 0x1000) + 0x060, 0x1C8, 0x1FF); + RMWREG32(base + (ch * 0x1000) + 0x070, 0x1C8, 0x1FF); + WREG32(base + (ch * 0x1000) + 0x06C, 0x1F1F); + WREG32(base + (ch * 0x1000) + 0x07C, 0x1F1F); + RMWREG32(base + (ch * 0x1000) + 0x060, 0x0, 0xF); + RMWREG32(base + (ch * 0x1000) + 0x070, 0x0, 0xF); + } + + val = RREG32(base + 0x8F30); + val2 = RREG32(base + 0x8F34); + if (val | val2) { + rc = -EIO; + dev_err(hdev->dev, + "HBM %d MC SRAM SERR info: Reg 0x8F30=0x%x, Reg 0x8F34=0x%x\n", + device, val, val2); + } + val = RREG32(base + 0x8F40); + val2 = RREG32(base + 0x8F44); + if (val | val2) { + rc = -EIO; + dev_err(hdev->dev, + "HBM %d MC SRAM DERR info: Reg 0x8F40=0x%x, Reg 0x8F44=0x%x\n", + device, val, val2); + } + + return rc; +} + +static int gaudi_hbm_event_to_dev(u16 hbm_event_type) +{ + switch (hbm_event_type) { + case GAUDI_EVENT_HBM0_SPI_0: + case GAUDI_EVENT_HBM0_SPI_1: + return 0; + case GAUDI_EVENT_HBM1_SPI_0: + case GAUDI_EVENT_HBM1_SPI_1: + return 1; + case GAUDI_EVENT_HBM2_SPI_0: + case GAUDI_EVENT_HBM2_SPI_1: + return 2; + case GAUDI_EVENT_HBM3_SPI_0: + case GAUDI_EVENT_HBM3_SPI_1: + return 3; + default: + break; + } + + /* Should never happen */ + return 0; +} + +static bool gaudi_tpc_read_interrupts(struct hl_device *hdev, u8 tpc_id, + char *interrupt_name) +{ + u32 tpc_offset = tpc_id * TPC_CFG_OFFSET, tpc_interrupts_cause, i; + bool soft_reset_required = false; + + tpc_interrupts_cause = RREG32(mmTPC0_CFG_TPC_INTR_CAUSE + tpc_offset) & + TPC0_CFG_TPC_INTR_CAUSE_CAUSE_MASK; + + for (i = 0 ; i < GAUDI_NUM_OF_TPC_INTR_CAUSE ; i++) + if (tpc_interrupts_cause & BIT(i)) { + dev_err_ratelimited(hdev->dev, + "TPC%d_%s interrupt cause: %s\n", + tpc_id, interrupt_name, + gaudi_tpc_interrupts_cause[i]); + /* If this is QM error, we need to soft-reset */ + if (i == 15) + soft_reset_required = true; + } + + /* Clear interrupts */ + WREG32(mmTPC0_CFG_TPC_INTR_CAUSE + tpc_offset, 0); + + return soft_reset_required; +} + +static int tpc_dec_event_to_tpc_id(u16 tpc_dec_event_type) +{ + return (tpc_dec_event_type - GAUDI_EVENT_TPC0_DEC) >> 1; +} + +static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type) +{ + return (tpc_dec_event_type - GAUDI_EVENT_TPC0_KRN_ERR) / 6; +} + +static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type) +{ + ktime_t zero_time = ktime_set(0, 0); + + mutex_lock(&hdev->clk_throttling.lock); + + switch (event_type) { + case GAUDI_EVENT_FIX_POWER_ENV_S: + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; + dev_info_ratelimited(hdev->dev, + "Clock throttling due to power consumption\n"); + break; + + case GAUDI_EVENT_FIX_POWER_ENV_E: + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); + dev_info_ratelimited(hdev->dev, + "Power envelop is safe, back to optimal clock\n"); + break; + + case GAUDI_EVENT_FIX_THERMAL_ENV_S: + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time; + dev_info_ratelimited(hdev->dev, + "Clock throttling due to overheating\n"); + break; + + case GAUDI_EVENT_FIX_THERMAL_ENV_E: + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get(); + dev_info_ratelimited(hdev->dev, + "Thermal envelop is safe, back to optimal clock\n"); + break; + + default: + dev_err(hdev->dev, "Received invalid clock change event %d\n", + event_type); + break; + } + + mutex_unlock(&hdev->clk_throttling.lock); +} + +static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0; + u32 ctl = le32_to_cpu(eq_entry->hdr.ctl); + u32 fw_fatal_err_flag = 0, flags = 0; + u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) + >> EQ_CTL_EVENT_TYPE_SHIFT); + bool reset_required, reset_direct = false; + u8 cause; + int rc; + + if (event_type >= GAUDI_EVENT_SIZE) { + dev_err(hdev->dev, "Event type %u exceeds maximum of %u", + event_type, GAUDI_EVENT_SIZE - 1); + return; + } + + gaudi->events_stat[event_type]++; + gaudi->events_stat_aggregate[event_type]++; + + switch (event_type) { + case GAUDI_EVENT_PCIE_CORE_DERR: + case GAUDI_EVENT_PCIE_IF_DERR: + case GAUDI_EVENT_PCIE_PHY_DERR: + case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR: + case GAUDI_EVENT_MME0_ACC_DERR: + case GAUDI_EVENT_MME0_SBAB_DERR: + case GAUDI_EVENT_MME1_ACC_DERR: + case GAUDI_EVENT_MME1_SBAB_DERR: + case GAUDI_EVENT_MME2_ACC_DERR: + case GAUDI_EVENT_MME2_SBAB_DERR: + case GAUDI_EVENT_MME3_ACC_DERR: + case GAUDI_EVENT_MME3_SBAB_DERR: + case GAUDI_EVENT_DMA0_DERR_ECC ... GAUDI_EVENT_DMA7_DERR_ECC: + fallthrough; + case GAUDI_EVENT_CPU_IF_ECC_DERR: + case GAUDI_EVENT_PSOC_MEM_DERR: + case GAUDI_EVENT_PSOC_CORESIGHT_DERR: + case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR: + case GAUDI_EVENT_NIC0_DERR ... GAUDI_EVENT_NIC4_DERR: + case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR: + case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: + case GAUDI_EVENT_MMU_DERR: + case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; + goto reset_device; + + case GAUDI_EVENT_GIC500: + case GAUDI_EVENT_AXI_ECC: + case GAUDI_EVENT_L2_RAM_ECC: + case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: + gaudi_print_irq_info(hdev, event_type, false); + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + goto reset_device; + + case GAUDI_EVENT_HBM0_SPI_0: + case GAUDI_EVENT_HBM1_SPI_0: + case GAUDI_EVENT_HBM2_SPI_0: + case GAUDI_EVENT_HBM3_SPI_0: + gaudi_print_irq_info(hdev, event_type, false); + gaudi_hbm_read_interrupts(hdev, + gaudi_hbm_event_to_dev(event_type), + &eq_entry->hbm_ecc_data); + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + goto reset_device; + + case GAUDI_EVENT_HBM0_SPI_1: + case GAUDI_EVENT_HBM1_SPI_1: + case GAUDI_EVENT_HBM2_SPI_1: + case GAUDI_EVENT_HBM3_SPI_1: + gaudi_print_irq_info(hdev, event_type, false); + gaudi_hbm_read_interrupts(hdev, + gaudi_hbm_event_to_dev(event_type), + &eq_entry->hbm_ecc_data); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + break; + + case GAUDI_EVENT_TPC0_DEC: + case GAUDI_EVENT_TPC1_DEC: + case GAUDI_EVENT_TPC2_DEC: + case GAUDI_EVENT_TPC3_DEC: + case GAUDI_EVENT_TPC4_DEC: + case GAUDI_EVENT_TPC5_DEC: + case GAUDI_EVENT_TPC6_DEC: + case GAUDI_EVENT_TPC7_DEC: + /* In TPC DEC event, notify on TPC assertion. While there isn't + * a specific event for assertion yet, the FW generates TPC DEC event. + * The SW upper layer will inspect an internal mapped area to indicate + * if the event is a TPC Assertion or a "real" TPC DEC. + */ + event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT; + gaudi_print_irq_info(hdev, event_type, true); + reset_required = gaudi_tpc_read_interrupts(hdev, + tpc_dec_event_to_tpc_id(event_type), + "AXI_SLV_DEC_Error"); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + if (reset_required) { + dev_err(hdev->dev, "reset required due to %s\n", + gaudi_irq_map_table[event_type].name); + + reset_direct = true; + goto reset_device; + } else { + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + } + break; + + case GAUDI_EVENT_TPC0_KRN_ERR: + case GAUDI_EVENT_TPC1_KRN_ERR: + case GAUDI_EVENT_TPC2_KRN_ERR: + case GAUDI_EVENT_TPC3_KRN_ERR: + case GAUDI_EVENT_TPC4_KRN_ERR: + case GAUDI_EVENT_TPC5_KRN_ERR: + case GAUDI_EVENT_TPC6_KRN_ERR: + case GAUDI_EVENT_TPC7_KRN_ERR: + gaudi_print_irq_info(hdev, event_type, true); + reset_required = gaudi_tpc_read_interrupts(hdev, + tpc_krn_event_to_tpc_id(event_type), + "KRN_ERR"); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + if (reset_required) { + dev_err(hdev->dev, "reset required due to %s\n", + gaudi_irq_map_table[event_type].name); + + reset_direct = true; + goto reset_device; + } else { + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + } + break; + + case GAUDI_EVENT_PCIE_CORE_SERR: + case GAUDI_EVENT_PCIE_IF_SERR: + case GAUDI_EVENT_PCIE_PHY_SERR: + case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR: + case GAUDI_EVENT_MME0_ACC_SERR: + case GAUDI_EVENT_MME0_SBAB_SERR: + case GAUDI_EVENT_MME1_ACC_SERR: + case GAUDI_EVENT_MME1_SBAB_SERR: + case GAUDI_EVENT_MME2_ACC_SERR: + case GAUDI_EVENT_MME2_SBAB_SERR: + case GAUDI_EVENT_MME3_ACC_SERR: + case GAUDI_EVENT_MME3_SBAB_SERR: + case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_DMA7_SERR_ECC: + case GAUDI_EVENT_CPU_IF_ECC_SERR: + case GAUDI_EVENT_PSOC_MEM_SERR: + case GAUDI_EVENT_PSOC_CORESIGHT_SERR: + case GAUDI_EVENT_SRAM0_SERR ... GAUDI_EVENT_SRAM28_SERR: + case GAUDI_EVENT_NIC0_SERR ... GAUDI_EVENT_NIC4_SERR: + case GAUDI_EVENT_DMA_IF0_SERR ... GAUDI_EVENT_DMA_IF3_SERR: + case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: + fallthrough; + case GAUDI_EVENT_MMU_SERR: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + break; + + case GAUDI_EVENT_PCIE_DEC: + case GAUDI_EVENT_CPU_AXI_SPLITTER: + case GAUDI_EVENT_PSOC_AXI_DEC: + case GAUDI_EVENT_PSOC_PRSTN_FALL: + gaudi_print_irq_info(hdev, event_type, true); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + break; + + case GAUDI_EVENT_MMU_PAGE_FAULT: + case GAUDI_EVENT_MMU_WR_PERM: + gaudi_print_irq_info(hdev, event_type, true); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + break; + + case GAUDI_EVENT_MME0_WBC_RSP: + case GAUDI_EVENT_MME0_SBAB0_RSP: + case GAUDI_EVENT_MME1_WBC_RSP: + case GAUDI_EVENT_MME1_SBAB0_RSP: + case GAUDI_EVENT_MME2_WBC_RSP: + case GAUDI_EVENT_MME2_SBAB0_RSP: + case GAUDI_EVENT_MME3_WBC_RSP: + case GAUDI_EVENT_MME3_SBAB0_RSP: + case GAUDI_EVENT_RAZWI_OR_ADC: + case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM: + case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM: + fallthrough; + case GAUDI_EVENT_NIC0_QM0: + case GAUDI_EVENT_NIC0_QM1: + case GAUDI_EVENT_NIC1_QM0: + case GAUDI_EVENT_NIC1_QM1: + case GAUDI_EVENT_NIC2_QM0: + case GAUDI_EVENT_NIC2_QM1: + case GAUDI_EVENT_NIC3_QM0: + case GAUDI_EVENT_NIC3_QM1: + case GAUDI_EVENT_NIC4_QM0: + case GAUDI_EVENT_NIC4_QM1: + case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE: + case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_qman_err(hdev, event_type, &event_mask); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET); + break; + + case GAUDI_EVENT_RAZWI_OR_ADC_SW: + gaudi_print_irq_info(hdev, event_type, true); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + goto reset_device; + + case GAUDI_EVENT_TPC0_BMON_SPMU: + case GAUDI_EVENT_TPC1_BMON_SPMU: + case GAUDI_EVENT_TPC2_BMON_SPMU: + case GAUDI_EVENT_TPC3_BMON_SPMU: + case GAUDI_EVENT_TPC4_BMON_SPMU: + case GAUDI_EVENT_TPC5_BMON_SPMU: + case GAUDI_EVENT_TPC6_BMON_SPMU: + case GAUDI_EVENT_TPC7_BMON_SPMU: + case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7: + gaudi_print_irq_info(hdev, event_type, false); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + break; + + case GAUDI_EVENT_NIC_SEI_0 ... GAUDI_EVENT_NIC_SEI_4: + gaudi_print_nic_axi_irq_info(hdev, event_type, &data); + hl_fw_unmask_irq(hdev, event_type); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + break; + + case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3: + gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_sm_sei_info(hdev, event_type, + &eq_entry->sm_sei_data); + rc = hl_state_dump(hdev); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + if (rc) + dev_err(hdev->dev, + "Error during system state dump %d\n", rc); + hl_fw_unmask_irq(hdev, event_type); + break; + + case GAUDI_EVENT_STATUS_NIC0_ENG0 ... GAUDI_EVENT_STATUS_NIC4_ENG1: + break; + + case GAUDI_EVENT_FIX_POWER_ENV_S ... GAUDI_EVENT_FIX_THERMAL_ENV_E: + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + gaudi_print_clk_change_info(hdev, event_type); + hl_fw_unmask_irq(hdev, event_type); + break; + + case GAUDI_EVENT_PSOC_GPIO_U16_0: + cause = le64_to_cpu(eq_entry->data[0]) & 0xFF; + dev_err(hdev->dev, + "Received high temp H/W interrupt %d (cause %d)\n", + event_type, cause); + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + break; + + case GAUDI_EVENT_DEV_RESET_REQ: + gaudi_print_irq_info(hdev, event_type, false); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + goto reset_device; + + case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC: + gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + goto reset_device; + + case GAUDI_EVENT_FW_ALIVE_S: + gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive); + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + goto reset_device; + + default: + dev_err(hdev->dev, "Received invalid H/W interrupt %d\n", + event_type); + break; + } + + if (event_mask) + hl_notifier_event_send_all(hdev, event_mask); + + return; + +reset_device: + reset_required = true; + + if (hdev->asic_prop.fw_security_enabled && !reset_direct) { + flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW | fw_fatal_err_flag; + + /* notify on device unavailable while the reset triggered by fw */ + event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET | + HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE); + } else if (hdev->hard_reset_on_fw_events) { + flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag; + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + } else { + reset_required = false; + } + + /* despite reset doesn't execute. a notification on + * occurred event needs to be sent here + */ + hl_notifier_event_send_all(hdev, event_mask); + if (reset_required) + hl_device_reset(hdev, flags); + else + hl_fw_unmask_irq(hdev, event_type); +} + +static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate, u32 *size) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (aggregate) { + *size = (u32) sizeof(gaudi->events_stat_aggregate); + return gaudi->events_stat_aggregate; + } + + *size = (u32) sizeof(gaudi->events_stat); + return gaudi->events_stat; +} + +static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + u32 status, timeout_usec; + int rc; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) || + hdev->reset_info.hard_reset_pending) + return 0; + + if (hdev->pldm) + timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC; + else + timeout_usec = MMU_CONFIG_TIMEOUT_USEC; + + /* L0 & L1 invalidation */ + WREG32(mmSTLB_INV_PS, 3); + WREG32(mmSTLB_CACHE_INV, gaudi->mmu_cache_inv_pi++); + WREG32(mmSTLB_INV_PS, 2); + + rc = hl_poll_timeout( + hdev, + mmSTLB_INV_PS, + status, + !status, + 1000, + timeout_usec); + + WREG32(mmSTLB_INV_SET, 0); + + return rc; +} + +static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev, + bool is_hard, u32 flags, + u32 asid, u64 va, u64 size) +{ + /* Treat as invalidate all because there is no range invalidation + * in Gaudi + */ + return hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); +} + +static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, u64 phys_addr) +{ + u32 status, timeout_usec; + int rc; + + if (hdev->pldm) + timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC; + else + timeout_usec = MMU_CONFIG_TIMEOUT_USEC; + + WREG32(MMU_ASID, asid); + WREG32(MMU_HOP0_PA43_12, phys_addr >> MMU_HOP0_PA43_12_SHIFT); + WREG32(MMU_HOP0_PA49_44, phys_addr >> MMU_HOP0_PA49_44_SHIFT); + WREG32(MMU_BUSY, 0x80000000); + + rc = hl_poll_timeout( + hdev, + MMU_BUSY, + status, + !(status & 0x80000000), + 1000, + timeout_usec); + + if (rc) { + dev_err(hdev->dev, + "Timeout during MMU hop0 config of asid %d\n", asid); + return rc; + } + + return 0; +} + +static int gaudi_send_heartbeat(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + return hl_fw_send_heartbeat(hdev); +} + +static int gaudi_cpucp_info_get(struct hl_device *hdev) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + rc = hl_fw_cpucp_handshake(hdev, mmCPU_BOOT_DEV_STS0, + mmCPU_BOOT_DEV_STS1, mmCPU_BOOT_ERR0, + mmCPU_BOOT_ERR1); + if (rc) + return rc; + + if (!strlen(prop->cpucp_info.card_name)) + strncpy(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME, + CARD_NAME_MAX_LEN); + + hdev->card_type = le32_to_cpu(hdev->asic_prop.cpucp_info.card_type); + + set_default_power_values(hdev); + + return 0; +} + +static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + const char *fmt = "%-5d%-9s%#-14x%#-12x%#x\n"; + const char *mme_slave_fmt = "%-5d%-9s%-14s%-12s%#x\n"; + const char *nic_fmt = "%-5d%-9s%#-14x%#x\n"; + unsigned long *mask = (unsigned long *)mask_arr; + u32 qm_glbl_sts0, qm_cgm_sts, dma_core_sts0, tpc_cfg_sts, mme_arch_sts; + bool is_idle = true, is_eng_idle, is_slave; + u64 offset; + int i, dma_id, port; + + if (e) + hl_engine_data_sprintf(e, + "\nDMA is_idle QM_GLBL_STS0 QM_CGM_STS DMA_CORE_STS0\n" + "--- ------- ------------ ---------- -------------\n"); + + for (i = 0 ; i < DMA_NUMBER_OF_CHNLS ; i++) { + dma_id = gaudi_dma_assignment[i]; + offset = dma_id * DMA_QMAN_OFFSET; + + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + offset); + dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); + is_idle &= is_eng_idle; + + if (mask && !is_eng_idle) + set_bit(GAUDI_ENGINE_ID_DMA_0 + dma_id, mask); + if (e) + hl_engine_data_sprintf(e, fmt, dma_id, + is_eng_idle ? "Y" : "N", qm_glbl_sts0, + qm_cgm_sts, dma_core_sts0); + } + + if (e) + hl_engine_data_sprintf(e, + "\nTPC is_idle QM_GLBL_STS0 QM_CGM_STS CFG_STATUS\n" + "--- ------- ------------ ---------- ----------\n"); + + for (i = 0 ; i < TPC_NUMBER_OF_ENGINES ; i++) { + offset = i * TPC_QMAN_OFFSET; + qm_glbl_sts0 = RREG32(mmTPC0_QM_GLBL_STS0 + offset); + qm_cgm_sts = RREG32(mmTPC0_QM_CGM_STS + offset); + tpc_cfg_sts = RREG32(mmTPC0_CFG_STATUS + offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_TPC_IDLE(tpc_cfg_sts); + is_idle &= is_eng_idle; + + if (mask && !is_eng_idle) + set_bit(GAUDI_ENGINE_ID_TPC_0 + i, mask); + if (e) + hl_engine_data_sprintf(e, fmt, i, + is_eng_idle ? "Y" : "N", + qm_glbl_sts0, qm_cgm_sts, tpc_cfg_sts); + } + + if (e) + hl_engine_data_sprintf(e, + "\nMME is_idle QM_GLBL_STS0 QM_CGM_STS ARCH_STATUS\n" + "--- ------- ------------ ---------- -----------\n"); + + for (i = 0 ; i < MME_NUMBER_OF_ENGINES ; i++) { + offset = i * MME_QMAN_OFFSET; + mme_arch_sts = RREG32(mmMME0_CTRL_ARCH_STATUS + offset); + is_eng_idle = IS_MME_IDLE(mme_arch_sts); + + /* MME 1 & 3 are slaves, no need to check their QMANs */ + is_slave = i % 2; + if (!is_slave) { + qm_glbl_sts0 = RREG32(mmMME0_QM_GLBL_STS0 + offset); + qm_cgm_sts = RREG32(mmMME0_QM_CGM_STS + offset); + is_eng_idle &= IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts); + } + + is_idle &= is_eng_idle; + + if (mask && !is_eng_idle) + set_bit(GAUDI_ENGINE_ID_MME_0 + i, mask); + if (e) { + if (!is_slave) + hl_engine_data_sprintf(e, fmt, i, + is_eng_idle ? "Y" : "N", + qm_glbl_sts0, qm_cgm_sts, mme_arch_sts); + else + hl_engine_data_sprintf(e, mme_slave_fmt, i, + is_eng_idle ? "Y" : "N", "-", + "-", mme_arch_sts); + } + } + + if (e) + hl_engine_data_sprintf(e, + "\nNIC is_idle QM_GLBL_STS0 QM_CGM_STS\n" + "--- ------- ------------ ----------\n"); + + for (i = 0 ; i < (NIC_NUMBER_OF_ENGINES / 2) ; i++) { + offset = i * NIC_MACRO_QMAN_OFFSET; + port = 2 * i; + if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + port)) { + qm_glbl_sts0 = RREG32(mmNIC0_QM0_GLBL_STS0 + offset); + qm_cgm_sts = RREG32(mmNIC0_QM0_CGM_STS + offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts); + is_idle &= is_eng_idle; + + if (mask && !is_eng_idle) + set_bit(GAUDI_ENGINE_ID_NIC_0 + port, mask); + if (e) + hl_engine_data_sprintf(e, nic_fmt, port, + is_eng_idle ? "Y" : "N", + qm_glbl_sts0, qm_cgm_sts); + } + + port = 2 * i + 1; + if (gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + port)) { + qm_glbl_sts0 = RREG32(mmNIC0_QM1_GLBL_STS0 + offset); + qm_cgm_sts = RREG32(mmNIC0_QM1_CGM_STS + offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts); + is_idle &= is_eng_idle; + + if (mask && !is_eng_idle) + set_bit(GAUDI_ENGINE_ID_NIC_0 + port, mask); + if (e) + hl_engine_data_sprintf(e, nic_fmt, port, + is_eng_idle ? "Y" : "N", + qm_glbl_sts0, qm_cgm_sts); + } + } + + if (e) + hl_engine_data_sprintf(e, "\n"); + + return is_idle; +} + +static void gaudi_hw_queues_lock(struct hl_device *hdev) + __acquires(&gaudi->hw_queues_lock) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + spin_lock(&gaudi->hw_queues_lock); +} + +static void gaudi_hw_queues_unlock(struct hl_device *hdev) + __releases(&gaudi->hw_queues_lock) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + spin_unlock(&gaudi->hw_queues_lock); +} + +static u32 gaudi_get_pci_id(struct hl_device *hdev) +{ + return hdev->pdev->device; +} + +static int gaudi_get_eeprom_data(struct hl_device *hdev, void *data, + size_t max_size) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + return hl_fw_get_eeprom_data(hdev, data, max_size); +} + +static int gaudi_get_monitor_dump(struct hl_device *hdev, void *data) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + return hl_fw_get_monitor_dump(hdev, data); +} + +/* + * this function should be used only during initialization and/or after reset, + * when there are no active users. + */ +static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel, u32 tpc_id) +{ + u64 kernel_timeout; + u32 status, offset; + int rc; + + offset = tpc_id * (mmTPC1_CFG_STATUS - mmTPC0_CFG_STATUS); + + if (hdev->pldm) + kernel_timeout = GAUDI_PLDM_TPC_KERNEL_WAIT_USEC; + else + kernel_timeout = HL_DEVICE_TIMEOUT_USEC; + + WREG32(mmTPC0_CFG_QM_KERNEL_BASE_ADDRESS_LOW + offset, + lower_32_bits(tpc_kernel)); + WREG32(mmTPC0_CFG_QM_KERNEL_BASE_ADDRESS_HIGH + offset, + upper_32_bits(tpc_kernel)); + + WREG32(mmTPC0_CFG_ICACHE_BASE_ADDERESS_LOW + offset, + lower_32_bits(tpc_kernel)); + WREG32(mmTPC0_CFG_ICACHE_BASE_ADDERESS_HIGH + offset, + upper_32_bits(tpc_kernel)); + /* set a valid LUT pointer, content is of no significance */ + WREG32(mmTPC0_CFG_LUT_FUNC256_BASE_ADDR_LO + offset, + lower_32_bits(tpc_kernel)); + WREG32(mmTPC0_CFG_LUT_FUNC256_BASE_ADDR_HI + offset, + upper_32_bits(tpc_kernel)); + + WREG32(mmTPC0_CFG_QM_SYNC_OBJECT_ADDR + offset, + lower_32_bits(CFG_BASE + + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0)); + + WREG32(mmTPC0_CFG_TPC_CMD + offset, + (1 << TPC0_CFG_TPC_CMD_ICACHE_INVALIDATE_SHIFT | + 1 << TPC0_CFG_TPC_CMD_ICACHE_PREFETCH_64KB_SHIFT)); + /* wait a bit for the engine to start executing */ + usleep_range(1000, 1500); + + /* wait until engine has finished executing */ + rc = hl_poll_timeout( + hdev, + mmTPC0_CFG_STATUS + offset, + status, + (status & TPC0_CFG_STATUS_VECTOR_PIPE_EMPTY_MASK) == + TPC0_CFG_STATUS_VECTOR_PIPE_EMPTY_MASK, + 1000, + kernel_timeout); + + if (rc) { + dev_err(hdev->dev, + "Timeout while waiting for TPC%d icache prefetch\n", + tpc_id); + return -EIO; + } + + WREG32(mmTPC0_CFG_TPC_EXECUTE + offset, + 1 << TPC0_CFG_TPC_EXECUTE_V_SHIFT); + + /* wait a bit for the engine to start executing */ + usleep_range(1000, 1500); + + /* wait until engine has finished executing */ + rc = hl_poll_timeout( + hdev, + mmTPC0_CFG_STATUS + offset, + status, + (status & TPC0_CFG_STATUS_VECTOR_PIPE_EMPTY_MASK) == + TPC0_CFG_STATUS_VECTOR_PIPE_EMPTY_MASK, + 1000, + kernel_timeout); + + if (rc) { + dev_err(hdev->dev, + "Timeout while waiting for TPC%d vector pipe\n", + tpc_id); + return -EIO; + } + + rc = hl_poll_timeout( + hdev, + mmTPC0_CFG_WQ_INFLIGHT_CNTR + offset, + status, + (status == 0), + 1000, + kernel_timeout); + + if (rc) { + dev_err(hdev->dev, + "Timeout while waiting for TPC%d kernel to execute\n", + tpc_id); + return -EIO; + } + + return 0; +} + +static int gaudi_internal_cb_pool_init(struct hl_device *hdev, + struct hl_ctx *ctx) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + int min_alloc_order, rc, collective_cb_size; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) + return 0; + + hdev->internal_cb_pool_virt_addr = hl_asic_dma_alloc_coherent(hdev, + HOST_SPACE_INTERNAL_CB_SZ, + &hdev->internal_cb_pool_dma_addr, + GFP_KERNEL | __GFP_ZERO); + + if (!hdev->internal_cb_pool_virt_addr) + return -ENOMEM; + + collective_cb_size = sizeof(struct packet_msg_short) * 5 + + sizeof(struct packet_fence); + min_alloc_order = ilog2(collective_cb_size); + + hdev->internal_cb_pool = gen_pool_create(min_alloc_order, -1); + if (!hdev->internal_cb_pool) { + dev_err(hdev->dev, + "Failed to create internal CB pool\n"); + rc = -ENOMEM; + goto free_internal_cb_pool; + } + + rc = gen_pool_add(hdev->internal_cb_pool, + (uintptr_t) hdev->internal_cb_pool_virt_addr, + HOST_SPACE_INTERNAL_CB_SZ, -1); + if (rc) { + dev_err(hdev->dev, + "Failed to add memory to internal CB pool\n"); + rc = -EFAULT; + goto destroy_internal_cb_pool; + } + + hdev->internal_cb_va_base = hl_reserve_va_block(hdev, ctx, + HL_VA_RANGE_TYPE_HOST, HOST_SPACE_INTERNAL_CB_SZ, + HL_MMU_VA_ALIGNMENT_NOT_NEEDED); + + if (!hdev->internal_cb_va_base) { + rc = -ENOMEM; + goto destroy_internal_cb_pool; + } + + mutex_lock(&hdev->mmu_lock); + rc = hl_mmu_map_contiguous(ctx, hdev->internal_cb_va_base, + hdev->internal_cb_pool_dma_addr, + HOST_SPACE_INTERNAL_CB_SZ); + + hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR); + mutex_unlock(&hdev->mmu_lock); + + if (rc) + goto unreserve_internal_cb_pool; + + return 0; + +unreserve_internal_cb_pool: + hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base, + HOST_SPACE_INTERNAL_CB_SZ); +destroy_internal_cb_pool: + gen_pool_destroy(hdev->internal_cb_pool); +free_internal_cb_pool: + hl_asic_dma_free_coherent(hdev, HOST_SPACE_INTERNAL_CB_SZ, hdev->internal_cb_pool_virt_addr, + hdev->internal_cb_pool_dma_addr); + + return rc; +} + +static void gaudi_internal_cb_pool_fini(struct hl_device *hdev, + struct hl_ctx *ctx) +{ + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) + return; + + mutex_lock(&hdev->mmu_lock); + hl_mmu_unmap_contiguous(ctx, hdev->internal_cb_va_base, + HOST_SPACE_INTERNAL_CB_SZ); + hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base, + HOST_SPACE_INTERNAL_CB_SZ); + hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + mutex_unlock(&hdev->mmu_lock); + + gen_pool_destroy(hdev->internal_cb_pool); + + hl_asic_dma_free_coherent(hdev, HOST_SPACE_INTERNAL_CB_SZ, hdev->internal_cb_pool_virt_addr, + hdev->internal_cb_pool_dma_addr); +} + +static int gaudi_ctx_init(struct hl_ctx *ctx) +{ + int rc; + + if (ctx->asid == HL_KERNEL_ASID_ID) + return 0; + + rc = gaudi_internal_cb_pool_init(ctx->hdev, ctx); + if (rc) + return rc; + + rc = gaudi_restore_user_registers(ctx->hdev); + if (rc) + gaudi_internal_cb_pool_fini(ctx->hdev, ctx); + + return rc; +} + +static void gaudi_ctx_fini(struct hl_ctx *ctx) +{ + if (ctx->asid == HL_KERNEL_ASID_ID) + return; + + gaudi_internal_cb_pool_fini(ctx->hdev, ctx); +} + +static int gaudi_pre_schedule_cs(struct hl_cs *cs) +{ + return 0; +} + +static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx) +{ + return gaudi_cq_assignment[cq_idx]; +} + +static u32 gaudi_get_signal_cb_size(struct hl_device *hdev) +{ + return sizeof(struct packet_msg_short) + + sizeof(struct packet_msg_prot) * 2; +} + +static u32 gaudi_get_wait_cb_size(struct hl_device *hdev) +{ + return sizeof(struct packet_msg_short) * 4 + + sizeof(struct packet_fence) + + sizeof(struct packet_msg_prot) * 2; +} + +static u32 gaudi_get_sob_addr(struct hl_device *hdev, u32 sob_id) +{ + return mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + (sob_id * 4); +} + +static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, + u32 size, bool eb) +{ + struct hl_cb *cb = (struct hl_cb *) data; + struct packet_msg_short *pkt; + u32 value, ctl, pkt_size = sizeof(*pkt); + + pkt = cb->kernel_address + size; + memset(pkt, 0, pkt_size); + + /* Inc by 1, Mode ADD */ + value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK, 1); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK, 1); + + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, sob_id * 4); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 3); /* W_S SOB base */ + ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, eb); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + pkt->value = cpu_to_le32(value); + pkt->ctl = cpu_to_le32(ctl); + + return size + pkt_size; +} + +static u32 gaudi_add_mon_msg_short(struct packet_msg_short *pkt, u32 value, + u16 addr) +{ + u32 ctl, pkt_size = sizeof(*pkt); + + memset(pkt, 0, pkt_size); + + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */ + ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 0); /* last pkt MB */ + + pkt->value = cpu_to_le32(value); + pkt->ctl = cpu_to_le32(ctl); + + return pkt_size; +} + +static u32 gaudi_add_arm_monitor_pkt(struct hl_device *hdev, + struct packet_msg_short *pkt, u16 sob_base, u8 sob_mask, + u16 sob_val, u16 mon_id) +{ + u64 monitor_base; + u32 ctl, value, pkt_size = sizeof(*pkt); + u16 msg_addr_offset; + u8 mask; + + if (hl_gen_sob_mask(sob_base, sob_mask, &mask)) { + dev_err(hdev->dev, + "sob_base %u (mask %#x) is not valid\n", + sob_base, sob_mask); + return 0; + } + + /* + * monitor_base should be the content of the base0 address registers, + * so it will be added to the msg short offsets + */ + monitor_base = mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0; + + msg_addr_offset = + (mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0 + mon_id * 4) - + monitor_base; + + memset(pkt, 0, pkt_size); + + /* Monitor config packet: bind the monitor to a sync object */ + value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_MASK, sob_base / 8); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_MASK, sob_val); + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MODE_MASK, + 0); /* GREATER OR EQUAL*/ + value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MASK_MASK, mask); + + ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, msg_addr_offset); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */ + ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + pkt->value = cpu_to_le32(value); + pkt->ctl = cpu_to_le32(ctl); + + return pkt_size; +} + +static u32 gaudi_add_fence_pkt(struct packet_fence *pkt) +{ + u32 ctl, cfg, pkt_size = sizeof(*pkt); + + memset(pkt, 0, pkt_size); + + cfg = FIELD_PREP(GAUDI_PKT_FENCE_CFG_DEC_VAL_MASK, 1); + cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK, 1); + cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_ID_MASK, 2); + + ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_FENCE); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 0); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); + + pkt->cfg = cpu_to_le32(cfg); + pkt->ctl = cpu_to_le32(ctl); + + return pkt_size; +} + +static int gaudi_get_fence_addr(struct hl_device *hdev, u32 queue_id, u64 *addr) +{ + u32 offset, nic_index; + + switch (queue_id) { + case GAUDI_QUEUE_ID_DMA_0_0: + offset = mmDMA0_QM_CP_FENCE2_RDATA_0; + break; + case GAUDI_QUEUE_ID_DMA_0_1: + offset = mmDMA0_QM_CP_FENCE2_RDATA_1; + break; + case GAUDI_QUEUE_ID_DMA_0_2: + offset = mmDMA0_QM_CP_FENCE2_RDATA_2; + break; + case GAUDI_QUEUE_ID_DMA_0_3: + offset = mmDMA0_QM_CP_FENCE2_RDATA_3; + break; + case GAUDI_QUEUE_ID_DMA_1_0: + offset = mmDMA1_QM_CP_FENCE2_RDATA_0; + break; + case GAUDI_QUEUE_ID_DMA_1_1: + offset = mmDMA1_QM_CP_FENCE2_RDATA_1; + break; + case GAUDI_QUEUE_ID_DMA_1_2: + offset = mmDMA1_QM_CP_FENCE2_RDATA_2; + break; + case GAUDI_QUEUE_ID_DMA_1_3: + offset = mmDMA1_QM_CP_FENCE2_RDATA_3; + break; + case GAUDI_QUEUE_ID_DMA_5_0: + offset = mmDMA5_QM_CP_FENCE2_RDATA_0; + break; + case GAUDI_QUEUE_ID_DMA_5_1: + offset = mmDMA5_QM_CP_FENCE2_RDATA_1; + break; + case GAUDI_QUEUE_ID_DMA_5_2: + offset = mmDMA5_QM_CP_FENCE2_RDATA_2; + break; + case GAUDI_QUEUE_ID_DMA_5_3: + offset = mmDMA5_QM_CP_FENCE2_RDATA_3; + break; + case GAUDI_QUEUE_ID_TPC_7_0: + offset = mmTPC7_QM_CP_FENCE2_RDATA_0; + break; + case GAUDI_QUEUE_ID_TPC_7_1: + offset = mmTPC7_QM_CP_FENCE2_RDATA_1; + break; + case GAUDI_QUEUE_ID_TPC_7_2: + offset = mmTPC7_QM_CP_FENCE2_RDATA_2; + break; + case GAUDI_QUEUE_ID_TPC_7_3: + offset = mmTPC7_QM_CP_FENCE2_RDATA_3; + break; + case GAUDI_QUEUE_ID_NIC_0_0: + case GAUDI_QUEUE_ID_NIC_1_0: + case GAUDI_QUEUE_ID_NIC_2_0: + case GAUDI_QUEUE_ID_NIC_3_0: + case GAUDI_QUEUE_ID_NIC_4_0: + case GAUDI_QUEUE_ID_NIC_5_0: + case GAUDI_QUEUE_ID_NIC_6_0: + case GAUDI_QUEUE_ID_NIC_7_0: + case GAUDI_QUEUE_ID_NIC_8_0: + case GAUDI_QUEUE_ID_NIC_9_0: + nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_0) >> 2; + offset = mmNIC0_QM0_CP_FENCE2_RDATA_0 + + (nic_index >> 1) * NIC_MACRO_QMAN_OFFSET + + (nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET; + break; + case GAUDI_QUEUE_ID_NIC_0_1: + case GAUDI_QUEUE_ID_NIC_1_1: + case GAUDI_QUEUE_ID_NIC_2_1: + case GAUDI_QUEUE_ID_NIC_3_1: + case GAUDI_QUEUE_ID_NIC_4_1: + case GAUDI_QUEUE_ID_NIC_5_1: + case GAUDI_QUEUE_ID_NIC_6_1: + case GAUDI_QUEUE_ID_NIC_7_1: + case GAUDI_QUEUE_ID_NIC_8_1: + case GAUDI_QUEUE_ID_NIC_9_1: + nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_1) >> 2; + offset = mmNIC0_QM0_CP_FENCE2_RDATA_1 + + (nic_index >> 1) * NIC_MACRO_QMAN_OFFSET + + (nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET; + break; + case GAUDI_QUEUE_ID_NIC_0_2: + case GAUDI_QUEUE_ID_NIC_1_2: + case GAUDI_QUEUE_ID_NIC_2_2: + case GAUDI_QUEUE_ID_NIC_3_2: + case GAUDI_QUEUE_ID_NIC_4_2: + case GAUDI_QUEUE_ID_NIC_5_2: + case GAUDI_QUEUE_ID_NIC_6_2: + case GAUDI_QUEUE_ID_NIC_7_2: + case GAUDI_QUEUE_ID_NIC_8_2: + case GAUDI_QUEUE_ID_NIC_9_2: + nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_2) >> 2; + offset = mmNIC0_QM0_CP_FENCE2_RDATA_2 + + (nic_index >> 1) * NIC_MACRO_QMAN_OFFSET + + (nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET; + break; + case GAUDI_QUEUE_ID_NIC_0_3: + case GAUDI_QUEUE_ID_NIC_1_3: + case GAUDI_QUEUE_ID_NIC_2_3: + case GAUDI_QUEUE_ID_NIC_3_3: + case GAUDI_QUEUE_ID_NIC_4_3: + case GAUDI_QUEUE_ID_NIC_5_3: + case GAUDI_QUEUE_ID_NIC_6_3: + case GAUDI_QUEUE_ID_NIC_7_3: + case GAUDI_QUEUE_ID_NIC_8_3: + case GAUDI_QUEUE_ID_NIC_9_3: + nic_index = (queue_id - GAUDI_QUEUE_ID_NIC_0_3) >> 2; + offset = mmNIC0_QM0_CP_FENCE2_RDATA_3 + + (nic_index >> 1) * NIC_MACRO_QMAN_OFFSET + + (nic_index & 0x1) * NIC_ENGINE_QMAN_OFFSET; + break; + default: + return -EINVAL; + } + + *addr = CFG_BASE + offset; + + return 0; +} + +static u32 gaudi_add_mon_pkts(void *buf, u16 mon_id, u64 fence_addr) +{ + u64 monitor_base; + u32 size = 0; + u16 msg_addr_offset; + + /* + * monitor_base should be the content of the base0 address registers, + * so it will be added to the msg short offsets + */ + monitor_base = mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0; + + /* First monitor config packet: low address of the sync */ + msg_addr_offset = + (mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0 + mon_id * 4) - + monitor_base; + + size += gaudi_add_mon_msg_short(buf + size, (u32) fence_addr, + msg_addr_offset); + + /* Second monitor config packet: high address of the sync */ + msg_addr_offset = + (mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0 + mon_id * 4) - + monitor_base; + + size += gaudi_add_mon_msg_short(buf + size, (u32) (fence_addr >> 32), + msg_addr_offset); + + /* + * Third monitor config packet: the payload, i.e. what to write when the + * sync triggers + */ + msg_addr_offset = + (mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_DATA_0 + mon_id * 4) - + monitor_base; + + size += gaudi_add_mon_msg_short(buf + size, 1, msg_addr_offset); + + return size; +} + +static u32 gaudi_gen_wait_cb(struct hl_device *hdev, + struct hl_gen_wait_properties *prop) +{ + struct hl_cb *cb = (struct hl_cb *) prop->data; + void *buf = cb->kernel_address; + u64 fence_addr = 0; + u32 size = prop->size; + + if (gaudi_get_fence_addr(hdev, prop->q_idx, &fence_addr)) { + dev_crit(hdev->dev, "wrong queue id %d for wait packet\n", + prop->q_idx); + return 0; + } + + size += gaudi_add_mon_pkts(buf + size, prop->mon_id, fence_addr); + size += gaudi_add_arm_monitor_pkt(hdev, buf + size, prop->sob_base, + prop->sob_mask, prop->sob_val, prop->mon_id); + size += gaudi_add_fence_pkt(buf + size); + + return size; +} + +static void gaudi_reset_sob(struct hl_device *hdev, void *data) +{ + struct hl_hw_sob *hw_sob = (struct hl_hw_sob *) data; + + dev_dbg(hdev->dev, "reset SOB, q_idx: %d, sob_id: %d\n", hw_sob->q_idx, + hw_sob->sob_id); + + WREG32(mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + + hw_sob->sob_id * 4, 0); + + kref_init(&hw_sob->kref); +} + +static u64 gaudi_get_device_time(struct hl_device *hdev) +{ + u64 device_time = ((u64) RREG32(mmPSOC_TIMESTAMP_CNTCVU)) << 32; + + return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL); +} + +static int gaudi_get_hw_block_id(struct hl_device *hdev, u64 block_addr, + u32 *block_size, u32 *block_id) +{ + return -EPERM; +} + +static int gaudi_block_mmap(struct hl_device *hdev, + struct vm_area_struct *vma, + u32 block_id, u32 block_size) +{ + return -EPERM; +} + +static void gaudi_enable_events_from_fw(struct hl_device *hdev) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + u32 irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : + le32_to_cpu(dyn_regs->gic_host_ints_irq); + + WREG32(irq_handler_offset, + gaudi_irq_map_table[GAUDI_EVENT_INTS_REGISTER].cpu_id); +} + +static int gaudi_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64 mmu_cap_mask) +{ + return -EINVAL; +} + +static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx) +{ + switch (pll_idx) { + case HL_GAUDI_CPU_PLL: return CPU_PLL; + case HL_GAUDI_PCI_PLL: return PCI_PLL; + case HL_GAUDI_NIC_PLL: return NIC_PLL; + case HL_GAUDI_DMA_PLL: return DMA_PLL; + case HL_GAUDI_MESH_PLL: return MESH_PLL; + case HL_GAUDI_MME_PLL: return MME_PLL; + case HL_GAUDI_TPC_PLL: return TPC_PLL; + case HL_GAUDI_IF_PLL: return IF_PLL; + case HL_GAUDI_SRAM_PLL: return SRAM_PLL; + case HL_GAUDI_HBM_PLL: return HBM_PLL; + default: return -EINVAL; + } +} + +static int gaudi_add_sync_to_engine_map_entry( + struct hl_sync_to_engine_map *map, u32 reg_value, + enum hl_sync_engine_type engine_type, u32 engine_id) +{ + struct hl_sync_to_engine_map_entry *entry; + + /* Reg value represents a partial address of sync object, + * it is used as unique identifier. For this we need to + * clear the cutoff cfg base bits from the value. + */ + if (reg_value == 0 || reg_value == 0xffffffff) + return 0; + reg_value -= lower_32_bits(CFG_BASE); + + /* create a new hash entry */ + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->engine_type = engine_type; + entry->engine_id = engine_id; + entry->sync_id = reg_value; + hash_add(map->tb, &entry->node, reg_value); + + return 0; +} + +static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev, + struct hl_sync_to_engine_map *map) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + int i, j, rc; + u32 reg_value; + + /* Iterate over TPC engines */ + for (i = 0; i < sds->props[SP_NUM_OF_TPC_ENGINES]; ++i) { + + reg_value = RREG32(sds->props[SP_TPC0_CFG_SO] + + sds->props[SP_NEXT_TPC] * i); + + rc = gaudi_add_sync_to_engine_map_entry(map, reg_value, + ENGINE_TPC, i); + if (rc) + goto free_sync_to_engine_map; + } + + /* Iterate over MME engines */ + for (i = 0; i < sds->props[SP_NUM_OF_MME_ENGINES]; ++i) { + for (j = 0; j < sds->props[SP_SUB_MME_ENG_NUM]; ++j) { + + reg_value = RREG32(sds->props[SP_MME_CFG_SO] + + sds->props[SP_NEXT_MME] * i + + j * sizeof(u32)); + + rc = gaudi_add_sync_to_engine_map_entry( + map, reg_value, ENGINE_MME, + i * sds->props[SP_SUB_MME_ENG_NUM] + j); + if (rc) + goto free_sync_to_engine_map; + } + } + + /* Iterate over DMA engines */ + for (i = 0; i < sds->props[SP_NUM_OF_DMA_ENGINES]; ++i) { + reg_value = RREG32(sds->props[SP_DMA_CFG_SO] + + sds->props[SP_DMA_QUEUES_OFFSET] * i); + rc = gaudi_add_sync_to_engine_map_entry(map, reg_value, + ENGINE_DMA, i); + if (rc) + goto free_sync_to_engine_map; + } + + return 0; + +free_sync_to_engine_map: + hl_state_dump_free_sync_to_engine_map(map); + + return rc; +} + +static int gaudi_monitor_valid(struct hl_mon_state_dump *mon) +{ + return FIELD_GET( + SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK, + mon->status); +} + +static void gaudi_fill_sobs_from_mon(char *sobs, struct hl_mon_state_dump *mon) +{ + const size_t max_write = 10; + u32 gid, mask, sob; + int i, offset; + + /* Sync object ID is calculated as follows: + * (8 * group_id + cleared bits in mask) + */ + gid = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK, + mon->arm_data); + mask = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK, + mon->arm_data); + + for (i = 0, offset = 0; mask && offset < MONITOR_SOB_STRING_SIZE - + max_write; mask >>= 1, i++) { + if (!(mask & 1)) { + sob = gid * MONITOR_MAX_SOBS + i; + + if (offset > 0) + offset += snprintf(sobs + offset, max_write, + ", "); + + offset += snprintf(sobs + offset, max_write, "%u", sob); + } + } +} + +static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset, + struct hl_device *hdev, + struct hl_mon_state_dump *mon) +{ + const char *name; + char scratch_buf1[BIN_REG_STRING_SIZE], + scratch_buf2[BIN_REG_STRING_SIZE]; + char monitored_sobs[MONITOR_SOB_STRING_SIZE] = {0}; + + name = hl_state_dump_get_monitor_name(hdev, mon); + if (!name) + name = ""; + + gaudi_fill_sobs_from_mon(monitored_sobs, mon); + + return hl_snprintf_resize( + buf, size, offset, + "Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s. Means sync objects [%s] are being monitored.", + mon->id, name, + FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK, + mon->arm_data), + hl_format_as_binary( + scratch_buf1, sizeof(scratch_buf1), + FIELD_GET( + SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK, + mon->arm_data)), + FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK, + mon->arm_data), + mon->wr_data, + (((u64)mon->wr_addr_high) << 32) | mon->wr_addr_low, + hl_format_as_binary( + scratch_buf2, sizeof(scratch_buf2), + FIELD_GET( + SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK, + mon->status)), + monitored_sobs); +} + + +static int gaudi_print_fences_single_engine( + struct hl_device *hdev, u64 base_offset, u64 status_base_offset, + enum hl_sync_engine_type engine_type, u32 engine_id, char **buf, + size_t *size, size_t *offset) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + int rc = -ENOMEM, i; + u32 *statuses, *fences; + + statuses = kcalloc(sds->props[SP_ENGINE_NUM_OF_QUEUES], + sizeof(*statuses), GFP_KERNEL); + if (!statuses) + goto out; + + fences = kcalloc(sds->props[SP_ENGINE_NUM_OF_FENCES] * + sds->props[SP_ENGINE_NUM_OF_QUEUES], + sizeof(*fences), GFP_KERNEL); + if (!fences) + goto free_status; + + for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES]; ++i) + statuses[i] = RREG32(status_base_offset + i * sizeof(u32)); + + for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES] * + sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i) + fences[i] = RREG32(base_offset + i * sizeof(u32)); + + /* The actual print */ + for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i) { + u32 fence_id; + u64 fence_cnt, fence_rdata; + const char *engine_name; + + if (!FIELD_GET(TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK, + statuses[i])) + continue; + + fence_id = + FIELD_GET(TPC0_QM_CP_STS_0_FENCE_ID_MASK, statuses[i]); + fence_cnt = base_offset + CFG_BASE + + sizeof(u32) * + (i + fence_id * sds->props[SP_ENGINE_NUM_OF_QUEUES]); + fence_rdata = fence_cnt - sds->props[SP_FENCE0_CNT_OFFSET] + + sds->props[SP_FENCE0_RDATA_OFFSET]; + engine_name = hl_sync_engine_to_string(engine_type); + + rc = hl_snprintf_resize( + buf, size, offset, + "%s%u, stream %u: fence id %u cnt = 0x%llx (%s%u_QM.CP_FENCE%u_CNT_%u) rdata = 0x%llx (%s%u_QM.CP_FENCE%u_RDATA_%u) value = %u, cp_status = %u\n", + engine_name, engine_id, + i, fence_id, + fence_cnt, engine_name, engine_id, fence_id, i, + fence_rdata, engine_name, engine_id, fence_id, i, + fences[fence_id], + statuses[i]); + if (rc) + goto free_fences; + } + + rc = 0; + +free_fences: + kfree(fences); +free_status: + kfree(statuses); +out: + return rc; +} + + +static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = { + .monitor_valid = gaudi_monitor_valid, + .print_single_monitor = gaudi_print_single_monitor, + .gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map, + .print_fences_single_engine = gaudi_print_fences_single_engine, +}; + +static void gaudi_state_dump_init(struct hl_device *hdev) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + int i; + + for (i = 0; i < ARRAY_SIZE(gaudi_so_id_to_str); ++i) + hash_add(sds->so_id_to_str_tb, + &gaudi_so_id_to_str[i].node, + gaudi_so_id_to_str[i].id); + + for (i = 0; i < ARRAY_SIZE(gaudi_monitor_id_to_str); ++i) + hash_add(sds->monitor_id_to_str_tb, + &gaudi_monitor_id_to_str[i].node, + gaudi_monitor_id_to_str[i].id); + + sds->props = gaudi_state_dump_specs_props; + + sds->sync_namager_names = gaudi_sync_manager_names; + + sds->funcs = gaudi_state_dump_funcs; +} + +static u32 *gaudi_get_stream_master_qid_arr(void) +{ + return gaudi_stream_master; +} + +static void gaudi_check_if_razwi_happened(struct hl_device *hdev) +{ +} + +static ssize_t infineon_ver_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + struct cpucp_info *cpucp_info; + + cpucp_info = &hdev->asic_prop.cpucp_info; + + return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version)); +} + +static DEVICE_ATTR_RO(infineon_ver); + +static struct attribute *gaudi_vrm_dev_attrs[] = { + &dev_attr_infineon_ver.attr, + NULL, +}; + +static void gaudi_add_device_attr(struct hl_device *hdev, struct attribute_group *dev_clk_attr_grp, + struct attribute_group *dev_vrm_attr_grp) +{ + hl_sysfs_add_dev_clk_attr(hdev, dev_clk_attr_grp); + dev_vrm_attr_grp->attrs = gaudi_vrm_dev_attrs; +} + +static int gaudi_send_device_activity(struct hl_device *hdev, bool open) +{ + return 0; +} + +static const struct hl_asic_funcs gaudi_funcs = { + .early_init = gaudi_early_init, + .early_fini = gaudi_early_fini, + .late_init = gaudi_late_init, + .late_fini = gaudi_late_fini, + .sw_init = gaudi_sw_init, + .sw_fini = gaudi_sw_fini, + .hw_init = gaudi_hw_init, + .hw_fini = gaudi_hw_fini, + .halt_engines = gaudi_halt_engines, + .suspend = gaudi_suspend, + .resume = gaudi_resume, + .mmap = gaudi_mmap, + .ring_doorbell = gaudi_ring_doorbell, + .pqe_write = gaudi_pqe_write, + .asic_dma_alloc_coherent = gaudi_dma_alloc_coherent, + .asic_dma_free_coherent = gaudi_dma_free_coherent, + .scrub_device_mem = gaudi_scrub_device_mem, + .scrub_device_dram = gaudi_scrub_device_dram, + .get_int_queue_base = gaudi_get_int_queue_base, + .test_queues = gaudi_test_queues, + .asic_dma_pool_zalloc = gaudi_dma_pool_zalloc, + .asic_dma_pool_free = gaudi_dma_pool_free, + .cpu_accessible_dma_pool_alloc = gaudi_cpu_accessible_dma_pool_alloc, + .cpu_accessible_dma_pool_free = gaudi_cpu_accessible_dma_pool_free, + .hl_dma_unmap_sgtable = hl_dma_unmap_sgtable, + .cs_parser = gaudi_cs_parser, + .asic_dma_map_sgtable = hl_dma_map_sgtable, + .add_end_of_cb_packets = gaudi_add_end_of_cb_packets, + .update_eq_ci = gaudi_update_eq_ci, + .context_switch = gaudi_context_switch, + .restore_phase_topology = gaudi_restore_phase_topology, + .debugfs_read_dma = gaudi_debugfs_read_dma, + .add_device_attr = gaudi_add_device_attr, + .handle_eqe = gaudi_handle_eqe, + .get_events_stat = gaudi_get_events_stat, + .read_pte = gaudi_read_pte, + .write_pte = gaudi_write_pte, + .mmu_invalidate_cache = gaudi_mmu_invalidate_cache, + .mmu_invalidate_cache_range = gaudi_mmu_invalidate_cache_range, + .mmu_prefetch_cache_range = NULL, + .send_heartbeat = gaudi_send_heartbeat, + .debug_coresight = gaudi_debug_coresight, + .is_device_idle = gaudi_is_device_idle, + .compute_reset_late_init = gaudi_compute_reset_late_init, + .hw_queues_lock = gaudi_hw_queues_lock, + .hw_queues_unlock = gaudi_hw_queues_unlock, + .get_pci_id = gaudi_get_pci_id, + .get_eeprom_data = gaudi_get_eeprom_data, + .get_monitor_dump = gaudi_get_monitor_dump, + .send_cpu_message = gaudi_send_cpu_message, + .pci_bars_map = gaudi_pci_bars_map, + .init_iatu = gaudi_init_iatu, + .rreg = hl_rreg, + .wreg = hl_wreg, + .halt_coresight = gaudi_halt_coresight, + .ctx_init = gaudi_ctx_init, + .ctx_fini = gaudi_ctx_fini, + .pre_schedule_cs = gaudi_pre_schedule_cs, + .get_queue_id_for_cq = gaudi_get_queue_id_for_cq, + .load_firmware_to_device = gaudi_load_firmware_to_device, + .load_boot_fit_to_device = gaudi_load_boot_fit_to_device, + .get_signal_cb_size = gaudi_get_signal_cb_size, + .get_wait_cb_size = gaudi_get_wait_cb_size, + .gen_signal_cb = gaudi_gen_signal_cb, + .gen_wait_cb = gaudi_gen_wait_cb, + .reset_sob = gaudi_reset_sob, + .reset_sob_group = gaudi_reset_sob_group, + .get_device_time = gaudi_get_device_time, + .pb_print_security_errors = NULL, + .collective_wait_init_cs = gaudi_collective_wait_init_cs, + .collective_wait_create_jobs = gaudi_collective_wait_create_jobs, + .get_dec_base_addr = NULL, + .scramble_addr = hl_mmu_scramble_addr, + .descramble_addr = hl_mmu_descramble_addr, + .ack_protection_bits_errors = gaudi_ack_protection_bits_errors, + .get_hw_block_id = gaudi_get_hw_block_id, + .hw_block_mmap = gaudi_block_mmap, + .enable_events_from_fw = gaudi_enable_events_from_fw, + .ack_mmu_errors = gaudi_ack_mmu_page_fault_or_access_error, + .map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx, + .init_firmware_preload_params = gaudi_init_firmware_preload_params, + .init_firmware_loader = gaudi_init_firmware_loader, + .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm, + .state_dump_init = gaudi_state_dump_init, + .get_sob_addr = gaudi_get_sob_addr, + .set_pci_memory_regions = gaudi_set_pci_memory_regions, + .get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr, + .check_if_razwi_happened = gaudi_check_if_razwi_happened, + .mmu_get_real_page_size = hl_mmu_get_real_page_size, + .access_dev_mem = hl_access_dev_mem, + .set_dram_bar_base = gaudi_set_hbm_bar_base, + .send_device_activity = gaudi_send_device_activity, +}; + +/** + * gaudi_set_asic_funcs - set GAUDI function pointers + * + * @hdev: pointer to hl_device structure + * + */ +void gaudi_set_asic_funcs(struct hl_device *hdev) +{ + hdev->asic_funcs = &gaudi_funcs; +} |