summaryrefslogtreecommitdiffstats
path: root/drivers/misc/sgi-gru
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/sgi-gru')
-rw-r--r--drivers/misc/sgi-gru/Makefile5
-rw-r--r--drivers/misc/sgi-gru/gru.h78
-rw-r--r--drivers/misc/sgi-gru/gru_instructions.h736
-rw-r--r--drivers/misc/sgi-gru/grufault.c907
-rw-r--r--drivers/misc/sgi-gru/grufile.c623
-rw-r--r--drivers/misc/sgi-gru/gruhandles.c210
-rw-r--r--drivers/misc/sgi-gru/gruhandles.h530
-rw-r--r--drivers/misc/sgi-gru/grukdump.c236
-rw-r--r--drivers/misc/sgi-gru/grukservices.c1171
-rw-r--r--drivers/misc/sgi-gru/grukservices.h214
-rw-r--r--drivers/misc/sgi-gru/grulib.h153
-rw-r--r--drivers/misc/sgi-gru/grumain.c976
-rw-r--r--drivers/misc/sgi-gru/gruprocfs.c324
-rw-r--r--drivers/misc/sgi-gru/grutables.h679
-rw-r--r--drivers/misc/sgi-gru/grutlbpurge.c370
15 files changed, 7212 insertions, 0 deletions
diff --git a/drivers/misc/sgi-gru/Makefile b/drivers/misc/sgi-gru/Makefile
new file mode 100644
index 000000000..0003a1d56
--- /dev/null
+++ b/drivers/misc/sgi-gru/Makefile
@@ -0,0 +1,5 @@
+ccflags-$(CONFIG_SGI_GRU_DEBUG) := -DDEBUG
+
+obj-$(CONFIG_SGI_GRU) := gru.o
+gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o gruhandles.o grukdump.o
+
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
new file mode 100644
index 000000000..3ad76cd18
--- /dev/null
+++ b/drivers/misc/sgi-gru/gru.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __GRU_H__
+#define __GRU_H__
+
+/*
+ * GRU architectural definitions
+ */
+#define GRU_CACHE_LINE_BYTES 64
+#define GRU_HANDLE_STRIDE 256
+#define GRU_CB_BASE 0
+#define GRU_DS_BASE 0x20000
+
+/*
+ * Size used to map GRU GSeg
+ */
+#if defined(CONFIG_IA64)
+#define GRU_GSEG_PAGESIZE (256 * 1024UL)
+#elif defined(CONFIG_X86_64)
+#define GRU_GSEG_PAGESIZE (256 * 1024UL) /* ZZZ 2MB ??? */
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * Structure for obtaining GRU resource information
+ */
+struct gru_chiplet_info {
+ int node;
+ int chiplet;
+ int blade;
+ int total_dsr_bytes;
+ int total_cbr;
+ int total_user_dsr_bytes;
+ int total_user_cbr;
+ int free_user_dsr_bytes;
+ int free_user_cbr;
+};
+
+/*
+ * Statictics kept for each context.
+ */
+struct gru_gseg_statistics {
+ unsigned long fmm_tlbmiss;
+ unsigned long upm_tlbmiss;
+ unsigned long tlbdropin;
+ unsigned long context_stolen;
+ unsigned long reserved[10];
+};
+
+/* Flags for GRU options on the gru_create_context() call */
+/* Select one of the follow 4 options to specify how TLB misses are handled */
+#define GRU_OPT_MISS_DEFAULT 0x0000 /* Use default mode */
+#define GRU_OPT_MISS_USER_POLL 0x0001 /* User will poll CB for faults */
+#define GRU_OPT_MISS_FMM_INTR 0x0002 /* Send interrupt to cpu to
+ handle fault */
+#define GRU_OPT_MISS_FMM_POLL 0x0003 /* Use system polling thread */
+#define GRU_OPT_MISS_MASK 0x0003 /* Mask for TLB MISS option */
+
+
+
+#endif /* __GRU_H__ */
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
new file mode 100644
index 000000000..04d5170ac
--- /dev/null
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -0,0 +1,736 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __GRU_INSTRUCTIONS_H__
+#define __GRU_INSTRUCTIONS_H__
+
+extern int gru_check_status_proc(void *cb);
+extern int gru_wait_proc(void *cb);
+extern void gru_wait_abort_proc(void *cb);
+
+
+
+/*
+ * Architecture dependent functions
+ */
+
+#if defined(CONFIG_IA64)
+#include <linux/compiler.h>
+#include <asm/intrinsics.h>
+#define __flush_cache(p) ia64_fc((unsigned long)p)
+/* Use volatile on IA64 to ensure ordering via st4.rel */
+#define gru_ordered_store_ulong(p, v) \
+ do { \
+ barrier(); \
+ *((volatile unsigned long *)(p)) = v; /* force st.rel */ \
+ } while (0)
+#elif defined(CONFIG_X86_64)
+#include <asm/cacheflush.h>
+#define __flush_cache(p) clflush(p)
+#define gru_ordered_store_ulong(p, v) \
+ do { \
+ barrier(); \
+ *(unsigned long *)p = v; \
+ } while (0)
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * Control block status and exception codes
+ */
+#define CBS_IDLE 0
+#define CBS_EXCEPTION 1
+#define CBS_ACTIVE 2
+#define CBS_CALL_OS 3
+
+/* CB substatus bitmasks */
+#define CBSS_MSG_QUEUE_MASK 7
+#define CBSS_IMPLICIT_ABORT_ACTIVE_MASK 8
+
+/* CB substatus message queue values (low 3 bits of substatus) */
+#define CBSS_NO_ERROR 0
+#define CBSS_LB_OVERFLOWED 1
+#define CBSS_QLIMIT_REACHED 2
+#define CBSS_PAGE_OVERFLOW 3
+#define CBSS_AMO_NACKED 4
+#define CBSS_PUT_NACKED 5
+
+/*
+ * Structure used to fetch exception detail for CBs that terminate with
+ * CBS_EXCEPTION
+ */
+struct control_block_extended_exc_detail {
+ unsigned long cb;
+ int opc;
+ int ecause;
+ int exopc;
+ long exceptdet0;
+ int exceptdet1;
+ int cbrstate;
+ int cbrexecstatus;
+};
+
+/*
+ * Instruction formats
+ */
+
+/*
+ * Generic instruction format.
+ * This definition has precise bit field definitions.
+ */
+struct gru_instruction_bits {
+ /* DW 0 - low */
+ unsigned int icmd: 1;
+ unsigned char ima: 3; /* CB_DelRep, unmapped mode */
+ unsigned char reserved0: 4;
+ unsigned int xtype: 3;
+ unsigned int iaa0: 2;
+ unsigned int iaa1: 2;
+ unsigned char reserved1: 1;
+ unsigned char opc: 8; /* opcode */
+ unsigned char exopc: 8; /* extended opcode */
+ /* DW 0 - high */
+ unsigned int idef2: 22; /* TRi0 */
+ unsigned char reserved2: 2;
+ unsigned char istatus: 2;
+ unsigned char isubstatus:4;
+ unsigned char reserved3: 1;
+ unsigned char tlb_fault_color: 1;
+ /* DW 1 */
+ unsigned long idef4; /* 42 bits: TRi1, BufSize */
+ /* DW 2-6 */
+ unsigned long idef1; /* BAddr0 */
+ unsigned long idef5; /* Nelem */
+ unsigned long idef6; /* Stride, Operand1 */
+ unsigned long idef3; /* BAddr1, Value, Operand2 */
+ unsigned long reserved4;
+ /* DW 7 */
+ unsigned long avalue; /* AValue */
+};
+
+/*
+ * Generic instruction with friendlier names. This format is used
+ * for inline instructions.
+ */
+struct gru_instruction {
+ /* DW 0 */
+ union {
+ unsigned long op64; /* icmd,xtype,iaa0,ima,opc,tri0 */
+ struct {
+ unsigned int op32;
+ unsigned int tri0;
+ };
+ };
+ unsigned long tri1_bufsize; /* DW 1 */
+ unsigned long baddr0; /* DW 2 */
+ unsigned long nelem; /* DW 3 */
+ unsigned long op1_stride; /* DW 4 */
+ unsigned long op2_value_baddr1; /* DW 5 */
+ unsigned long reserved0; /* DW 6 */
+ unsigned long avalue; /* DW 7 */
+};
+
+/* Some shifts and masks for the low 64 bits of a GRU command */
+#define GRU_CB_ICMD_SHFT 0
+#define GRU_CB_ICMD_MASK 0x1
+#define GRU_CB_XTYPE_SHFT 8
+#define GRU_CB_XTYPE_MASK 0x7
+#define GRU_CB_IAA0_SHFT 11
+#define GRU_CB_IAA0_MASK 0x3
+#define GRU_CB_IAA1_SHFT 13
+#define GRU_CB_IAA1_MASK 0x3
+#define GRU_CB_IMA_SHFT 1
+#define GRU_CB_IMA_MASK 0x3
+#define GRU_CB_OPC_SHFT 16
+#define GRU_CB_OPC_MASK 0xff
+#define GRU_CB_EXOPC_SHFT 24
+#define GRU_CB_EXOPC_MASK 0xff
+#define GRU_IDEF2_SHFT 32
+#define GRU_IDEF2_MASK 0x3ffff
+#define GRU_ISTATUS_SHFT 56
+#define GRU_ISTATUS_MASK 0x3
+
+/* GRU instruction opcodes (opc field) */
+#define OP_NOP 0x00
+#define OP_BCOPY 0x01
+#define OP_VLOAD 0x02
+#define OP_IVLOAD 0x03
+#define OP_VSTORE 0x04
+#define OP_IVSTORE 0x05
+#define OP_VSET 0x06
+#define OP_IVSET 0x07
+#define OP_MESQ 0x08
+#define OP_GAMXR 0x09
+#define OP_GAMIR 0x0a
+#define OP_GAMIRR 0x0b
+#define OP_GAMER 0x0c
+#define OP_GAMERR 0x0d
+#define OP_BSTORE 0x0e
+#define OP_VFLUSH 0x0f
+
+
+/* Extended opcodes values (exopc field) */
+
+/* GAMIR - AMOs with implicit operands */
+#define EOP_IR_FETCH 0x01 /* Plain fetch of memory */
+#define EOP_IR_CLR 0x02 /* Fetch and clear */
+#define EOP_IR_INC 0x05 /* Fetch and increment */
+#define EOP_IR_DEC 0x07 /* Fetch and decrement */
+#define EOP_IR_QCHK1 0x0d /* Queue check, 64 byte msg */
+#define EOP_IR_QCHK2 0x0e /* Queue check, 128 byte msg */
+
+/* GAMIRR - Registered AMOs with implicit operands */
+#define EOP_IRR_FETCH 0x01 /* Registered fetch of memory */
+#define EOP_IRR_CLR 0x02 /* Registered fetch and clear */
+#define EOP_IRR_INC 0x05 /* Registered fetch and increment */
+#define EOP_IRR_DEC 0x07 /* Registered fetch and decrement */
+#define EOP_IRR_DECZ 0x0f /* Registered fetch and decrement, update on zero*/
+
+/* GAMER - AMOs with explicit operands */
+#define EOP_ER_SWAP 0x00 /* Exchange argument and memory */
+#define EOP_ER_OR 0x01 /* Logical OR with memory */
+#define EOP_ER_AND 0x02 /* Logical AND with memory */
+#define EOP_ER_XOR 0x03 /* Logical XOR with memory */
+#define EOP_ER_ADD 0x04 /* Add value to memory */
+#define EOP_ER_CSWAP 0x08 /* Compare with operand2, write operand1 if match*/
+#define EOP_ER_CADD 0x0c /* Queue check, operand1*64 byte msg */
+
+/* GAMERR - Registered AMOs with explicit operands */
+#define EOP_ERR_SWAP 0x00 /* Exchange argument and memory */
+#define EOP_ERR_OR 0x01 /* Logical OR with memory */
+#define EOP_ERR_AND 0x02 /* Logical AND with memory */
+#define EOP_ERR_XOR 0x03 /* Logical XOR with memory */
+#define EOP_ERR_ADD 0x04 /* Add value to memory */
+#define EOP_ERR_CSWAP 0x08 /* Compare with operand2, write operand1 if match*/
+#define EOP_ERR_EPOLL 0x09 /* Poll for equality */
+#define EOP_ERR_NPOLL 0x0a /* Poll for inequality */
+
+/* GAMXR - SGI Arithmetic unit */
+#define EOP_XR_CSWAP 0x0b /* Masked compare exchange */
+
+
+/* Transfer types (xtype field) */
+#define XTYPE_B 0x0 /* byte */
+#define XTYPE_S 0x1 /* short (2-byte) */
+#define XTYPE_W 0x2 /* word (4-byte) */
+#define XTYPE_DW 0x3 /* doubleword (8-byte) */
+#define XTYPE_CL 0x6 /* cacheline (64-byte) */
+
+
+/* Instruction access attributes (iaa0, iaa1 fields) */
+#define IAA_RAM 0x0 /* normal cached RAM access */
+#define IAA_NCRAM 0x2 /* noncoherent RAM access */
+#define IAA_MMIO 0x1 /* noncoherent memory-mapped I/O space */
+#define IAA_REGISTER 0x3 /* memory-mapped registers, etc. */
+
+
+/* Instruction mode attributes (ima field) */
+#define IMA_MAPPED 0x0 /* Virtual mode */
+#define IMA_CB_DELAY 0x1 /* hold read responses until status changes */
+#define IMA_UNMAPPED 0x2 /* bypass the TLBs (OS only) */
+#define IMA_INTERRUPT 0x4 /* Interrupt when instruction completes */
+
+/* CBE ecause bits */
+#define CBE_CAUSE_RI (1 << 0)
+#define CBE_CAUSE_INVALID_INSTRUCTION (1 << 1)
+#define CBE_CAUSE_UNMAPPED_MODE_FORBIDDEN (1 << 2)
+#define CBE_CAUSE_PE_CHECK_DATA_ERROR (1 << 3)
+#define CBE_CAUSE_IAA_GAA_MISMATCH (1 << 4)
+#define CBE_CAUSE_DATA_SEGMENT_LIMIT_EXCEPTION (1 << 5)
+#define CBE_CAUSE_OS_FATAL_TLB_FAULT (1 << 6)
+#define CBE_CAUSE_EXECUTION_HW_ERROR (1 << 7)
+#define CBE_CAUSE_TLBHW_ERROR (1 << 8)
+#define CBE_CAUSE_RA_REQUEST_TIMEOUT (1 << 9)
+#define CBE_CAUSE_HA_REQUEST_TIMEOUT (1 << 10)
+#define CBE_CAUSE_RA_RESPONSE_FATAL (1 << 11)
+#define CBE_CAUSE_RA_RESPONSE_NON_FATAL (1 << 12)
+#define CBE_CAUSE_HA_RESPONSE_FATAL (1 << 13)
+#define CBE_CAUSE_HA_RESPONSE_NON_FATAL (1 << 14)
+#define CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR (1 << 15)
+#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR (1 << 16)
+#define CBE_CAUSE_RA_RESPONSE_DATA_ERROR (1 << 17)
+#define CBE_CAUSE_HA_RESPONSE_DATA_ERROR (1 << 18)
+#define CBE_CAUSE_FORCED_ERROR (1 << 19)
+
+/* CBE cbrexecstatus bits */
+#define CBR_EXS_ABORT_OCC_BIT 0
+#define CBR_EXS_INT_OCC_BIT 1
+#define CBR_EXS_PENDING_BIT 2
+#define CBR_EXS_QUEUED_BIT 3
+#define CBR_EXS_TLB_INVAL_BIT 4
+#define CBR_EXS_EXCEPTION_BIT 5
+#define CBR_EXS_CB_INT_PENDING_BIT 6
+
+#define CBR_EXS_ABORT_OCC (1 << CBR_EXS_ABORT_OCC_BIT)
+#define CBR_EXS_INT_OCC (1 << CBR_EXS_INT_OCC_BIT)
+#define CBR_EXS_PENDING (1 << CBR_EXS_PENDING_BIT)
+#define CBR_EXS_QUEUED (1 << CBR_EXS_QUEUED_BIT)
+#define CBR_EXS_TLB_INVAL (1 << CBR_EXS_TLB_INVAL_BIT)
+#define CBR_EXS_EXCEPTION (1 << CBR_EXS_EXCEPTION_BIT)
+#define CBR_EXS_CB_INT_PENDING (1 << CBR_EXS_CB_INT_PENDING_BIT)
+
+/*
+ * Exceptions are retried for the following cases. If any OTHER bits are set
+ * in ecause, the exception is not retryable.
+ */
+#define EXCEPTION_RETRY_BITS (CBE_CAUSE_EXECUTION_HW_ERROR | \
+ CBE_CAUSE_TLBHW_ERROR | \
+ CBE_CAUSE_RA_REQUEST_TIMEOUT | \
+ CBE_CAUSE_RA_RESPONSE_NON_FATAL | \
+ CBE_CAUSE_HA_RESPONSE_NON_FATAL | \
+ CBE_CAUSE_RA_RESPONSE_DATA_ERROR | \
+ CBE_CAUSE_HA_RESPONSE_DATA_ERROR \
+ )
+
+/* Message queue head structure */
+union gru_mesqhead {
+ unsigned long val;
+ struct {
+ unsigned int head;
+ unsigned int limit;
+ };
+};
+
+
+/* Generate the low word of a GRU instruction */
+static inline unsigned long
+__opdword(unsigned char opcode, unsigned char exopc, unsigned char xtype,
+ unsigned char iaa0, unsigned char iaa1,
+ unsigned long idef2, unsigned char ima)
+{
+ return (1 << GRU_CB_ICMD_SHFT) |
+ ((unsigned long)CBS_ACTIVE << GRU_ISTATUS_SHFT) |
+ (idef2<< GRU_IDEF2_SHFT) |
+ (iaa0 << GRU_CB_IAA0_SHFT) |
+ (iaa1 << GRU_CB_IAA1_SHFT) |
+ (ima << GRU_CB_IMA_SHFT) |
+ (xtype << GRU_CB_XTYPE_SHFT) |
+ (opcode << GRU_CB_OPC_SHFT) |
+ (exopc << GRU_CB_EXOPC_SHFT);
+}
+
+/*
+ * Architecture specific intrinsics
+ */
+static inline void gru_flush_cache(void *p)
+{
+ __flush_cache(p);
+}
+
+/*
+ * Store the lower 64 bits of the command including the "start" bit. Then
+ * start the instruction executing.
+ */
+static inline void gru_start_instruction(struct gru_instruction *ins, unsigned long op64)
+{
+ gru_ordered_store_ulong(ins, op64);
+ mb();
+ gru_flush_cache(ins);
+}
+
+
+/* Convert "hints" to IMA */
+#define CB_IMA(h) ((h) | IMA_UNMAPPED)
+
+/* Convert data segment cache line index into TRI0 / TRI1 value */
+#define GRU_DINDEX(i) ((i) * GRU_CACHE_LINE_BYTES)
+
+/* Inline functions for GRU instructions.
+ * Note:
+ * - nelem and stride are in elements
+ * - tri0/tri1 is in bytes for the beginning of the data segment.
+ */
+static inline void gru_vload_phys(void *cb, unsigned long gpa,
+ unsigned int tri0, int iaa, unsigned long hints)
+{
+ struct gru_instruction *ins = (struct gru_instruction *)cb;
+
+ ins->baddr0 = (long)gpa | ((unsigned long)iaa << 62);
+ ins->nelem = 1;
+ ins->op1_stride = 1;
+ gru_start_instruction(ins, __opdword(OP_VLOAD, 0, XTYPE_DW, iaa, 0,
+ (unsigned long)tri0, CB_IMA(hints)));
+}
+
+static inline void gru_vstore_phys(void *cb, unsigned long gpa,
+ unsigned int tri0, int iaa, unsigned long hints)
+{
+ struct gru_instruction *ins = (struct gru_instruction *)cb;
+
+ ins->baddr0 = (long)gpa | ((unsigned long)iaa << 62);
+ ins->nelem = 1;
+ ins->op1_stride = 1;
+ gru_start_instruction(ins, __opdword(OP_VSTORE, 0, XTYPE_DW, iaa, 0,
+ (unsigned long)tri0, CB_IMA(hints)));
+}
+
+static inline void gru_vload(void *cb, unsigned long mem_addr,
+ unsigned int tri0, unsigned char xtype, unsigned long nelem,
+ unsigned long stride, unsigned long hints)
+{
+ struct gru_instruction *ins = (struct gru_instruction *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->nelem = nelem;
+ ins->op1_stride = stride;
+ gru_start_instruction(ins, __opdword(OP_VLOAD, 0, xtype, IAA_RAM, 0,
+ (unsigned long)tri0, CB_IMA(hints)));
+}
+
+static inline void gru_vstore(void *cb, unsigned long mem_addr,
+ unsigned int tri0, unsigned char xtype, unsigned long nelem,
+ unsigned long stride, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->nelem = nelem;
+ ins->op1_stride = stride;
+ gru_start_instruction(ins, __opdword(OP_VSTORE, 0, xtype, IAA_RAM, 0,
+ tri0, CB_IMA(hints)));
+}
+
+static inline void gru_ivload(void *cb, unsigned long mem_addr,
+ unsigned int tri0, unsigned int tri1, unsigned char xtype,
+ unsigned long nelem, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->nelem = nelem;
+ ins->tri1_bufsize = tri1;
+ gru_start_instruction(ins, __opdword(OP_IVLOAD, 0, xtype, IAA_RAM, 0,
+ tri0, CB_IMA(hints)));
+}
+
+static inline void gru_ivstore(void *cb, unsigned long mem_addr,
+ unsigned int tri0, unsigned int tri1,
+ unsigned char xtype, unsigned long nelem, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->nelem = nelem;
+ ins->tri1_bufsize = tri1;
+ gru_start_instruction(ins, __opdword(OP_IVSTORE, 0, xtype, IAA_RAM, 0,
+ tri0, CB_IMA(hints)));
+}
+
+static inline void gru_vset(void *cb, unsigned long mem_addr,
+ unsigned long value, unsigned char xtype, unsigned long nelem,
+ unsigned long stride, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->op2_value_baddr1 = value;
+ ins->nelem = nelem;
+ ins->op1_stride = stride;
+ gru_start_instruction(ins, __opdword(OP_VSET, 0, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_ivset(void *cb, unsigned long mem_addr,
+ unsigned int tri1, unsigned long value, unsigned char xtype,
+ unsigned long nelem, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->op2_value_baddr1 = value;
+ ins->nelem = nelem;
+ ins->tri1_bufsize = tri1;
+ gru_start_instruction(ins, __opdword(OP_IVSET, 0, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_vflush(void *cb, unsigned long mem_addr,
+ unsigned long nelem, unsigned char xtype, unsigned long stride,
+ unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)mem_addr;
+ ins->op1_stride = stride;
+ ins->nelem = nelem;
+ gru_start_instruction(ins, __opdword(OP_VFLUSH, 0, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_nop(void *cb, int hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ gru_start_instruction(ins, __opdword(OP_NOP, 0, 0, 0, 0, 0, CB_IMA(hints)));
+}
+
+
+static inline void gru_bcopy(void *cb, const unsigned long src,
+ unsigned long dest,
+ unsigned int tri0, unsigned int xtype, unsigned long nelem,
+ unsigned int bufsize, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ ins->op2_value_baddr1 = (long)dest;
+ ins->nelem = nelem;
+ ins->tri1_bufsize = bufsize;
+ gru_start_instruction(ins, __opdword(OP_BCOPY, 0, xtype, IAA_RAM,
+ IAA_RAM, tri0, CB_IMA(hints)));
+}
+
+static inline void gru_bstore(void *cb, const unsigned long src,
+ unsigned long dest, unsigned int tri0, unsigned int xtype,
+ unsigned long nelem, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ ins->op2_value_baddr1 = (long)dest;
+ ins->nelem = nelem;
+ gru_start_instruction(ins, __opdword(OP_BSTORE, 0, xtype, 0, IAA_RAM,
+ tri0, CB_IMA(hints)));
+}
+
+static inline void gru_gamir(void *cb, int exopc, unsigned long src,
+ unsigned int xtype, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ gru_start_instruction(ins, __opdword(OP_GAMIR, exopc, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_gamirr(void *cb, int exopc, unsigned long src,
+ unsigned int xtype, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ gru_start_instruction(ins, __opdword(OP_GAMIRR, exopc, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_gamer(void *cb, int exopc, unsigned long src,
+ unsigned int xtype,
+ unsigned long operand1, unsigned long operand2,
+ unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ ins->op1_stride = operand1;
+ ins->op2_value_baddr1 = operand2;
+ gru_start_instruction(ins, __opdword(OP_GAMER, exopc, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_gamerr(void *cb, int exopc, unsigned long src,
+ unsigned int xtype, unsigned long operand1,
+ unsigned long operand2, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ ins->op1_stride = operand1;
+ ins->op2_value_baddr1 = operand2;
+ gru_start_instruction(ins, __opdword(OP_GAMERR, exopc, xtype, IAA_RAM, 0,
+ 0, CB_IMA(hints)));
+}
+
+static inline void gru_gamxr(void *cb, unsigned long src,
+ unsigned int tri0, unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)src;
+ ins->nelem = 4;
+ gru_start_instruction(ins, __opdword(OP_GAMXR, EOP_XR_CSWAP, XTYPE_DW,
+ IAA_RAM, 0, 0, CB_IMA(hints)));
+}
+
+static inline void gru_mesq(void *cb, unsigned long queue,
+ unsigned long tri0, unsigned long nelem,
+ unsigned long hints)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ ins->baddr0 = (long)queue;
+ ins->nelem = nelem;
+ gru_start_instruction(ins, __opdword(OP_MESQ, 0, XTYPE_CL, IAA_RAM, 0,
+ tri0, CB_IMA(hints)));
+}
+
+static inline unsigned long gru_get_amo_value(void *cb)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ return ins->avalue;
+}
+
+static inline int gru_get_amo_value_head(void *cb)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ return ins->avalue & 0xffffffff;
+}
+
+static inline int gru_get_amo_value_limit(void *cb)
+{
+ struct gru_instruction *ins = (void *)cb;
+
+ return ins->avalue >> 32;
+}
+
+static inline union gru_mesqhead gru_mesq_head(int head, int limit)
+{
+ union gru_mesqhead mqh;
+
+ mqh.head = head;
+ mqh.limit = limit;
+ return mqh;
+}
+
+/*
+ * Get struct control_block_extended_exc_detail for CB.
+ */
+extern int gru_get_cb_exception_detail(void *cb,
+ struct control_block_extended_exc_detail *excdet);
+
+#define GRU_EXC_STR_SIZE 256
+
+
+/*
+ * Control block definition for checking status
+ */
+struct gru_control_block_status {
+ unsigned int icmd :1;
+ unsigned int ima :3;
+ unsigned int reserved0 :4;
+ unsigned int unused1 :24;
+ unsigned int unused2 :24;
+ unsigned int istatus :2;
+ unsigned int isubstatus :4;
+ unsigned int unused3 :2;
+};
+
+/* Get CB status */
+static inline int gru_get_cb_status(void *cb)
+{
+ struct gru_control_block_status *cbs = (void *)cb;
+
+ return cbs->istatus;
+}
+
+/* Get CB message queue substatus */
+static inline int gru_get_cb_message_queue_substatus(void *cb)
+{
+ struct gru_control_block_status *cbs = (void *)cb;
+
+ return cbs->isubstatus & CBSS_MSG_QUEUE_MASK;
+}
+
+/* Get CB substatus */
+static inline int gru_get_cb_substatus(void *cb)
+{
+ struct gru_control_block_status *cbs = (void *)cb;
+
+ return cbs->isubstatus;
+}
+
+/*
+ * User interface to check an instruction status. UPM and exceptions
+ * are handled automatically. However, this function does NOT wait
+ * for an active instruction to complete.
+ *
+ */
+static inline int gru_check_status(void *cb)
+{
+ struct gru_control_block_status *cbs = (void *)cb;
+ int ret;
+
+ ret = cbs->istatus;
+ if (ret != CBS_ACTIVE)
+ ret = gru_check_status_proc(cb);
+ return ret;
+}
+
+/*
+ * User interface (via inline function) to wait for an instruction
+ * to complete. Completion status (IDLE or EXCEPTION is returned
+ * to the user. Exception due to hardware errors are automatically
+ * retried before returning an exception.
+ *
+ */
+static inline int gru_wait(void *cb)
+{
+ return gru_wait_proc(cb);
+}
+
+/*
+ * Wait for CB to complete. Aborts program if error. (Note: error does NOT
+ * mean TLB mis - only fatal errors such as memory parity error or user
+ * bugs will cause termination.
+ */
+static inline void gru_wait_abort(void *cb)
+{
+ gru_wait_abort_proc(cb);
+}
+
+/*
+ * Get a pointer to the start of a gseg
+ * p - Any valid pointer within the gseg
+ */
+static inline void *gru_get_gseg_pointer (void *p)
+{
+ return (void *)((unsigned long)p & ~(GRU_GSEG_PAGESIZE - 1));
+}
+
+/*
+ * Get a pointer to a control block
+ * gseg - GSeg address returned from gru_get_thread_gru_segment()
+ * index - index of desired CB
+ */
+static inline void *gru_get_cb_pointer(void *gseg,
+ int index)
+{
+ return gseg + GRU_CB_BASE + index * GRU_HANDLE_STRIDE;
+}
+
+/*
+ * Get a pointer to a cacheline in the data segment portion of a GSeg
+ * gseg - GSeg address returned from gru_get_thread_gru_segment()
+ * index - index of desired cache line
+ */
+static inline void *gru_get_data_pointer(void *gseg, int index)
+{
+ return gseg + GRU_DS_BASE + index * GRU_CACHE_LINE_BYTES;
+}
+
+/*
+ * Convert a vaddr into the tri index within the GSEG
+ * vaddr - virtual address of within gseg
+ */
+static inline int gru_get_tri(void *vaddr)
+{
+ return ((unsigned long)vaddr & (GRU_GSEG_PAGESIZE - 1)) - GRU_DS_BASE;
+}
+#endif /* __GRU_INSTRUCTIONS_H__ */
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
new file mode 100644
index 000000000..93be82fc3
--- /dev/null
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -0,0 +1,907 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * FAULT HANDLER FOR GRU DETECTED TLB MISSES
+ *
+ * This file contains code that handles TLB misses within the GRU.
+ * These misses are reported either via interrupts or user polling of
+ * the user CB.
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/security.h>
+#include <linux/prefetch.h>
+#include <asm/pgtable.h>
+#include "gru.h"
+#include "grutables.h"
+#include "grulib.h"
+#include "gru_instructions.h"
+#include <asm/uv/uv_hub.h>
+
+/* Return codes for vtop functions */
+#define VTOP_SUCCESS 0
+#define VTOP_INVALID -1
+#define VTOP_RETRY -2
+
+
+/*
+ * Test if a physical address is a valid GRU GSEG address
+ */
+static inline int is_gru_paddr(unsigned long paddr)
+{
+ return paddr >= gru_start_paddr && paddr < gru_end_paddr;
+}
+
+/*
+ * Find the vma of a GRU segment. Caller must hold mmap_sem.
+ */
+struct vm_area_struct *gru_find_vma(unsigned long vaddr)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(current->mm, vaddr);
+ if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
+ return vma;
+ return NULL;
+}
+
+/*
+ * Find and lock the gts that contains the specified user vaddr.
+ *
+ * Returns:
+ * - *gts with the mmap_sem locked for read and the GTS locked.
+ * - NULL if vaddr invalid OR is not a valid GSEG vaddr.
+ */
+
+static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct gru_thread_state *gts = NULL;
+
+ down_read(&mm->mmap_sem);
+ vma = gru_find_vma(vaddr);
+ if (vma)
+ gts = gru_find_thread_state(vma, TSID(vaddr, vma));
+ if (gts)
+ mutex_lock(&gts->ts_ctxlock);
+ else
+ up_read(&mm->mmap_sem);
+ return gts;
+}
+
+static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct gru_thread_state *gts = ERR_PTR(-EINVAL);
+
+ down_write(&mm->mmap_sem);
+ vma = gru_find_vma(vaddr);
+ if (!vma)
+ goto err;
+
+ gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
+ if (IS_ERR(gts))
+ goto err;
+ mutex_lock(&gts->ts_ctxlock);
+ downgrade_write(&mm->mmap_sem);
+ return gts;
+
+err:
+ up_write(&mm->mmap_sem);
+ return gts;
+}
+
+/*
+ * Unlock a GTS that was previously locked with gru_find_lock_gts().
+ */
+static void gru_unlock_gts(struct gru_thread_state *gts)
+{
+ mutex_unlock(&gts->ts_ctxlock);
+ up_read(&current->mm->mmap_sem);
+}
+
+/*
+ * Set a CB.istatus to active using a user virtual address. This must be done
+ * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
+ * If the line is evicted, the status may be lost. The in-cache update
+ * is necessary to prevent the user from seeing a stale cb.istatus that will
+ * change as soon as the TFH restart is complete. Races may cause an
+ * occasional failure to clear the cb.istatus, but that is ok.
+ */
+static void gru_cb_set_istatus_active(struct gru_instruction_bits *cbk)
+{
+ if (cbk) {
+ cbk->istatus = CBS_ACTIVE;
+ }
+}
+
+/*
+ * Read & clear a TFM
+ *
+ * The GRU has an array of fault maps. A map is private to a cpu
+ * Only one cpu will be accessing a cpu's fault map.
+ *
+ * This function scans the cpu-private fault map & clears all bits that
+ * are set. The function returns a bitmap that indicates the bits that
+ * were cleared. Note that sense the maps may be updated asynchronously by
+ * the GRU, atomic operations must be used to clear bits.
+ */
+static void get_clear_fault_map(struct gru_state *gru,
+ struct gru_tlb_fault_map *imap,
+ struct gru_tlb_fault_map *dmap)
+{
+ unsigned long i, k;
+ struct gru_tlb_fault_map *tfm;
+
+ tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
+ prefetchw(tfm); /* Helps on hardware, required for emulator */
+ for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
+ k = tfm->fault_bits[i];
+ if (k)
+ k = xchg(&tfm->fault_bits[i], 0UL);
+ imap->fault_bits[i] = k;
+ k = tfm->done_bits[i];
+ if (k)
+ k = xchg(&tfm->done_bits[i], 0UL);
+ dmap->fault_bits[i] = k;
+ }
+
+ /*
+ * Not functionally required but helps performance. (Required
+ * on emulator)
+ */
+ gru_flush_cache(tfm);
+}
+
+/*
+ * Atomic (interrupt context) & non-atomic (user context) functions to
+ * convert a vaddr into a physical address. The size of the page
+ * is returned in pageshift.
+ * returns:
+ * 0 - successful
+ * < 0 - error code
+ * 1 - (atomic only) try again in non-atomic context
+ */
+static int non_atomic_pte_lookup(struct vm_area_struct *vma,
+ unsigned long vaddr, int write,
+ unsigned long *paddr, int *pageshift)
+{
+ struct page *page;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ *pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
+#else
+ *pageshift = PAGE_SHIFT;
+#endif
+ if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0)
+ return -EFAULT;
+ *paddr = page_to_phys(page);
+ put_page(page);
+ return 0;
+}
+
+/*
+ * atomic_pte_lookup
+ *
+ * Convert a user virtual address to a physical address
+ * Only supports Intel large pages (2MB only) on x86_64.
+ * ZZZ - hugepage support is incomplete
+ *
+ * NOTE: mmap_sem is already held on entry to this function. This
+ * guarantees existence of the page tables.
+ */
+static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
+ int write, unsigned long *paddr, int *pageshift)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t pte;
+
+ pgdp = pgd_offset(vma->vm_mm, vaddr);
+ if (unlikely(pgd_none(*pgdp)))
+ goto err;
+
+ p4dp = p4d_offset(pgdp, vaddr);
+ if (unlikely(p4d_none(*p4dp)))
+ goto err;
+
+ pudp = pud_offset(p4dp, vaddr);
+ if (unlikely(pud_none(*pudp)))
+ goto err;
+
+ pmdp = pmd_offset(pudp, vaddr);
+ if (unlikely(pmd_none(*pmdp)))
+ goto err;
+#ifdef CONFIG_X86_64
+ if (unlikely(pmd_large(*pmdp)))
+ pte = *(pte_t *) pmdp;
+ else
+#endif
+ pte = *pte_offset_kernel(pmdp, vaddr);
+
+ if (unlikely(!pte_present(pte) ||
+ (write && (!pte_write(pte) || !pte_dirty(pte)))))
+ return 1;
+
+ *paddr = pte_pfn(pte) << PAGE_SHIFT;
+#ifdef CONFIG_HUGETLB_PAGE
+ *pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
+#else
+ *pageshift = PAGE_SHIFT;
+#endif
+ return 0;
+
+err:
+ return 1;
+}
+
+static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
+ int write, int atomic, unsigned long *gpa, int *pageshift)
+{
+ struct mm_struct *mm = gts->ts_mm;
+ struct vm_area_struct *vma;
+ unsigned long paddr;
+ int ret, ps;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ goto inval;
+
+ /*
+ * Atomic lookup is faster & usually works even if called in non-atomic
+ * context.
+ */
+ rmb(); /* Must/check ms_range_active before loading PTEs */
+ ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
+ if (ret) {
+ if (atomic)
+ goto upm;
+ if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps))
+ goto inval;
+ }
+ if (is_gru_paddr(paddr))
+ goto inval;
+ paddr = paddr & ~((1UL << ps) - 1);
+ *gpa = uv_soc_phys_ram_to_gpa(paddr);
+ *pageshift = ps;
+ return VTOP_SUCCESS;
+
+inval:
+ return VTOP_INVALID;
+upm:
+ return VTOP_RETRY;
+}
+
+
+/*
+ * Flush a CBE from cache. The CBE is clean in the cache. Dirty the
+ * CBE cacheline so that the line will be written back to home agent.
+ * Otherwise the line may be silently dropped. This has no impact
+ * except on performance.
+ */
+static void gru_flush_cache_cbe(struct gru_control_block_extended *cbe)
+{
+ if (unlikely(cbe)) {
+ cbe->cbrexecstatus = 0; /* make CL dirty */
+ gru_flush_cache(cbe);
+ }
+}
+
+/*
+ * Preload the TLB with entries that may be required. Currently, preloading
+ * is implemented only for BCOPY. Preload <tlb_preload_count> pages OR to
+ * the end of the bcopy tranfer, whichever is smaller.
+ */
+static void gru_preload_tlb(struct gru_state *gru,
+ struct gru_thread_state *gts, int atomic,
+ unsigned long fault_vaddr, int asid, int write,
+ unsigned char tlb_preload_count,
+ struct gru_tlb_fault_handle *tfh,
+ struct gru_control_block_extended *cbe)
+{
+ unsigned long vaddr = 0, gpa;
+ int ret, pageshift;
+
+ if (cbe->opccpy != OP_BCOPY)
+ return;
+
+ if (fault_vaddr == cbe->cbe_baddr0)
+ vaddr = fault_vaddr + GRU_CACHE_LINE_BYTES * cbe->cbe_src_cl - 1;
+ else if (fault_vaddr == cbe->cbe_baddr1)
+ vaddr = fault_vaddr + (1 << cbe->xtypecpy) * cbe->cbe_nelemcur - 1;
+
+ fault_vaddr &= PAGE_MASK;
+ vaddr &= PAGE_MASK;
+ vaddr = min(vaddr, fault_vaddr + tlb_preload_count * PAGE_SIZE);
+
+ while (vaddr > fault_vaddr) {
+ ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
+ if (ret || tfh_write_only(tfh, gpa, GAA_RAM, vaddr, asid, write,
+ GRU_PAGESIZE(pageshift)))
+ return;
+ gru_dbg(grudev,
+ "%s: gid %d, gts 0x%p, tfh 0x%p, vaddr 0x%lx, asid 0x%x, rw %d, ps %d, gpa 0x%lx\n",
+ atomic ? "atomic" : "non-atomic", gru->gs_gid, gts, tfh,
+ vaddr, asid, write, pageshift, gpa);
+ vaddr -= PAGE_SIZE;
+ STAT(tlb_preload_page);
+ }
+}
+
+/*
+ * Drop a TLB entry into the GRU. The fault is described by info in an TFH.
+ * Input:
+ * cb Address of user CBR. Null if not running in user context
+ * Return:
+ * 0 = dropin, exception, or switch to UPM successful
+ * 1 = range invalidate active
+ * < 0 = error code
+ *
+ */
+static int gru_try_dropin(struct gru_state *gru,
+ struct gru_thread_state *gts,
+ struct gru_tlb_fault_handle *tfh,
+ struct gru_instruction_bits *cbk)
+{
+ struct gru_control_block_extended *cbe = NULL;
+ unsigned char tlb_preload_count = gts->ts_tlb_preload_count;
+ int pageshift = 0, asid, write, ret, atomic = !cbk, indexway;
+ unsigned long gpa = 0, vaddr = 0;
+
+ /*
+ * NOTE: The GRU contains magic hardware that eliminates races between
+ * TLB invalidates and TLB dropins. If an invalidate occurs
+ * in the window between reading the TFH and the subsequent TLB dropin,
+ * the dropin is ignored. This eliminates the need for additional locks.
+ */
+
+ /*
+ * Prefetch the CBE if doing TLB preloading
+ */
+ if (unlikely(tlb_preload_count)) {
+ cbe = gru_tfh_to_cbe(tfh);
+ prefetchw(cbe);
+ }
+
+ /*
+ * Error if TFH state is IDLE or FMM mode & the user issuing a UPM call.
+ * Might be a hardware race OR a stupid user. Ignore FMM because FMM
+ * is a transient state.
+ */
+ if (tfh->status != TFHSTATUS_EXCEPTION) {
+ gru_flush_cache(tfh);
+ sync_core();
+ if (tfh->status != TFHSTATUS_EXCEPTION)
+ goto failnoexception;
+ STAT(tfh_stale_on_fault);
+ }
+ if (tfh->state == TFHSTATE_IDLE)
+ goto failidle;
+ if (tfh->state == TFHSTATE_MISS_FMM && cbk)
+ goto failfmm;
+
+ write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
+ vaddr = tfh->missvaddr;
+ asid = tfh->missasid;
+ indexway = tfh->indexway;
+ if (asid == 0)
+ goto failnoasid;
+
+ rmb(); /* TFH must be cache resident before reading ms_range_active */
+
+ /*
+ * TFH is cache resident - at least briefly. Fail the dropin
+ * if a range invalidate is active.
+ */
+ if (atomic_read(&gts->ts_gms->ms_range_active))
+ goto failactive;
+
+ ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
+ if (ret == VTOP_INVALID)
+ goto failinval;
+ if (ret == VTOP_RETRY)
+ goto failupm;
+
+ if (!(gts->ts_sizeavail & GRU_SIZEAVAIL(pageshift))) {
+ gts->ts_sizeavail |= GRU_SIZEAVAIL(pageshift);
+ if (atomic || !gru_update_cch(gts)) {
+ gts->ts_force_cch_reload = 1;
+ goto failupm;
+ }
+ }
+
+ if (unlikely(cbe) && pageshift == PAGE_SHIFT) {
+ gru_preload_tlb(gru, gts, atomic, vaddr, asid, write, tlb_preload_count, tfh, cbe);
+ gru_flush_cache_cbe(cbe);
+ }
+
+ gru_cb_set_istatus_active(cbk);
+ gts->ustats.tlbdropin++;
+ tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
+ GRU_PAGESIZE(pageshift));
+ gru_dbg(grudev,
+ "%s: gid %d, gts 0x%p, tfh 0x%p, vaddr 0x%lx, asid 0x%x, indexway 0x%x,"
+ " rw %d, ps %d, gpa 0x%lx\n",
+ atomic ? "atomic" : "non-atomic", gru->gs_gid, gts, tfh, vaddr, asid,
+ indexway, write, pageshift, gpa);
+ STAT(tlb_dropin);
+ return 0;
+
+failnoasid:
+ /* No asid (delayed unload). */
+ STAT(tlb_dropin_fail_no_asid);
+ gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+ if (!cbk)
+ tfh_user_polling_mode(tfh);
+ else
+ gru_flush_cache(tfh);
+ gru_flush_cache_cbe(cbe);
+ return -EAGAIN;
+
+failupm:
+ /* Atomic failure switch CBR to UPM */
+ tfh_user_polling_mode(tfh);
+ gru_flush_cache_cbe(cbe);
+ STAT(tlb_dropin_fail_upm);
+ gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+ return 1;
+
+failfmm:
+ /* FMM state on UPM call */
+ gru_flush_cache(tfh);
+ gru_flush_cache_cbe(cbe);
+ STAT(tlb_dropin_fail_fmm);
+ gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
+ return 0;
+
+failnoexception:
+ /* TFH status did not show exception pending */
+ gru_flush_cache(tfh);
+ gru_flush_cache_cbe(cbe);
+ if (cbk)
+ gru_flush_cache(cbk);
+ STAT(tlb_dropin_fail_no_exception);
+ gru_dbg(grudev, "FAILED non-exception tfh: 0x%p, status %d, state %d\n",
+ tfh, tfh->status, tfh->state);
+ return 0;
+
+failidle:
+ /* TFH state was idle - no miss pending */
+ gru_flush_cache(tfh);
+ gru_flush_cache_cbe(cbe);
+ if (cbk)
+ gru_flush_cache(cbk);
+ STAT(tlb_dropin_fail_idle);
+ gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
+ return 0;
+
+failinval:
+ /* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
+ tfh_exception(tfh);
+ gru_flush_cache_cbe(cbe);
+ STAT(tlb_dropin_fail_invalid);
+ gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+ return -EFAULT;
+
+failactive:
+ /* Range invalidate active. Switch to UPM iff atomic */
+ if (!cbk)
+ tfh_user_polling_mode(tfh);
+ else
+ gru_flush_cache(tfh);
+ gru_flush_cache_cbe(cbe);
+ STAT(tlb_dropin_fail_range_active);
+ gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
+ tfh, vaddr);
+ return 1;
+}
+
+/*
+ * Process an external interrupt from the GRU. This interrupt is
+ * caused by a TLB miss.
+ * Note that this is the interrupt handler that is registered with linux
+ * interrupt handlers.
+ */
+static irqreturn_t gru_intr(int chiplet, int blade)
+{
+ struct gru_state *gru;
+ struct gru_tlb_fault_map imap, dmap;
+ struct gru_thread_state *gts;
+ struct gru_tlb_fault_handle *tfh = NULL;
+ struct completion *cmp;
+ int cbrnum, ctxnum;
+
+ STAT(intr);
+
+ gru = &gru_base[blade]->bs_grus[chiplet];
+ if (!gru) {
+ dev_err(grudev, "GRU: invalid interrupt: cpu %d, chiplet %d\n",
+ raw_smp_processor_id(), chiplet);
+ return IRQ_NONE;
+ }
+ get_clear_fault_map(gru, &imap, &dmap);
+ gru_dbg(grudev,
+ "cpu %d, chiplet %d, gid %d, imap %016lx %016lx, dmap %016lx %016lx\n",
+ smp_processor_id(), chiplet, gru->gs_gid,
+ imap.fault_bits[0], imap.fault_bits[1],
+ dmap.fault_bits[0], dmap.fault_bits[1]);
+
+ for_each_cbr_in_tfm(cbrnum, dmap.fault_bits) {
+ STAT(intr_cbr);
+ cmp = gru->gs_blade->bs_async_wq;
+ if (cmp)
+ complete(cmp);
+ gru_dbg(grudev, "gid %d, cbr_done %d, done %d\n",
+ gru->gs_gid, cbrnum, cmp ? cmp->done : -1);
+ }
+
+ for_each_cbr_in_tfm(cbrnum, imap.fault_bits) {
+ STAT(intr_tfh);
+ tfh = get_tfh_by_index(gru, cbrnum);
+ prefetchw(tfh); /* Helps on hdw, required for emulator */
+
+ /*
+ * When hardware sets a bit in the faultmap, it implicitly
+ * locks the GRU context so that it cannot be unloaded.
+ * The gts cannot change until a TFH start/writestart command
+ * is issued.
+ */
+ ctxnum = tfh->ctxnum;
+ gts = gru->gs_gts[ctxnum];
+
+ /* Spurious interrupts can cause this. Ignore. */
+ if (!gts) {
+ STAT(intr_spurious);
+ continue;
+ }
+
+ /*
+ * This is running in interrupt context. Trylock the mmap_sem.
+ * If it fails, retry the fault in user context.
+ */
+ gts->ustats.fmm_tlbmiss++;
+ if (!gts->ts_force_cch_reload &&
+ down_read_trylock(&gts->ts_mm->mmap_sem)) {
+ gru_try_dropin(gru, gts, tfh, NULL);
+ up_read(&gts->ts_mm->mmap_sem);
+ } else {
+ tfh_user_polling_mode(tfh);
+ STAT(intr_mm_lock_failed);
+ }
+ }
+ return IRQ_HANDLED;
+}
+
+irqreturn_t gru0_intr(int irq, void *dev_id)
+{
+ return gru_intr(0, uv_numa_blade_id());
+}
+
+irqreturn_t gru1_intr(int irq, void *dev_id)
+{
+ return gru_intr(1, uv_numa_blade_id());
+}
+
+irqreturn_t gru_intr_mblade(int irq, void *dev_id)
+{
+ int blade;
+
+ for_each_possible_blade(blade) {
+ if (uv_blade_nr_possible_cpus(blade))
+ continue;
+ gru_intr(0, blade);
+ gru_intr(1, blade);
+ }
+ return IRQ_HANDLED;
+}
+
+
+static int gru_user_dropin(struct gru_thread_state *gts,
+ struct gru_tlb_fault_handle *tfh,
+ void *cb)
+{
+ struct gru_mm_struct *gms = gts->ts_gms;
+ int ret;
+
+ gts->ustats.upm_tlbmiss++;
+ while (1) {
+ wait_event(gms->ms_wait_queue,
+ atomic_read(&gms->ms_range_active) == 0);
+ prefetchw(tfh); /* Helps on hdw, required for emulator */
+ ret = gru_try_dropin(gts->ts_gru, gts, tfh, cb);
+ if (ret <= 0)
+ return ret;
+ STAT(call_os_wait_queue);
+ }
+}
+
+/*
+ * This interface is called as a result of a user detecting a "call OS" bit
+ * in a user CB. Normally means that a TLB fault has occurred.
+ * cb - user virtual address of the CB
+ */
+int gru_handle_user_call_os(unsigned long cb)
+{
+ struct gru_tlb_fault_handle *tfh;
+ struct gru_thread_state *gts;
+ void *cbk;
+ int ucbnum, cbrnum, ret = -EINVAL;
+
+ STAT(call_os);
+
+ /* sanity check the cb pointer */
+ ucbnum = get_cb_number((void *)cb);
+ if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
+ return -EINVAL;
+
+ gts = gru_find_lock_gts(cb);
+ if (!gts)
+ return -EINVAL;
+ gru_dbg(grudev, "address 0x%lx, gid %d, gts 0x%p\n", cb, gts->ts_gru ? gts->ts_gru->gs_gid : -1, gts);
+
+ if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE)
+ goto exit;
+
+ gru_check_context_placement(gts);
+
+ /*
+ * CCH may contain stale data if ts_force_cch_reload is set.
+ */
+ if (gts->ts_gru && gts->ts_force_cch_reload) {
+ gts->ts_force_cch_reload = 0;
+ gru_update_cch(gts);
+ }
+
+ ret = -EAGAIN;
+ cbrnum = thread_cbr_number(gts, ucbnum);
+ if (gts->ts_gru) {
+ tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
+ cbk = get_gseg_base_address_cb(gts->ts_gru->gs_gru_base_vaddr,
+ gts->ts_ctxnum, ucbnum);
+ ret = gru_user_dropin(gts, tfh, cbk);
+ }
+exit:
+ gru_unlock_gts(gts);
+ return ret;
+}
+
+/*
+ * Fetch the exception detail information for a CB that terminated with
+ * an exception.
+ */
+int gru_get_exception_detail(unsigned long arg)
+{
+ struct control_block_extended_exc_detail excdet;
+ struct gru_control_block_extended *cbe;
+ struct gru_thread_state *gts;
+ int ucbnum, cbrnum, ret;
+
+ STAT(user_exception);
+ if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
+ return -EFAULT;
+
+ gts = gru_find_lock_gts(excdet.cb);
+ if (!gts)
+ return -EINVAL;
+
+ gru_dbg(grudev, "address 0x%lx, gid %d, gts 0x%p\n", excdet.cb, gts->ts_gru ? gts->ts_gru->gs_gid : -1, gts);
+ ucbnum = get_cb_number((void *)excdet.cb);
+ if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
+ ret = -EINVAL;
+ } else if (gts->ts_gru) {
+ cbrnum = thread_cbr_number(gts, ucbnum);
+ cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
+ gru_flush_cache(cbe); /* CBE not coherent */
+ sync_core(); /* make sure we are have current data */
+ excdet.opc = cbe->opccpy;
+ excdet.exopc = cbe->exopccpy;
+ excdet.ecause = cbe->ecause;
+ excdet.exceptdet0 = cbe->idef1upd;
+ excdet.exceptdet1 = cbe->idef3upd;
+ excdet.cbrstate = cbe->cbrstate;
+ excdet.cbrexecstatus = cbe->cbrexecstatus;
+ gru_flush_cache_cbe(cbe);
+ ret = 0;
+ } else {
+ ret = -EAGAIN;
+ }
+ gru_unlock_gts(gts);
+
+ gru_dbg(grudev,
+ "cb 0x%lx, op %d, exopc %d, cbrstate %d, cbrexecstatus 0x%x, ecause 0x%x, "
+ "exdet0 0x%lx, exdet1 0x%x\n",
+ excdet.cb, excdet.opc, excdet.exopc, excdet.cbrstate, excdet.cbrexecstatus,
+ excdet.ecause, excdet.exceptdet0, excdet.exceptdet1);
+ if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
+ ret = -EFAULT;
+ return ret;
+}
+
+/*
+ * User request to unload a context. Content is saved for possible reload.
+ */
+static int gru_unload_all_contexts(void)
+{
+ struct gru_thread_state *gts;
+ struct gru_state *gru;
+ int gid, ctxnum;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ foreach_gid(gid) {
+ gru = GID_TO_GRU(gid);
+ spin_lock(&gru->gs_lock);
+ for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++) {
+ gts = gru->gs_gts[ctxnum];
+ if (gts && mutex_trylock(&gts->ts_ctxlock)) {
+ spin_unlock(&gru->gs_lock);
+ gru_unload_context(gts, 1);
+ mutex_unlock(&gts->ts_ctxlock);
+ spin_lock(&gru->gs_lock);
+ }
+ }
+ spin_unlock(&gru->gs_lock);
+ }
+ return 0;
+}
+
+int gru_user_unload_context(unsigned long arg)
+{
+ struct gru_thread_state *gts;
+ struct gru_unload_context_req req;
+
+ STAT(user_unload_context);
+ if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+ return -EFAULT;
+
+ gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);
+
+ if (!req.gseg)
+ return gru_unload_all_contexts();
+
+ gts = gru_find_lock_gts(req.gseg);
+ if (!gts)
+ return -EINVAL;
+
+ if (gts->ts_gru)
+ gru_unload_context(gts, 1);
+ gru_unlock_gts(gts);
+
+ return 0;
+}
+
+/*
+ * User request to flush a range of virtual addresses from the GRU TLB
+ * (Mainly for testing).
+ */
+int gru_user_flush_tlb(unsigned long arg)
+{
+ struct gru_thread_state *gts;
+ struct gru_flush_tlb_req req;
+ struct gru_mm_struct *gms;
+
+ STAT(user_flush_tlb);
+ if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+ return -EFAULT;
+
+ gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
+ req.vaddr, req.len);
+
+ gts = gru_find_lock_gts(req.gseg);
+ if (!gts)
+ return -EINVAL;
+
+ gms = gts->ts_gms;
+ gru_unlock_gts(gts);
+ gru_flush_tlb_range(gms, req.vaddr, req.len);
+
+ return 0;
+}
+
+/*
+ * Fetch GSEG statisticss
+ */
+long gru_get_gseg_statistics(unsigned long arg)
+{
+ struct gru_thread_state *gts;
+ struct gru_get_gseg_statistics_req req;
+
+ if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+ return -EFAULT;
+
+ /*
+ * The library creates arrays of contexts for threaded programs.
+ * If no gts exists in the array, the context has never been used & all
+ * statistics are implicitly 0.
+ */
+ gts = gru_find_lock_gts(req.gseg);
+ if (gts) {
+ memcpy(&req.stats, &gts->ustats, sizeof(gts->ustats));
+ gru_unlock_gts(gts);
+ } else {
+ memset(&req.stats, 0, sizeof(gts->ustats));
+ }
+
+ if (copy_to_user((void __user *)arg, &req, sizeof(req)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Register the current task as the user of the GSEG slice.
+ * Needed for TLB fault interrupt targeting.
+ */
+int gru_set_context_option(unsigned long arg)
+{
+ struct gru_thread_state *gts;
+ struct gru_set_context_option_req req;
+ int ret = 0;
+
+ STAT(set_context_option);
+ if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+ return -EFAULT;
+ gru_dbg(grudev, "op %d, gseg 0x%lx, value1 0x%lx\n", req.op, req.gseg, req.val1);
+
+ gts = gru_find_lock_gts(req.gseg);
+ if (!gts) {
+ gts = gru_alloc_locked_gts(req.gseg);
+ if (IS_ERR(gts))
+ return PTR_ERR(gts);
+ }
+
+ switch (req.op) {
+ case sco_blade_chiplet:
+ /* Select blade/chiplet for GRU context */
+ if (req.val0 < -1 || req.val0 >= GRU_CHIPLETS_PER_HUB ||
+ req.val1 < -1 || req.val1 >= GRU_MAX_BLADES ||
+ (req.val1 >= 0 && !gru_base[req.val1])) {
+ ret = -EINVAL;
+ } else {
+ gts->ts_user_blade_id = req.val1;
+ gts->ts_user_chiplet_id = req.val0;
+ gru_check_context_placement(gts);
+ }
+ break;
+ case sco_gseg_owner:
+ /* Register the current task as the GSEG owner */
+ gts->ts_tgid_owner = current->tgid;
+ break;
+ case sco_cch_req_slice:
+ /* Set the CCH slice option */
+ gts->ts_cch_req_slice = req.val1 & 3;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ gru_unlock_gts(gts);
+
+ return ret;
+}
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
new file mode 100644
index 000000000..104a05f6b
--- /dev/null
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -0,0 +1,623 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * FILE OPERATIONS & DRIVER INITIALIZATION
+ *
+ * This file supports the user system call for file open, close, mmap, etc.
+ * This also incudes the driver initialization code.
+ *
+ * Copyright (c) 2008-2014 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_X86_64
+#include <asm/uv/uv_irq.h>
+#endif
+#include <asm/uv/uv.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_mmrs.h>
+
+struct gru_blade_state *gru_base[GRU_MAX_BLADES] __read_mostly;
+unsigned long gru_start_paddr __read_mostly;
+void *gru_start_vaddr __read_mostly;
+unsigned long gru_end_paddr __read_mostly;
+unsigned int gru_max_gids __read_mostly;
+struct gru_stats_s gru_stats;
+
+/* Guaranteed user available resources on each node */
+static int max_user_cbrs, max_user_dsr_bytes;
+
+static struct miscdevice gru_miscdev;
+
+static int gru_supported(void)
+{
+ return is_uv_system() &&
+ (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE);
+}
+
+/*
+ * gru_vma_close
+ *
+ * Called when unmapping a device mapping. Frees all gru resources
+ * and tables belonging to the vma.
+ */
+static void gru_vma_close(struct vm_area_struct *vma)
+{
+ struct gru_vma_data *vdata;
+ struct gru_thread_state *gts;
+ struct list_head *entry, *next;
+
+ if (!vma->vm_private_data)
+ return;
+
+ vdata = vma->vm_private_data;
+ vma->vm_private_data = NULL;
+ gru_dbg(grudev, "vma %p, file %p, vdata %p\n", vma, vma->vm_file,
+ vdata);
+ list_for_each_safe(entry, next, &vdata->vd_head) {
+ gts =
+ list_entry(entry, struct gru_thread_state, ts_next);
+ list_del(&gts->ts_next);
+ mutex_lock(&gts->ts_ctxlock);
+ if (gts->ts_gru)
+ gru_unload_context(gts, 0);
+ mutex_unlock(&gts->ts_ctxlock);
+ gts_drop(gts);
+ }
+ kfree(vdata);
+ STAT(vdata_free);
+}
+
+/*
+ * gru_file_mmap
+ *
+ * Called when mmapping the device. Initializes the vma with a fault handler
+ * and private data structure necessary to allocate, track, and free the
+ * underlying pages.
+ */
+static int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) != (VM_SHARED | VM_WRITE))
+ return -EPERM;
+
+ if (vma->vm_start & (GRU_GSEG_PAGESIZE - 1) ||
+ vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED |
+ VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vma->vm_page_prot = PAGE_SHARED;
+ vma->vm_ops = &gru_vm_ops;
+
+ vma->vm_private_data = gru_alloc_vma_data(vma, 0);
+ if (!vma->vm_private_data)
+ return -ENOMEM;
+
+ gru_dbg(grudev, "file %p, vaddr 0x%lx, vma %p, vdata %p\n",
+ file, vma->vm_start, vma, vma->vm_private_data);
+ return 0;
+}
+
+/*
+ * Create a new GRU context
+ */
+static int gru_create_new_context(unsigned long arg)
+{
+ struct gru_create_context_req req;
+ struct vm_area_struct *vma;
+ struct gru_vma_data *vdata;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+ return -EFAULT;
+
+ if (req.data_segment_bytes > max_user_dsr_bytes)
+ return -EINVAL;
+ if (req.control_blocks > max_user_cbrs || !req.maximum_thread_count)
+ return -EINVAL;
+
+ if (!(req.options & GRU_OPT_MISS_MASK))
+ req.options |= GRU_OPT_MISS_FMM_INTR;
+
+ down_write(&current->mm->mmap_sem);
+ vma = gru_find_vma(req.gseg);
+ if (vma) {
+ vdata = vma->vm_private_data;
+ vdata->vd_user_options = req.options;
+ vdata->vd_dsr_au_count =
+ GRU_DS_BYTES_TO_AU(req.data_segment_bytes);
+ vdata->vd_cbr_au_count = GRU_CB_COUNT_TO_AU(req.control_blocks);
+ vdata->vd_tlb_preload_count = req.tlb_preload_count;
+ ret = 0;
+ }
+ up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+/*
+ * Get GRU configuration info (temp - for emulator testing)
+ */
+static long gru_get_config_info(unsigned long arg)
+{
+ struct gru_config_info info;
+ int nodesperblade;
+
+ if (num_online_nodes() > 1 &&
+ (uv_node_to_blade_id(1) == uv_node_to_blade_id(0)))
+ nodesperblade = 2;
+ else
+ nodesperblade = 1;
+ memset(&info, 0, sizeof(info));
+ info.cpus = num_online_cpus();
+ info.nodes = num_online_nodes();
+ info.blades = info.nodes / nodesperblade;
+ info.chiplets = GRU_CHIPLETS_PER_BLADE * info.blades;
+
+ if (copy_to_user((void __user *)arg, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * gru_file_unlocked_ioctl
+ *
+ * Called to update file attributes via IOCTL calls.
+ */
+static long gru_file_unlocked_ioctl(struct file *file, unsigned int req,
+ unsigned long arg)
+{
+ int err = -EBADRQC;
+
+ gru_dbg(grudev, "file %p, req 0x%x, 0x%lx\n", file, req, arg);
+
+ switch (req) {
+ case GRU_CREATE_CONTEXT:
+ err = gru_create_new_context(arg);
+ break;
+ case GRU_SET_CONTEXT_OPTION:
+ err = gru_set_context_option(arg);
+ break;
+ case GRU_USER_GET_EXCEPTION_DETAIL:
+ err = gru_get_exception_detail(arg);
+ break;
+ case GRU_USER_UNLOAD_CONTEXT:
+ err = gru_user_unload_context(arg);
+ break;
+ case GRU_USER_FLUSH_TLB:
+ err = gru_user_flush_tlb(arg);
+ break;
+ case GRU_USER_CALL_OS:
+ err = gru_handle_user_call_os(arg);
+ break;
+ case GRU_GET_GSEG_STATISTICS:
+ err = gru_get_gseg_statistics(arg);
+ break;
+ case GRU_KTEST:
+ err = gru_ktest(arg);
+ break;
+ case GRU_GET_CONFIG_INFO:
+ err = gru_get_config_info(arg);
+ break;
+ case GRU_DUMP_CHIPLET_STATE:
+ err = gru_dump_chiplet_request(arg);
+ break;
+ }
+ return err;
+}
+
+/*
+ * Called at init time to build tables for all GRUs that are present in the
+ * system.
+ */
+static void gru_init_chiplet(struct gru_state *gru, unsigned long paddr,
+ void *vaddr, int blade_id, int chiplet_id)
+{
+ spin_lock_init(&gru->gs_lock);
+ spin_lock_init(&gru->gs_asid_lock);
+ gru->gs_gru_base_paddr = paddr;
+ gru->gs_gru_base_vaddr = vaddr;
+ gru->gs_gid = blade_id * GRU_CHIPLETS_PER_BLADE + chiplet_id;
+ gru->gs_blade = gru_base[blade_id];
+ gru->gs_blade_id = blade_id;
+ gru->gs_chiplet_id = chiplet_id;
+ gru->gs_cbr_map = (GRU_CBR_AU == 64) ? ~0 : (1UL << GRU_CBR_AU) - 1;
+ gru->gs_dsr_map = (1UL << GRU_DSR_AU) - 1;
+ gru->gs_asid_limit = MAX_ASID;
+ gru_tgh_flush_init(gru);
+ if (gru->gs_gid >= gru_max_gids)
+ gru_max_gids = gru->gs_gid + 1;
+ gru_dbg(grudev, "bid %d, gid %d, vaddr %p (0x%lx)\n",
+ blade_id, gru->gs_gid, gru->gs_gru_base_vaddr,
+ gru->gs_gru_base_paddr);
+}
+
+static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
+{
+ int pnode, nid, bid, chip;
+ int cbrs, dsrbytes, n;
+ int order = get_order(sizeof(struct gru_blade_state));
+ struct page *page;
+ struct gru_state *gru;
+ unsigned long paddr;
+ void *vaddr;
+
+ max_user_cbrs = GRU_NUM_CB;
+ max_user_dsr_bytes = GRU_NUM_DSR_BYTES;
+ for_each_possible_blade(bid) {
+ pnode = uv_blade_to_pnode(bid);
+ nid = uv_blade_to_memory_nid(bid);/* -1 if no memory on blade */
+ page = alloc_pages_node(nid, GFP_KERNEL, order);
+ if (!page)
+ goto fail;
+ gru_base[bid] = page_address(page);
+ memset(gru_base[bid], 0, sizeof(struct gru_blade_state));
+ gru_base[bid]->bs_lru_gru = &gru_base[bid]->bs_grus[0];
+ spin_lock_init(&gru_base[bid]->bs_lock);
+ init_rwsem(&gru_base[bid]->bs_kgts_sema);
+
+ dsrbytes = 0;
+ cbrs = 0;
+ for (gru = gru_base[bid]->bs_grus, chip = 0;
+ chip < GRU_CHIPLETS_PER_BLADE;
+ chip++, gru++) {
+ paddr = gru_chiplet_paddr(gru_base_paddr, pnode, chip);
+ vaddr = gru_chiplet_vaddr(gru_base_vaddr, pnode, chip);
+ gru_init_chiplet(gru, paddr, vaddr, bid, chip);
+ n = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
+ cbrs = max(cbrs, n);
+ n = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
+ dsrbytes = max(dsrbytes, n);
+ }
+ max_user_cbrs = min(max_user_cbrs, cbrs);
+ max_user_dsr_bytes = min(max_user_dsr_bytes, dsrbytes);
+ }
+
+ return 0;
+
+fail:
+ for (bid--; bid >= 0; bid--)
+ free_pages((unsigned long)gru_base[bid], order);
+ return -ENOMEM;
+}
+
+static void gru_free_tables(void)
+{
+ int bid;
+ int order = get_order(sizeof(struct gru_state) *
+ GRU_CHIPLETS_PER_BLADE);
+
+ for (bid = 0; bid < GRU_MAX_BLADES; bid++)
+ free_pages((unsigned long)gru_base[bid], order);
+}
+
+static unsigned long gru_chiplet_cpu_to_mmr(int chiplet, int cpu, int *corep)
+{
+ unsigned long mmr = 0;
+ int core;
+
+ /*
+ * We target the cores of a blade and not the hyperthreads themselves.
+ * There is a max of 8 cores per socket and 2 sockets per blade,
+ * making for a max total of 16 cores (i.e., 16 CPUs without
+ * hyperthreading and 32 CPUs with hyperthreading).
+ */
+ core = uv_cpu_core_number(cpu) + UV_MAX_INT_CORES * uv_cpu_socket_number(cpu);
+ if (core >= GRU_NUM_TFM || uv_cpu_ht_number(cpu))
+ return 0;
+
+ if (chiplet == 0) {
+ mmr = UVH_GR0_TLB_INT0_CONFIG +
+ core * (UVH_GR0_TLB_INT1_CONFIG - UVH_GR0_TLB_INT0_CONFIG);
+ } else if (chiplet == 1) {
+ mmr = UVH_GR1_TLB_INT0_CONFIG +
+ core * (UVH_GR1_TLB_INT1_CONFIG - UVH_GR1_TLB_INT0_CONFIG);
+ } else {
+ BUG();
+ }
+
+ *corep = core;
+ return mmr;
+}
+
+#ifdef CONFIG_IA64
+
+static int gru_irq_count[GRU_CHIPLETS_PER_BLADE];
+
+static void gru_noop(struct irq_data *d)
+{
+}
+
+static struct irq_chip gru_chip[GRU_CHIPLETS_PER_BLADE] = {
+ [0 ... GRU_CHIPLETS_PER_BLADE - 1] {
+ .irq_mask = gru_noop,
+ .irq_unmask = gru_noop,
+ .irq_ack = gru_noop
+ }
+};
+
+static int gru_chiplet_setup_tlb_irq(int chiplet, char *irq_name,
+ irq_handler_t irq_handler, int cpu, int blade)
+{
+ unsigned long mmr;
+ int irq = IRQ_GRU + chiplet;
+ int ret, core;
+
+ mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
+ if (mmr == 0)
+ return 0;
+
+ if (gru_irq_count[chiplet] == 0) {
+ gru_chip[chiplet].name = irq_name;
+ ret = irq_set_chip(irq, &gru_chip[chiplet]);
+ if (ret) {
+ printk(KERN_ERR "%s: set_irq_chip failed, errno=%d\n",
+ GRU_DRIVER_ID_STR, -ret);
+ return ret;
+ }
+
+ ret = request_irq(irq, irq_handler, 0, irq_name, NULL);
+ if (ret) {
+ printk(KERN_ERR "%s: request_irq failed, errno=%d\n",
+ GRU_DRIVER_ID_STR, -ret);
+ return ret;
+ }
+ }
+ gru_irq_count[chiplet]++;
+
+ return 0;
+}
+
+static void gru_chiplet_teardown_tlb_irq(int chiplet, int cpu, int blade)
+{
+ unsigned long mmr;
+ int core, irq = IRQ_GRU + chiplet;
+
+ if (gru_irq_count[chiplet] == 0)
+ return;
+
+ mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
+ if (mmr == 0)
+ return;
+
+ if (--gru_irq_count[chiplet] == 0)
+ free_irq(irq, NULL);
+}
+
+#elif defined CONFIG_X86_64
+
+static int gru_chiplet_setup_tlb_irq(int chiplet, char *irq_name,
+ irq_handler_t irq_handler, int cpu, int blade)
+{
+ unsigned long mmr;
+ int irq, core;
+ int ret;
+
+ mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
+ if (mmr == 0)
+ return 0;
+
+ irq = uv_setup_irq(irq_name, cpu, blade, mmr, UV_AFFINITY_CPU);
+ if (irq < 0) {
+ printk(KERN_ERR "%s: uv_setup_irq failed, errno=%d\n",
+ GRU_DRIVER_ID_STR, -irq);
+ return irq;
+ }
+
+ ret = request_irq(irq, irq_handler, 0, irq_name, NULL);
+ if (ret) {
+ uv_teardown_irq(irq);
+ printk(KERN_ERR "%s: request_irq failed, errno=%d\n",
+ GRU_DRIVER_ID_STR, -ret);
+ return ret;
+ }
+ gru_base[blade]->bs_grus[chiplet].gs_irq[core] = irq;
+ return 0;
+}
+
+static void gru_chiplet_teardown_tlb_irq(int chiplet, int cpu, int blade)
+{
+ int irq, core;
+ unsigned long mmr;
+
+ mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
+ if (mmr) {
+ irq = gru_base[blade]->bs_grus[chiplet].gs_irq[core];
+ if (irq) {
+ free_irq(irq, NULL);
+ uv_teardown_irq(irq);
+ }
+ }
+}
+
+#endif
+
+static void gru_teardown_tlb_irqs(void)
+{
+ int blade;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ blade = uv_cpu_to_blade_id(cpu);
+ gru_chiplet_teardown_tlb_irq(0, cpu, blade);
+ gru_chiplet_teardown_tlb_irq(1, cpu, blade);
+ }
+ for_each_possible_blade(blade) {
+ if (uv_blade_nr_possible_cpus(blade))
+ continue;
+ gru_chiplet_teardown_tlb_irq(0, 0, blade);
+ gru_chiplet_teardown_tlb_irq(1, 0, blade);
+ }
+}
+
+static int gru_setup_tlb_irqs(void)
+{
+ int blade;
+ int cpu;
+ int ret;
+
+ for_each_online_cpu(cpu) {
+ blade = uv_cpu_to_blade_id(cpu);
+ ret = gru_chiplet_setup_tlb_irq(0, "GRU0_TLB", gru0_intr, cpu, blade);
+ if (ret != 0)
+ goto exit1;
+
+ ret = gru_chiplet_setup_tlb_irq(1, "GRU1_TLB", gru1_intr, cpu, blade);
+ if (ret != 0)
+ goto exit1;
+ }
+ for_each_possible_blade(blade) {
+ if (uv_blade_nr_possible_cpus(blade))
+ continue;
+ ret = gru_chiplet_setup_tlb_irq(0, "GRU0_TLB", gru_intr_mblade, 0, blade);
+ if (ret != 0)
+ goto exit1;
+
+ ret = gru_chiplet_setup_tlb_irq(1, "GRU1_TLB", gru_intr_mblade, 0, blade);
+ if (ret != 0)
+ goto exit1;
+ }
+
+ return 0;
+
+exit1:
+ gru_teardown_tlb_irqs();
+ return ret;
+}
+
+/*
+ * gru_init
+ *
+ * Called at boot or module load time to initialize the GRUs.
+ */
+static int __init gru_init(void)
+{
+ int ret;
+
+ if (!gru_supported())
+ return 0;
+
+#if defined CONFIG_IA64
+ gru_start_paddr = 0xd000000000UL; /* ZZZZZZZZZZZZZZZZZZZ fixme */
+#else
+ gru_start_paddr = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR) &
+ 0x7fffffffffffUL;
+#endif
+ gru_start_vaddr = __va(gru_start_paddr);
+ gru_end_paddr = gru_start_paddr + GRU_MAX_BLADES * GRU_SIZE;
+ printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
+ gru_start_paddr, gru_end_paddr);
+ ret = misc_register(&gru_miscdev);
+ if (ret) {
+ printk(KERN_ERR "%s: misc_register failed\n",
+ GRU_DRIVER_ID_STR);
+ goto exit0;
+ }
+
+ ret = gru_proc_init();
+ if (ret) {
+ printk(KERN_ERR "%s: proc init failed\n", GRU_DRIVER_ID_STR);
+ goto exit1;
+ }
+
+ ret = gru_init_tables(gru_start_paddr, gru_start_vaddr);
+ if (ret) {
+ printk(KERN_ERR "%s: init tables failed\n", GRU_DRIVER_ID_STR);
+ goto exit2;
+ }
+
+ ret = gru_setup_tlb_irqs();
+ if (ret != 0)
+ goto exit3;
+
+ gru_kservices_init();
+
+ printk(KERN_INFO "%s: v%s\n", GRU_DRIVER_ID_STR,
+ GRU_DRIVER_VERSION_STR);
+ return 0;
+
+exit3:
+ gru_free_tables();
+exit2:
+ gru_proc_exit();
+exit1:
+ misc_deregister(&gru_miscdev);
+exit0:
+ return ret;
+
+}
+
+static void __exit gru_exit(void)
+{
+ if (!gru_supported())
+ return;
+
+ gru_teardown_tlb_irqs();
+ gru_kservices_exit();
+ gru_free_tables();
+ misc_deregister(&gru_miscdev);
+ gru_proc_exit();
+}
+
+static const struct file_operations gru_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = gru_file_unlocked_ioctl,
+ .mmap = gru_file_mmap,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice gru_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "gru",
+ .fops = &gru_fops,
+};
+
+const struct vm_operations_struct gru_vm_ops = {
+ .close = gru_vma_close,
+ .fault = gru_fault,
+};
+
+#ifndef MODULE
+fs_initcall(gru_init);
+#else
+module_init(gru_init);
+#endif
+module_exit(gru_exit);
+
+module_param(gru_options, ulong, 0644);
+MODULE_PARM_DESC(gru_options, "Various debug options");
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(GRU_DRIVER_ID_STR GRU_DRIVER_VERSION_STR);
+MODULE_VERSION(GRU_DRIVER_VERSION_STR);
+
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
new file mode 100644
index 000000000..1ee8e82ba
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -0,0 +1,210 @@
+/*
+ * GRU KERNEL MCS INSTRUCTIONS
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+
+/* 10 sec */
+#ifdef CONFIG_IA64
+#include <asm/processor.h>
+#define GRU_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10)
+#define CLKS2NSEC(c) ((c) *1000000000 / local_cpu_data->itc_freq)
+#else
+#include <asm/tsc.h>
+#define GRU_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
+#define CLKS2NSEC(c) ((c) * 1000000 / tsc_khz)
+#endif
+
+/* Extract the status field from a kernel handle */
+#define GET_MSEG_HANDLE_STATUS(h) (((*(unsigned long *)(h)) >> 16) & 3)
+
+struct mcs_op_statistic mcs_op_statistics[mcsop_last];
+
+static void update_mcs_stats(enum mcs_op op, unsigned long clks)
+{
+ unsigned long nsec;
+
+ nsec = CLKS2NSEC(clks);
+ atomic_long_inc(&mcs_op_statistics[op].count);
+ atomic_long_add(nsec, &mcs_op_statistics[op].total);
+ if (mcs_op_statistics[op].max < nsec)
+ mcs_op_statistics[op].max = nsec;
+}
+
+static void start_instruction(void *h)
+{
+ unsigned long *w0 = h;
+
+ wmb(); /* setting CMD/STATUS bits must be last */
+ *w0 = *w0 | 0x20001;
+ gru_flush_cache(h);
+}
+
+static void report_instruction_timeout(void *h)
+{
+ unsigned long goff = GSEGPOFF((unsigned long)h);
+ char *id = "???";
+
+ if (TYPE_IS(CCH, goff))
+ id = "CCH";
+ else if (TYPE_IS(TGH, goff))
+ id = "TGH";
+ else if (TYPE_IS(TFH, goff))
+ id = "TFH";
+
+ panic(KERN_ALERT "GRU %p (%s) is malfunctioning\n", h, id);
+}
+
+static int wait_instruction_complete(void *h, enum mcs_op opc)
+{
+ int status;
+ unsigned long start_time = get_cycles();
+
+ while (1) {
+ cpu_relax();
+ status = GET_MSEG_HANDLE_STATUS(h);
+ if (status != CCHSTATUS_ACTIVE)
+ break;
+ if (GRU_OPERATION_TIMEOUT < (get_cycles() - start_time)) {
+ report_instruction_timeout(h);
+ start_time = get_cycles();
+ }
+ }
+ if (gru_options & OPT_STATS)
+ update_mcs_stats(opc, get_cycles() - start_time);
+ return status;
+}
+
+int cch_allocate(struct gru_context_configuration_handle *cch)
+{
+ int ret;
+
+ cch->opc = CCHOP_ALLOCATE;
+ start_instruction(cch);
+ ret = wait_instruction_complete(cch, cchop_allocate);
+
+ /*
+ * Stop speculation into the GSEG being mapped by the previous ALLOCATE.
+ * The GSEG memory does not exist until the ALLOCATE completes.
+ */
+ sync_core();
+ return ret;
+}
+
+int cch_start(struct gru_context_configuration_handle *cch)
+{
+ cch->opc = CCHOP_START;
+ start_instruction(cch);
+ return wait_instruction_complete(cch, cchop_start);
+}
+
+int cch_interrupt(struct gru_context_configuration_handle *cch)
+{
+ cch->opc = CCHOP_INTERRUPT;
+ start_instruction(cch);
+ return wait_instruction_complete(cch, cchop_interrupt);
+}
+
+int cch_deallocate(struct gru_context_configuration_handle *cch)
+{
+ int ret;
+
+ cch->opc = CCHOP_DEALLOCATE;
+ start_instruction(cch);
+ ret = wait_instruction_complete(cch, cchop_deallocate);
+
+ /*
+ * Stop speculation into the GSEG being unmapped by the previous
+ * DEALLOCATE.
+ */
+ sync_core();
+ return ret;
+}
+
+int cch_interrupt_sync(struct gru_context_configuration_handle
+ *cch)
+{
+ cch->opc = CCHOP_INTERRUPT_SYNC;
+ start_instruction(cch);
+ return wait_instruction_complete(cch, cchop_interrupt_sync);
+}
+
+int tgh_invalidate(struct gru_tlb_global_handle *tgh,
+ unsigned long vaddr, unsigned long vaddrmask,
+ int asid, int pagesize, int global, int n,
+ unsigned short ctxbitmap)
+{
+ tgh->vaddr = vaddr;
+ tgh->asid = asid;
+ tgh->pagesize = pagesize;
+ tgh->n = n;
+ tgh->global = global;
+ tgh->vaddrmask = vaddrmask;
+ tgh->ctxbitmap = ctxbitmap;
+ tgh->opc = TGHOP_TLBINV;
+ start_instruction(tgh);
+ return wait_instruction_complete(tgh, tghop_invalidate);
+}
+
+int tfh_write_only(struct gru_tlb_fault_handle *tfh,
+ unsigned long paddr, int gaa,
+ unsigned long vaddr, int asid, int dirty,
+ int pagesize)
+{
+ tfh->fillasid = asid;
+ tfh->fillvaddr = vaddr;
+ tfh->pfn = paddr >> GRU_PADDR_SHIFT;
+ tfh->gaa = gaa;
+ tfh->dirty = dirty;
+ tfh->pagesize = pagesize;
+ tfh->opc = TFHOP_WRITE_ONLY;
+ start_instruction(tfh);
+ return wait_instruction_complete(tfh, tfhop_write_only);
+}
+
+void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
+ unsigned long paddr, int gaa,
+ unsigned long vaddr, int asid, int dirty,
+ int pagesize)
+{
+ tfh->fillasid = asid;
+ tfh->fillvaddr = vaddr;
+ tfh->pfn = paddr >> GRU_PADDR_SHIFT;
+ tfh->gaa = gaa;
+ tfh->dirty = dirty;
+ tfh->pagesize = pagesize;
+ tfh->opc = TFHOP_WRITE_RESTART;
+ start_instruction(tfh);
+}
+
+void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
+{
+ tfh->opc = TFHOP_USER_POLLING_MODE;
+ start_instruction(tfh);
+}
+
+void tfh_exception(struct gru_tlb_fault_handle *tfh)
+{
+ tfh->opc = TFHOP_EXCEPTION;
+ start_instruction(tfh);
+}
+
diff --git a/drivers/misc/sgi-gru/gruhandles.h b/drivers/misc/sgi-gru/gruhandles.h
new file mode 100644
index 000000000..3d7bd36a1
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruhandles.h
@@ -0,0 +1,530 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * GRU HANDLE DEFINITION
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __GRUHANDLES_H__
+#define __GRUHANDLES_H__
+#include "gru_instructions.h"
+
+/*
+ * Manifest constants for GRU Memory Map
+ */
+#define GRU_GSEG0_BASE 0
+#define GRU_MCS_BASE (64 * 1024 * 1024)
+#define GRU_SIZE (128UL * 1024 * 1024)
+
+/* Handle & resource counts */
+#define GRU_NUM_CB 128
+#define GRU_NUM_DSR_BYTES (32 * 1024)
+#define GRU_NUM_TFM 16
+#define GRU_NUM_TGH 24
+#define GRU_NUM_CBE 128
+#define GRU_NUM_TFH 128
+#define GRU_NUM_CCH 16
+
+/* Maximum resource counts that can be reserved by user programs */
+#define GRU_NUM_USER_CBR GRU_NUM_CBE
+#define GRU_NUM_USER_DSR_BYTES GRU_NUM_DSR_BYTES
+
+/* Bytes per handle & handle stride. Code assumes all cb, tfh, cbe handles
+ * are the same */
+#define GRU_HANDLE_BYTES 64
+#define GRU_HANDLE_STRIDE 256
+
+/* Base addresses of handles */
+#define GRU_TFM_BASE (GRU_MCS_BASE + 0x00000)
+#define GRU_TGH_BASE (GRU_MCS_BASE + 0x08000)
+#define GRU_CBE_BASE (GRU_MCS_BASE + 0x10000)
+#define GRU_TFH_BASE (GRU_MCS_BASE + 0x18000)
+#define GRU_CCH_BASE (GRU_MCS_BASE + 0x20000)
+
+/* User gseg constants */
+#define GRU_GSEG_STRIDE (4 * 1024 * 1024)
+#define GSEG_BASE(a) ((a) & ~(GRU_GSEG_PAGESIZE - 1))
+
+/* Data segment constants */
+#define GRU_DSR_AU_BYTES 1024
+#define GRU_DSR_CL (GRU_NUM_DSR_BYTES / GRU_CACHE_LINE_BYTES)
+#define GRU_DSR_AU_CL (GRU_DSR_AU_BYTES / GRU_CACHE_LINE_BYTES)
+#define GRU_DSR_AU (GRU_NUM_DSR_BYTES / GRU_DSR_AU_BYTES)
+
+/* Control block constants */
+#define GRU_CBR_AU_SIZE 2
+#define GRU_CBR_AU (GRU_NUM_CBE / GRU_CBR_AU_SIZE)
+
+/* Convert resource counts to the number of AU */
+#define GRU_DS_BYTES_TO_AU(n) DIV_ROUND_UP(n, GRU_DSR_AU_BYTES)
+#define GRU_CB_COUNT_TO_AU(n) DIV_ROUND_UP(n, GRU_CBR_AU_SIZE)
+
+/* UV limits */
+#define GRU_CHIPLETS_PER_HUB 2
+#define GRU_HUBS_PER_BLADE 1
+#define GRU_CHIPLETS_PER_BLADE (GRU_HUBS_PER_BLADE * GRU_CHIPLETS_PER_HUB)
+
+/* User GRU Gseg offsets */
+#define GRU_CB_BASE 0
+#define GRU_CB_LIMIT (GRU_CB_BASE + GRU_HANDLE_STRIDE * GRU_NUM_CBE)
+#define GRU_DS_BASE 0x20000
+#define GRU_DS_LIMIT (GRU_DS_BASE + GRU_NUM_DSR_BYTES)
+
+/* Convert a GRU physical address to the chiplet offset */
+#define GSEGPOFF(h) ((h) & (GRU_SIZE - 1))
+
+/* Convert an arbitrary handle address to the beginning of the GRU segment */
+#define GRUBASE(h) ((void *)((unsigned long)(h) & ~(GRU_SIZE - 1)))
+
+/* Test a valid handle address to determine the type */
+#define TYPE_IS(hn, h) ((h) >= GRU_##hn##_BASE && (h) < \
+ GRU_##hn##_BASE + GRU_NUM_##hn * GRU_HANDLE_STRIDE && \
+ (((h) & (GRU_HANDLE_STRIDE - 1)) == 0))
+
+
+/* General addressing macros. */
+static inline void *get_gseg_base_address(void *base, int ctxnum)
+{
+ return (void *)(base + GRU_GSEG0_BASE + GRU_GSEG_STRIDE * ctxnum);
+}
+
+static inline void *get_gseg_base_address_cb(void *base, int ctxnum, int line)
+{
+ return (void *)(get_gseg_base_address(base, ctxnum) +
+ GRU_CB_BASE + GRU_HANDLE_STRIDE * line);
+}
+
+static inline void *get_gseg_base_address_ds(void *base, int ctxnum, int line)
+{
+ return (void *)(get_gseg_base_address(base, ctxnum) + GRU_DS_BASE +
+ GRU_CACHE_LINE_BYTES * line);
+}
+
+static inline struct gru_tlb_fault_map *get_tfm(void *base, int ctxnum)
+{
+ return (struct gru_tlb_fault_map *)(base + GRU_TFM_BASE +
+ ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_tlb_global_handle *get_tgh(void *base, int ctxnum)
+{
+ return (struct gru_tlb_global_handle *)(base + GRU_TGH_BASE +
+ ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_control_block_extended *get_cbe(void *base, int ctxnum)
+{
+ return (struct gru_control_block_extended *)(base + GRU_CBE_BASE +
+ ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_tlb_fault_handle *get_tfh(void *base, int ctxnum)
+{
+ return (struct gru_tlb_fault_handle *)(base + GRU_TFH_BASE +
+ ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_context_configuration_handle *get_cch(void *base,
+ int ctxnum)
+{
+ return (struct gru_context_configuration_handle *)(base +
+ GRU_CCH_BASE + ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline unsigned long get_cb_number(void *cb)
+{
+ return (((unsigned long)cb - GRU_CB_BASE) % GRU_GSEG_PAGESIZE) /
+ GRU_HANDLE_STRIDE;
+}
+
+/* byte offset to a specific GRU chiplet. (p=pnode, c=chiplet (0 or 1)*/
+static inline unsigned long gru_chiplet_paddr(unsigned long paddr, int pnode,
+ int chiplet)
+{
+ return paddr + GRU_SIZE * (2 * pnode + chiplet);
+}
+
+static inline void *gru_chiplet_vaddr(void *vaddr, int pnode, int chiplet)
+{
+ return vaddr + GRU_SIZE * (2 * pnode + chiplet);
+}
+
+static inline struct gru_control_block_extended *gru_tfh_to_cbe(
+ struct gru_tlb_fault_handle *tfh)
+{
+ unsigned long cbe;
+
+ cbe = (unsigned long)tfh - GRU_TFH_BASE + GRU_CBE_BASE;
+ return (struct gru_control_block_extended*)cbe;
+}
+
+
+
+
+/*
+ * Global TLB Fault Map
+ * Bitmap of outstanding TLB misses needing interrupt/polling service.
+ *
+ */
+struct gru_tlb_fault_map {
+ unsigned long fault_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
+ unsigned long fill0[2];
+ unsigned long done_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
+ unsigned long fill1[2];
+};
+
+/*
+ * TGH - TLB Global Handle
+ * Used for TLB flushing.
+ *
+ */
+struct gru_tlb_global_handle {
+ unsigned int cmd:1; /* DW 0 */
+ unsigned int delresp:1;
+ unsigned int opc:1;
+ unsigned int fill1:5;
+
+ unsigned int fill2:8;
+
+ unsigned int status:2;
+ unsigned long fill3:2;
+ unsigned int state:3;
+ unsigned long fill4:1;
+
+ unsigned int cause:3;
+ unsigned long fill5:37;
+
+ unsigned long vaddr:64; /* DW 1 */
+
+ unsigned int asid:24; /* DW 2 */
+ unsigned int fill6:8;
+
+ unsigned int pagesize:5;
+ unsigned int fill7:11;
+
+ unsigned int global:1;
+ unsigned int fill8:15;
+
+ unsigned long vaddrmask:39; /* DW 3 */
+ unsigned int fill9:9;
+ unsigned int n:10;
+ unsigned int fill10:6;
+
+ unsigned int ctxbitmap:16; /* DW4 */
+ unsigned long fill11[3];
+};
+
+enum gru_tgh_cmd {
+ TGHCMD_START
+};
+
+enum gru_tgh_opc {
+ TGHOP_TLBNOP,
+ TGHOP_TLBINV
+};
+
+enum gru_tgh_status {
+ TGHSTATUS_IDLE,
+ TGHSTATUS_EXCEPTION,
+ TGHSTATUS_ACTIVE
+};
+
+enum gru_tgh_state {
+ TGHSTATE_IDLE,
+ TGHSTATE_PE_INVAL,
+ TGHSTATE_INTERRUPT_INVAL,
+ TGHSTATE_WAITDONE,
+ TGHSTATE_RESTART_CTX,
+};
+
+enum gru_tgh_cause {
+ TGHCAUSE_RR_ECC,
+ TGHCAUSE_TLB_ECC,
+ TGHCAUSE_LRU_ECC,
+ TGHCAUSE_PS_ECC,
+ TGHCAUSE_MUL_ERR,
+ TGHCAUSE_DATA_ERR,
+ TGHCAUSE_SW_FORCE
+};
+
+
+/*
+ * TFH - TLB Global Handle
+ * Used for TLB dropins into the GRU TLB.
+ *
+ */
+struct gru_tlb_fault_handle {
+ unsigned int cmd:1; /* DW 0 - low 32*/
+ unsigned int delresp:1;
+ unsigned int fill0:2;
+ unsigned int opc:3;
+ unsigned int fill1:9;
+
+ unsigned int status:2;
+ unsigned int fill2:2;
+ unsigned int state:3;
+ unsigned int fill3:1;
+
+ unsigned int cause:6;
+ unsigned int cb_int:1;
+ unsigned int fill4:1;
+
+ unsigned int indexway:12; /* DW 0 - high 32 */
+ unsigned int fill5:4;
+
+ unsigned int ctxnum:4;
+ unsigned int fill6:12;
+
+ unsigned long missvaddr:64; /* DW 1 */
+
+ unsigned int missasid:24; /* DW 2 */
+ unsigned int fill7:8;
+ unsigned int fillasid:24;
+ unsigned int dirty:1;
+ unsigned int gaa:2;
+ unsigned long fill8:5;
+
+ unsigned long pfn:41; /* DW 3 */
+ unsigned int fill9:7;
+ unsigned int pagesize:5;
+ unsigned int fill10:11;
+
+ unsigned long fillvaddr:64; /* DW 4 */
+
+ unsigned long fill11[3];
+};
+
+enum gru_tfh_opc {
+ TFHOP_NOOP,
+ TFHOP_RESTART,
+ TFHOP_WRITE_ONLY,
+ TFHOP_WRITE_RESTART,
+ TFHOP_EXCEPTION,
+ TFHOP_USER_POLLING_MODE = 7,
+};
+
+enum tfh_status {
+ TFHSTATUS_IDLE,
+ TFHSTATUS_EXCEPTION,
+ TFHSTATUS_ACTIVE,
+};
+
+enum tfh_state {
+ TFHSTATE_INACTIVE,
+ TFHSTATE_IDLE,
+ TFHSTATE_MISS_UPM,
+ TFHSTATE_MISS_FMM,
+ TFHSTATE_HW_ERR,
+ TFHSTATE_WRITE_TLB,
+ TFHSTATE_RESTART_CBR,
+};
+
+/* TFH cause bits */
+enum tfh_cause {
+ TFHCAUSE_NONE,
+ TFHCAUSE_TLB_MISS,
+ TFHCAUSE_TLB_MOD,
+ TFHCAUSE_HW_ERROR_RR,
+ TFHCAUSE_HW_ERROR_MAIN_ARRAY,
+ TFHCAUSE_HW_ERROR_VALID,
+ TFHCAUSE_HW_ERROR_PAGESIZE,
+ TFHCAUSE_INSTRUCTION_EXCEPTION,
+ TFHCAUSE_UNCORRECTIBLE_ERROR,
+};
+
+/* GAA values */
+#define GAA_RAM 0x0
+#define GAA_NCRAM 0x2
+#define GAA_MMIO 0x1
+#define GAA_REGISTER 0x3
+
+/* GRU paddr shift for pfn. (NOTE: shift is NOT by actual pagesize) */
+#define GRU_PADDR_SHIFT 12
+
+/*
+ * Context Configuration handle
+ * Used to allocate resources to a GSEG context.
+ *
+ */
+struct gru_context_configuration_handle {
+ unsigned int cmd:1; /* DW0 */
+ unsigned int delresp:1;
+ unsigned int opc:3;
+ unsigned int unmap_enable:1;
+ unsigned int req_slice_set_enable:1;
+ unsigned int req_slice:2;
+ unsigned int cb_int_enable:1;
+ unsigned int tlb_int_enable:1;
+ unsigned int tfm_fault_bit_enable:1;
+ unsigned int tlb_int_select:4;
+
+ unsigned int status:2;
+ unsigned int state:2;
+ unsigned int reserved2:4;
+
+ unsigned int cause:4;
+ unsigned int tfm_done_bit_enable:1;
+ unsigned int unused:3;
+
+ unsigned int dsr_allocation_map;
+
+ unsigned long cbr_allocation_map; /* DW1 */
+
+ unsigned int asid[8]; /* DW 2 - 5 */
+ unsigned short sizeavail[8]; /* DW 6 - 7 */
+} __attribute__ ((packed));
+
+enum gru_cch_opc {
+ CCHOP_START = 1,
+ CCHOP_ALLOCATE,
+ CCHOP_INTERRUPT,
+ CCHOP_DEALLOCATE,
+ CCHOP_INTERRUPT_SYNC,
+};
+
+enum gru_cch_status {
+ CCHSTATUS_IDLE,
+ CCHSTATUS_EXCEPTION,
+ CCHSTATUS_ACTIVE,
+};
+
+enum gru_cch_state {
+ CCHSTATE_INACTIVE,
+ CCHSTATE_MAPPED,
+ CCHSTATE_ACTIVE,
+ CCHSTATE_INTERRUPTED,
+};
+
+/* CCH Exception cause */
+enum gru_cch_cause {
+ CCHCAUSE_REGION_REGISTER_WRITE_ERROR = 1,
+ CCHCAUSE_ILLEGAL_OPCODE = 2,
+ CCHCAUSE_INVALID_START_REQUEST = 3,
+ CCHCAUSE_INVALID_ALLOCATION_REQUEST = 4,
+ CCHCAUSE_INVALID_DEALLOCATION_REQUEST = 5,
+ CCHCAUSE_INVALID_INTERRUPT_REQUEST = 6,
+ CCHCAUSE_CCH_BUSY = 7,
+ CCHCAUSE_NO_CBRS_TO_ALLOCATE = 8,
+ CCHCAUSE_BAD_TFM_CONFIG = 9,
+ CCHCAUSE_CBR_RESOURCES_OVERSUBSCRIPED = 10,
+ CCHCAUSE_DSR_RESOURCES_OVERSUBSCRIPED = 11,
+ CCHCAUSE_CBR_DEALLOCATION_ERROR = 12,
+};
+/*
+ * CBE - Control Block Extended
+ * Maintains internal GRU state for active CBs.
+ *
+ */
+struct gru_control_block_extended {
+ unsigned int reserved0:1; /* DW 0 - low */
+ unsigned int imacpy:3;
+ unsigned int reserved1:4;
+ unsigned int xtypecpy:3;
+ unsigned int iaa0cpy:2;
+ unsigned int iaa1cpy:2;
+ unsigned int reserved2:1;
+ unsigned int opccpy:8;
+ unsigned int exopccpy:8;
+
+ unsigned int idef2cpy:22; /* DW 0 - high */
+ unsigned int reserved3:10;
+
+ unsigned int idef4cpy:22; /* DW 1 */
+ unsigned int reserved4:10;
+ unsigned int idef4upd:22;
+ unsigned int reserved5:10;
+
+ unsigned long idef1upd:64; /* DW 2 */
+
+ unsigned long idef5cpy:64; /* DW 3 */
+
+ unsigned long idef6cpy:64; /* DW 4 */
+
+ unsigned long idef3upd:64; /* DW 5 */
+
+ unsigned long idef5upd:64; /* DW 6 */
+
+ unsigned int idef2upd:22; /* DW 7 */
+ unsigned int reserved6:10;
+
+ unsigned int ecause:20;
+ unsigned int cbrstate:4;
+ unsigned int cbrexecstatus:8;
+};
+
+/* CBE fields for active BCOPY instructions */
+#define cbe_baddr0 idef1upd
+#define cbe_baddr1 idef3upd
+#define cbe_src_cl idef6cpy
+#define cbe_nelemcur idef5upd
+
+enum gru_cbr_state {
+ CBRSTATE_INACTIVE,
+ CBRSTATE_IDLE,
+ CBRSTATE_PE_CHECK,
+ CBRSTATE_QUEUED,
+ CBRSTATE_WAIT_RESPONSE,
+ CBRSTATE_INTERRUPTED,
+ CBRSTATE_INTERRUPTED_MISS_FMM,
+ CBRSTATE_BUSY_INTERRUPT_MISS_FMM,
+ CBRSTATE_INTERRUPTED_MISS_UPM,
+ CBRSTATE_BUSY_INTERRUPTED_MISS_UPM,
+ CBRSTATE_REQUEST_ISSUE,
+ CBRSTATE_BUSY_INTERRUPT,
+};
+
+/* CBE cbrexecstatus bits - defined in gru_instructions.h*/
+/* CBE ecause bits - defined in gru_instructions.h */
+
+/*
+ * Convert a processor pagesize into the strange encoded pagesize used by the
+ * GRU. Processor pagesize is encoded as log of bytes per page. (or PAGE_SHIFT)
+ * pagesize log pagesize grupagesize
+ * 4k 12 0
+ * 16k 14 1
+ * 64k 16 2
+ * 256k 18 3
+ * 1m 20 4
+ * 2m 21 5
+ * 4m 22 6
+ * 16m 24 7
+ * 64m 26 8
+ * ...
+ */
+#define GRU_PAGESIZE(sh) ((((sh) > 20 ? (sh) + 2 : (sh)) >> 1) - 6)
+#define GRU_SIZEAVAIL(sh) (1UL << GRU_PAGESIZE(sh))
+
+/* minimum TLB purge count to ensure a full purge */
+#define GRUMAXINVAL 1024UL
+
+int cch_allocate(struct gru_context_configuration_handle *cch);
+int cch_start(struct gru_context_configuration_handle *cch);
+int cch_interrupt(struct gru_context_configuration_handle *cch);
+int cch_deallocate(struct gru_context_configuration_handle *cch);
+int cch_interrupt_sync(struct gru_context_configuration_handle *cch);
+int tgh_invalidate(struct gru_tlb_global_handle *tgh, unsigned long vaddr,
+ unsigned long vaddrmask, int asid, int pagesize, int global, int n,
+ unsigned short ctxbitmap);
+int tfh_write_only(struct gru_tlb_fault_handle *tfh, unsigned long paddr,
+ int gaa, unsigned long vaddr, int asid, int dirty, int pagesize);
+void tfh_write_restart(struct gru_tlb_fault_handle *tfh, unsigned long paddr,
+ int gaa, unsigned long vaddr, int asid, int dirty, int pagesize);
+void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh);
+void tfh_exception(struct gru_tlb_fault_handle *tfh);
+
+#endif /* __GRUHANDLES_H__ */
diff --git a/drivers/misc/sgi-gru/grukdump.c b/drivers/misc/sgi-gru/grukdump.c
new file mode 100644
index 000000000..1540a7785
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukdump.c
@@ -0,0 +1,236 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * Dump GRU State
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <asm/uv/uv_hub.h>
+
+#include <linux/nospec.h>
+
+#include "gru.h"
+#include "grutables.h"
+#include "gruhandles.h"
+#include "grulib.h"
+
+#define CCH_LOCK_ATTEMPTS 10
+
+static int gru_user_copy_handle(void __user **dp, void *s)
+{
+ if (copy_to_user(*dp, s, GRU_HANDLE_BYTES))
+ return -1;
+ *dp += GRU_HANDLE_BYTES;
+ return 0;
+}
+
+static int gru_dump_context_data(void *grubase,
+ struct gru_context_configuration_handle *cch,
+ void __user *ubuf, int ctxnum, int dsrcnt,
+ int flush_cbrs)
+{
+ void *cb, *cbe, *tfh, *gseg;
+ int i, scr;
+
+ gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+ cb = gseg + GRU_CB_BASE;
+ cbe = grubase + GRU_CBE_BASE;
+ tfh = grubase + GRU_TFH_BASE;
+
+ for_each_cbr_in_allocation_map(i, &cch->cbr_allocation_map, scr) {
+ if (flush_cbrs)
+ gru_flush_cache(cb);
+ if (gru_user_copy_handle(&ubuf, cb))
+ goto fail;
+ if (gru_user_copy_handle(&ubuf, tfh + i * GRU_HANDLE_STRIDE))
+ goto fail;
+ if (gru_user_copy_handle(&ubuf, cbe + i * GRU_HANDLE_STRIDE))
+ goto fail;
+ cb += GRU_HANDLE_STRIDE;
+ }
+ if (dsrcnt)
+ memcpy(ubuf, gseg + GRU_DS_BASE, dsrcnt * GRU_HANDLE_STRIDE);
+ return 0;
+
+fail:
+ return -EFAULT;
+}
+
+static int gru_dump_tfm(struct gru_state *gru,
+ void __user *ubuf, void __user *ubufend)
+{
+ struct gru_tlb_fault_map *tfm;
+ int i;
+
+ if (GRU_NUM_TFM * GRU_CACHE_LINE_BYTES > ubufend - ubuf)
+ return -EFBIG;
+
+ for (i = 0; i < GRU_NUM_TFM; i++) {
+ tfm = get_tfm(gru->gs_gru_base_vaddr, i);
+ if (gru_user_copy_handle(&ubuf, tfm))
+ goto fail;
+ }
+ return GRU_NUM_TFM * GRU_CACHE_LINE_BYTES;
+
+fail:
+ return -EFAULT;
+}
+
+static int gru_dump_tgh(struct gru_state *gru,
+ void __user *ubuf, void __user *ubufend)
+{
+ struct gru_tlb_global_handle *tgh;
+ int i;
+
+ if (GRU_NUM_TGH * GRU_CACHE_LINE_BYTES > ubufend - ubuf)
+ return -EFBIG;
+
+ for (i = 0; i < GRU_NUM_TGH; i++) {
+ tgh = get_tgh(gru->gs_gru_base_vaddr, i);
+ if (gru_user_copy_handle(&ubuf, tgh))
+ goto fail;
+ }
+ return GRU_NUM_TGH * GRU_CACHE_LINE_BYTES;
+
+fail:
+ return -EFAULT;
+}
+
+static int gru_dump_context(struct gru_state *gru, int ctxnum,
+ void __user *ubuf, void __user *ubufend, char data_opt,
+ char lock_cch, char flush_cbrs)
+{
+ struct gru_dump_context_header hdr;
+ struct gru_dump_context_header __user *uhdr = ubuf;
+ struct gru_context_configuration_handle *cch, *ubufcch;
+ struct gru_thread_state *gts;
+ int try, cch_locked, cbrcnt = 0, dsrcnt = 0, bytes = 0, ret = 0;
+ void *grubase;
+
+ memset(&hdr, 0, sizeof(hdr));
+ grubase = gru->gs_gru_base_vaddr;
+ cch = get_cch(grubase, ctxnum);
+ for (try = 0; try < CCH_LOCK_ATTEMPTS; try++) {
+ cch_locked = trylock_cch_handle(cch);
+ if (cch_locked)
+ break;
+ msleep(1);
+ }
+
+ ubuf += sizeof(hdr);
+ ubufcch = ubuf;
+ if (gru_user_copy_handle(&ubuf, cch)) {
+ if (cch_locked)
+ unlock_cch_handle(cch);
+ return -EFAULT;
+ }
+ if (cch_locked)
+ ubufcch->delresp = 0;
+ bytes = sizeof(hdr) + GRU_CACHE_LINE_BYTES;
+
+ if (cch_locked || !lock_cch) {
+ gts = gru->gs_gts[ctxnum];
+ if (gts && gts->ts_vma) {
+ hdr.pid = gts->ts_tgid_owner;
+ hdr.vaddr = gts->ts_vma->vm_start;
+ }
+ if (cch->state != CCHSTATE_INACTIVE) {
+ cbrcnt = hweight64(cch->cbr_allocation_map) *
+ GRU_CBR_AU_SIZE;
+ dsrcnt = data_opt ? hweight32(cch->dsr_allocation_map) *
+ GRU_DSR_AU_CL : 0;
+ }
+ bytes += (3 * cbrcnt + dsrcnt) * GRU_CACHE_LINE_BYTES;
+ if (bytes > ubufend - ubuf)
+ ret = -EFBIG;
+ else
+ ret = gru_dump_context_data(grubase, cch, ubuf, ctxnum,
+ dsrcnt, flush_cbrs);
+ }
+ if (cch_locked)
+ unlock_cch_handle(cch);
+ if (ret)
+ return ret;
+
+ hdr.magic = GRU_DUMP_MAGIC;
+ hdr.gid = gru->gs_gid;
+ hdr.ctxnum = ctxnum;
+ hdr.cbrcnt = cbrcnt;
+ hdr.dsrcnt = dsrcnt;
+ hdr.cch_locked = cch_locked;
+ if (copy_to_user(uhdr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ return bytes;
+}
+
+int gru_dump_chiplet_request(unsigned long arg)
+{
+ struct gru_state *gru;
+ struct gru_dump_chiplet_state_req req;
+ void __user *ubuf;
+ void __user *ubufend;
+ int ctxnum, ret, cnt = 0;
+
+ if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+ return -EFAULT;
+
+ /* Currently, only dump by gid is implemented */
+ if (req.gid >= gru_max_gids)
+ return -EINVAL;
+ req.gid = array_index_nospec(req.gid, gru_max_gids);
+
+ gru = GID_TO_GRU(req.gid);
+ ubuf = req.buf;
+ ubufend = req.buf + req.buflen;
+
+ ret = gru_dump_tfm(gru, ubuf, ubufend);
+ if (ret < 0)
+ goto fail;
+ ubuf += ret;
+
+ ret = gru_dump_tgh(gru, ubuf, ubufend);
+ if (ret < 0)
+ goto fail;
+ ubuf += ret;
+
+ for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++) {
+ if (req.ctxnum == ctxnum || req.ctxnum < 0) {
+ ret = gru_dump_context(gru, ctxnum, ubuf, ubufend,
+ req.data_opt, req.lock_cch,
+ req.flush_cbrs);
+ if (ret < 0)
+ goto fail;
+ ubuf += ret;
+ cnt++;
+ }
+ }
+
+ if (copy_to_user((void __user *)arg, &req, sizeof(req)))
+ return -EFAULT;
+ return cnt;
+
+fail:
+ return ret;
+}
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
new file mode 100644
index 000000000..030769018
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -0,0 +1,1171 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * KERNEL SERVICES THAT USE THE GRU
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <asm/io_apic.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+#include "grukservices.h"
+#include "gru_instructions.h"
+#include <asm/uv/uv_hub.h>
+
+/*
+ * Kernel GRU Usage
+ *
+ * The following is an interim algorithm for management of kernel GRU
+ * resources. This will likely be replaced when we better understand the
+ * kernel/user requirements.
+ *
+ * Blade percpu resources reserved for kernel use. These resources are
+ * reserved whenever the the kernel context for the blade is loaded. Note
+ * that the kernel context is not guaranteed to be always available. It is
+ * loaded on demand & can be stolen by a user if the user demand exceeds the
+ * kernel demand. The kernel can always reload the kernel context but
+ * a SLEEP may be required!!!.
+ *
+ * Async Overview:
+ *
+ * Each blade has one "kernel context" that owns GRU kernel resources
+ * located on the blade. Kernel drivers use GRU resources in this context
+ * for sending messages, zeroing memory, etc.
+ *
+ * The kernel context is dynamically loaded on demand. If it is not in
+ * use by the kernel, the kernel context can be unloaded & given to a user.
+ * The kernel context will be reloaded when needed. This may require that
+ * a context be stolen from a user.
+ * NOTE: frequent unloading/reloading of the kernel context is
+ * expensive. We are depending on batch schedulers, cpusets, sane
+ * drivers or some other mechanism to prevent the need for frequent
+ * stealing/reloading.
+ *
+ * The kernel context consists of two parts:
+ * - 1 CB & a few DSRs that are reserved for each cpu on the blade.
+ * Each cpu has it's own private resources & does not share them
+ * with other cpus. These resources are used serially, ie,
+ * locked, used & unlocked on each call to a function in
+ * grukservices.
+ * (Now that we have dynamic loading of kernel contexts, I
+ * may rethink this & allow sharing between cpus....)
+ *
+ * - Additional resources can be reserved long term & used directly
+ * by UV drivers located in the kernel. Drivers using these GRU
+ * resources can use asynchronous GRU instructions that send
+ * interrupts on completion.
+ * - these resources must be explicitly locked/unlocked
+ * - locked resources prevent (obviously) the kernel
+ * context from being unloaded.
+ * - drivers using these resource directly issue their own
+ * GRU instruction and must wait/check completion.
+ *
+ * When these resources are reserved, the caller can optionally
+ * associate a wait_queue with the resources and use asynchronous
+ * GRU instructions. When an async GRU instruction completes, the
+ * driver will do a wakeup on the event.
+ *
+ */
+
+
+#define ASYNC_HAN_TO_BID(h) ((h) - 1)
+#define ASYNC_BID_TO_HAN(b) ((b) + 1)
+#define ASYNC_HAN_TO_BS(h) gru_base[ASYNC_HAN_TO_BID(h)]
+
+#define GRU_NUM_KERNEL_CBR 1
+#define GRU_NUM_KERNEL_DSR_BYTES 256
+#define GRU_NUM_KERNEL_DSR_CL (GRU_NUM_KERNEL_DSR_BYTES / \
+ GRU_CACHE_LINE_BYTES)
+
+/* GRU instruction attributes for all instructions */
+#define IMA IMA_CB_DELAY
+
+/* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
+#define __gru_cacheline_aligned__ \
+ __attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))
+
+#define MAGIC 0x1234567887654321UL
+
+/* Default retry count for GRU errors on kernel instructions */
+#define EXCEPTION_RETRY_LIMIT 3
+
+/* Status of message queue sections */
+#define MQS_EMPTY 0
+#define MQS_FULL 1
+#define MQS_NOOP 2
+
+/*----------------- RESOURCE MANAGEMENT -------------------------------------*/
+/* optimized for x86_64 */
+struct message_queue {
+ union gru_mesqhead head __gru_cacheline_aligned__; /* CL 0 */
+ int qlines; /* DW 1 */
+ long hstatus[2];
+ void *next __gru_cacheline_aligned__;/* CL 1 */
+ void *limit;
+ void *start;
+ void *start2;
+ char data ____cacheline_aligned; /* CL 2 */
+};
+
+/* First word in every message - used by mesq interface */
+struct message_header {
+ char present;
+ char present2;
+ char lines;
+ char fill;
+};
+
+#define HSTATUS(mq, h) ((mq) + offsetof(struct message_queue, hstatus[h]))
+
+/*
+ * Reload the blade's kernel context into a GRU chiplet. Called holding
+ * the bs_kgts_sema for READ. Will steal user contexts if necessary.
+ */
+static void gru_load_kernel_context(struct gru_blade_state *bs, int blade_id)
+{
+ struct gru_state *gru;
+ struct gru_thread_state *kgts;
+ void *vaddr;
+ int ctxnum, ncpus;
+
+ up_read(&bs->bs_kgts_sema);
+ down_write(&bs->bs_kgts_sema);
+
+ if (!bs->bs_kgts) {
+ do {
+ bs->bs_kgts = gru_alloc_gts(NULL, 0, 0, 0, 0, 0);
+ if (!IS_ERR(bs->bs_kgts))
+ break;
+ msleep(1);
+ } while (true);
+ bs->bs_kgts->ts_user_blade_id = blade_id;
+ }
+ kgts = bs->bs_kgts;
+
+ if (!kgts->ts_gru) {
+ STAT(load_kernel_context);
+ ncpus = uv_blade_nr_possible_cpus(blade_id);
+ kgts->ts_cbr_au_count = GRU_CB_COUNT_TO_AU(
+ GRU_NUM_KERNEL_CBR * ncpus + bs->bs_async_cbrs);
+ kgts->ts_dsr_au_count = GRU_DS_BYTES_TO_AU(
+ GRU_NUM_KERNEL_DSR_BYTES * ncpus +
+ bs->bs_async_dsr_bytes);
+ while (!gru_assign_gru_context(kgts)) {
+ msleep(1);
+ gru_steal_context(kgts);
+ }
+ gru_load_context(kgts);
+ gru = bs->bs_kgts->ts_gru;
+ vaddr = gru->gs_gru_base_vaddr;
+ ctxnum = kgts->ts_ctxnum;
+ bs->kernel_cb = get_gseg_base_address_cb(vaddr, ctxnum, 0);
+ bs->kernel_dsr = get_gseg_base_address_ds(vaddr, ctxnum, 0);
+ }
+ downgrade_write(&bs->bs_kgts_sema);
+}
+
+/*
+ * Free all kernel contexts that are not currently in use.
+ * Returns 0 if all freed, else number of inuse context.
+ */
+static int gru_free_kernel_contexts(void)
+{
+ struct gru_blade_state *bs;
+ struct gru_thread_state *kgts;
+ int bid, ret = 0;
+
+ for (bid = 0; bid < GRU_MAX_BLADES; bid++) {
+ bs = gru_base[bid];
+ if (!bs)
+ continue;
+
+ /* Ignore busy contexts. Don't want to block here. */
+ if (down_write_trylock(&bs->bs_kgts_sema)) {
+ kgts = bs->bs_kgts;
+ if (kgts && kgts->ts_gru)
+ gru_unload_context(kgts, 0);
+ bs->bs_kgts = NULL;
+ up_write(&bs->bs_kgts_sema);
+ kfree(kgts);
+ } else {
+ ret++;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Lock & load the kernel context for the specified blade.
+ */
+static struct gru_blade_state *gru_lock_kernel_context(int blade_id)
+{
+ struct gru_blade_state *bs;
+ int bid;
+
+ STAT(lock_kernel_context);
+again:
+ bid = blade_id < 0 ? uv_numa_blade_id() : blade_id;
+ bs = gru_base[bid];
+
+ /* Handle the case where migration occurred while waiting for the sema */
+ down_read(&bs->bs_kgts_sema);
+ if (blade_id < 0 && bid != uv_numa_blade_id()) {
+ up_read(&bs->bs_kgts_sema);
+ goto again;
+ }
+ if (!bs->bs_kgts || !bs->bs_kgts->ts_gru)
+ gru_load_kernel_context(bs, bid);
+ return bs;
+
+}
+
+/*
+ * Unlock the kernel context for the specified blade. Context is not
+ * unloaded but may be stolen before next use.
+ */
+static void gru_unlock_kernel_context(int blade_id)
+{
+ struct gru_blade_state *bs;
+
+ bs = gru_base[blade_id];
+ up_read(&bs->bs_kgts_sema);
+ STAT(unlock_kernel_context);
+}
+
+/*
+ * Reserve & get pointers to the DSR/CBRs reserved for the current cpu.
+ * - returns with preemption disabled
+ */
+static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
+{
+ struct gru_blade_state *bs;
+ int lcpu;
+
+ BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
+ preempt_disable();
+ bs = gru_lock_kernel_context(-1);
+ lcpu = uv_blade_processor_id();
+ *cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
+ *dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
+ return 0;
+}
+
+/*
+ * Free the current cpus reserved DSR/CBR resources.
+ */
+static void gru_free_cpu_resources(void *cb, void *dsr)
+{
+ gru_unlock_kernel_context(uv_numa_blade_id());
+ preempt_enable();
+}
+
+/*
+ * Reserve GRU resources to be used asynchronously.
+ * Note: currently supports only 1 reservation per blade.
+ *
+ * input:
+ * blade_id - blade on which resources should be reserved
+ * cbrs - number of CBRs
+ * dsr_bytes - number of DSR bytes needed
+ * output:
+ * handle to identify resource
+ * (0 = async resources already reserved)
+ */
+unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
+ struct completion *cmp)
+{
+ struct gru_blade_state *bs;
+ struct gru_thread_state *kgts;
+ int ret = 0;
+
+ bs = gru_base[blade_id];
+
+ down_write(&bs->bs_kgts_sema);
+
+ /* Verify no resources already reserved */
+ if (bs->bs_async_dsr_bytes + bs->bs_async_cbrs)
+ goto done;
+ bs->bs_async_dsr_bytes = dsr_bytes;
+ bs->bs_async_cbrs = cbrs;
+ bs->bs_async_wq = cmp;
+ kgts = bs->bs_kgts;
+
+ /* Resources changed. Unload context if already loaded */
+ if (kgts && kgts->ts_gru)
+ gru_unload_context(kgts, 0);
+ ret = ASYNC_BID_TO_HAN(blade_id);
+
+done:
+ up_write(&bs->bs_kgts_sema);
+ return ret;
+}
+
+/*
+ * Release async resources previously reserved.
+ *
+ * input:
+ * han - handle to identify resources
+ */
+void gru_release_async_resources(unsigned long han)
+{
+ struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
+
+ down_write(&bs->bs_kgts_sema);
+ bs->bs_async_dsr_bytes = 0;
+ bs->bs_async_cbrs = 0;
+ bs->bs_async_wq = NULL;
+ up_write(&bs->bs_kgts_sema);
+}
+
+/*
+ * Wait for async GRU instructions to complete.
+ *
+ * input:
+ * han - handle to identify resources
+ */
+void gru_wait_async_cbr(unsigned long han)
+{
+ struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
+
+ wait_for_completion(bs->bs_async_wq);
+ mb();
+}
+
+/*
+ * Lock previous reserved async GRU resources
+ *
+ * input:
+ * han - handle to identify resources
+ * output:
+ * cb - pointer to first CBR
+ * dsr - pointer to first DSR
+ */
+void gru_lock_async_resource(unsigned long han, void **cb, void **dsr)
+{
+ struct gru_blade_state *bs = ASYNC_HAN_TO_BS(han);
+ int blade_id = ASYNC_HAN_TO_BID(han);
+ int ncpus;
+
+ gru_lock_kernel_context(blade_id);
+ ncpus = uv_blade_nr_possible_cpus(blade_id);
+ if (cb)
+ *cb = bs->kernel_cb + ncpus * GRU_HANDLE_STRIDE;
+ if (dsr)
+ *dsr = bs->kernel_dsr + ncpus * GRU_NUM_KERNEL_DSR_BYTES;
+}
+
+/*
+ * Unlock previous reserved async GRU resources
+ *
+ * input:
+ * han - handle to identify resources
+ */
+void gru_unlock_async_resource(unsigned long han)
+{
+ int blade_id = ASYNC_HAN_TO_BID(han);
+
+ gru_unlock_kernel_context(blade_id);
+}
+
+/*----------------------------------------------------------------------*/
+int gru_get_cb_exception_detail(void *cb,
+ struct control_block_extended_exc_detail *excdet)
+{
+ struct gru_control_block_extended *cbe;
+ struct gru_thread_state *kgts = NULL;
+ unsigned long off;
+ int cbrnum, bid;
+
+ /*
+ * Locate kgts for cb. This algorithm is SLOW but
+ * this function is rarely called (ie., almost never).
+ * Performance does not matter.
+ */
+ for_each_possible_blade(bid) {
+ if (!gru_base[bid])
+ break;
+ kgts = gru_base[bid]->bs_kgts;
+ if (!kgts || !kgts->ts_gru)
+ continue;
+ off = cb - kgts->ts_gru->gs_gru_base_vaddr;
+ if (off < GRU_SIZE)
+ break;
+ kgts = NULL;
+ }
+ BUG_ON(!kgts);
+ cbrnum = thread_cbr_number(kgts, get_cb_number(cb));
+ cbe = get_cbe(GRUBASE(cb), cbrnum);
+ gru_flush_cache(cbe); /* CBE not coherent */
+ sync_core();
+ excdet->opc = cbe->opccpy;
+ excdet->exopc = cbe->exopccpy;
+ excdet->ecause = cbe->ecause;
+ excdet->exceptdet0 = cbe->idef1upd;
+ excdet->exceptdet1 = cbe->idef3upd;
+ gru_flush_cache(cbe);
+ return 0;
+}
+
+static char *gru_get_cb_exception_detail_str(int ret, void *cb,
+ char *buf, int size)
+{
+ struct gru_control_block_status *gen = (void *)cb;
+ struct control_block_extended_exc_detail excdet;
+
+ if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
+ gru_get_cb_exception_detail(cb, &excdet);
+ snprintf(buf, size,
+ "GRU:%d exception: cb %p, opc %d, exopc %d, ecause 0x%x,"
+ "excdet0 0x%lx, excdet1 0x%x", smp_processor_id(),
+ gen, excdet.opc, excdet.exopc, excdet.ecause,
+ excdet.exceptdet0, excdet.exceptdet1);
+ } else {
+ snprintf(buf, size, "No exception");
+ }
+ return buf;
+}
+
+static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
+{
+ while (gen->istatus >= CBS_ACTIVE) {
+ cpu_relax();
+ barrier();
+ }
+ return gen->istatus;
+}
+
+static int gru_retry_exception(void *cb)
+{
+ struct gru_control_block_status *gen = (void *)cb;
+ struct control_block_extended_exc_detail excdet;
+ int retry = EXCEPTION_RETRY_LIMIT;
+
+ while (1) {
+ if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
+ return CBS_IDLE;
+ if (gru_get_cb_message_queue_substatus(cb))
+ return CBS_EXCEPTION;
+ gru_get_cb_exception_detail(cb, &excdet);
+ if ((excdet.ecause & ~EXCEPTION_RETRY_BITS) ||
+ (excdet.cbrexecstatus & CBR_EXS_ABORT_OCC))
+ break;
+ if (retry-- == 0)
+ break;
+ gen->icmd = 1;
+ gru_flush_cache(gen);
+ }
+ return CBS_EXCEPTION;
+}
+
+int gru_check_status_proc(void *cb)
+{
+ struct gru_control_block_status *gen = (void *)cb;
+ int ret;
+
+ ret = gen->istatus;
+ if (ret == CBS_EXCEPTION)
+ ret = gru_retry_exception(cb);
+ rmb();
+ return ret;
+
+}
+
+int gru_wait_proc(void *cb)
+{
+ struct gru_control_block_status *gen = (void *)cb;
+ int ret;
+
+ ret = gru_wait_idle_or_exception(gen);
+ if (ret == CBS_EXCEPTION)
+ ret = gru_retry_exception(cb);
+ rmb();
+ return ret;
+}
+
+static void gru_abort(int ret, void *cb, char *str)
+{
+ char buf[GRU_EXC_STR_SIZE];
+
+ panic("GRU FATAL ERROR: %s - %s\n", str,
+ gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
+}
+
+void gru_wait_abort_proc(void *cb)
+{
+ int ret;
+
+ ret = gru_wait_proc(cb);
+ if (ret)
+ gru_abort(ret, cb, "gru_wait_abort");
+}
+
+
+/*------------------------------ MESSAGE QUEUES -----------------------------*/
+
+/* Internal status . These are NOT returned to the user. */
+#define MQIE_AGAIN -1 /* try again */
+
+
+/*
+ * Save/restore the "present" flag that is in the second line of 2-line
+ * messages
+ */
+static inline int get_present2(void *p)
+{
+ struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
+ return mhdr->present;
+}
+
+static inline void restore_present2(void *p, int val)
+{
+ struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
+ mhdr->present = val;
+}
+
+/*
+ * Create a message queue.
+ * qlines - message queue size in cache lines. Includes 2-line header.
+ */
+int gru_create_message_queue(struct gru_message_queue_desc *mqd,
+ void *p, unsigned int bytes, int nasid, int vector, int apicid)
+{
+ struct message_queue *mq = p;
+ unsigned int qlines;
+
+ qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
+ memset(mq, 0, bytes);
+ mq->start = &mq->data;
+ mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
+ mq->next = &mq->data;
+ mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
+ mq->qlines = qlines;
+ mq->hstatus[0] = 0;
+ mq->hstatus[1] = 1;
+ mq->head = gru_mesq_head(2, qlines / 2 + 1);
+ mqd->mq = mq;
+ mqd->mq_gpa = uv_gpa(mq);
+ mqd->qlines = qlines;
+ mqd->interrupt_pnode = nasid >> 1;
+ mqd->interrupt_vector = vector;
+ mqd->interrupt_apicid = apicid;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gru_create_message_queue);
+
+/*
+ * Send a NOOP message to a message queue
+ * Returns:
+ * 0 - if queue is full after the send. This is the normal case
+ * but various races can change this.
+ * -1 - if mesq sent successfully but queue not full
+ * >0 - unexpected error. MQE_xxx returned
+ */
+static int send_noop_message(void *cb, struct gru_message_queue_desc *mqd,
+ void *mesg)
+{
+ const struct message_header noop_header = {
+ .present = MQS_NOOP, .lines = 1};
+ unsigned long m;
+ int substatus, ret;
+ struct message_header save_mhdr, *mhdr = mesg;
+
+ STAT(mesq_noop);
+ save_mhdr = *mhdr;
+ *mhdr = noop_header;
+ gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), 1, IMA);
+ ret = gru_wait(cb);
+
+ if (ret) {
+ substatus = gru_get_cb_message_queue_substatus(cb);
+ switch (substatus) {
+ case CBSS_NO_ERROR:
+ STAT(mesq_noop_unexpected_error);
+ ret = MQE_UNEXPECTED_CB_ERR;
+ break;
+ case CBSS_LB_OVERFLOWED:
+ STAT(mesq_noop_lb_overflow);
+ ret = MQE_CONGESTION;
+ break;
+ case CBSS_QLIMIT_REACHED:
+ STAT(mesq_noop_qlimit_reached);
+ ret = 0;
+ break;
+ case CBSS_AMO_NACKED:
+ STAT(mesq_noop_amo_nacked);
+ ret = MQE_CONGESTION;
+ break;
+ case CBSS_PUT_NACKED:
+ STAT(mesq_noop_put_nacked);
+ m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
+ gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
+ IMA);
+ if (gru_wait(cb) == CBS_IDLE)
+ ret = MQIE_AGAIN;
+ else
+ ret = MQE_UNEXPECTED_CB_ERR;
+ break;
+ case CBSS_PAGE_OVERFLOW:
+ STAT(mesq_noop_page_overflow);
+ /* fallthru */
+ default:
+ BUG();
+ }
+ }
+ *mhdr = save_mhdr;
+ return ret;
+}
+
+/*
+ * Handle a gru_mesq full.
+ */
+static int send_message_queue_full(void *cb, struct gru_message_queue_desc *mqd,
+ void *mesg, int lines)
+{
+ union gru_mesqhead mqh;
+ unsigned int limit, head;
+ unsigned long avalue;
+ int half, qlines;
+
+ /* Determine if switching to first/second half of q */
+ avalue = gru_get_amo_value(cb);
+ head = gru_get_amo_value_head(cb);
+ limit = gru_get_amo_value_limit(cb);
+
+ qlines = mqd->qlines;
+ half = (limit != qlines);
+
+ if (half)
+ mqh = gru_mesq_head(qlines / 2 + 1, qlines);
+ else
+ mqh = gru_mesq_head(2, qlines / 2 + 1);
+
+ /* Try to get lock for switching head pointer */
+ gru_gamir(cb, EOP_IR_CLR, HSTATUS(mqd->mq_gpa, half), XTYPE_DW, IMA);
+ if (gru_wait(cb) != CBS_IDLE)
+ goto cberr;
+ if (!gru_get_amo_value(cb)) {
+ STAT(mesq_qf_locked);
+ return MQE_QUEUE_FULL;
+ }
+
+ /* Got the lock. Send optional NOP if queue not full, */
+ if (head != limit) {
+ if (send_noop_message(cb, mqd, mesg)) {
+ gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half),
+ XTYPE_DW, IMA);
+ if (gru_wait(cb) != CBS_IDLE)
+ goto cberr;
+ STAT(mesq_qf_noop_not_full);
+ return MQIE_AGAIN;
+ }
+ avalue++;
+ }
+
+ /* Then flip queuehead to other half of queue. */
+ gru_gamer(cb, EOP_ERR_CSWAP, mqd->mq_gpa, XTYPE_DW, mqh.val, avalue,
+ IMA);
+ if (gru_wait(cb) != CBS_IDLE)
+ goto cberr;
+
+ /* If not successfully in swapping queue head, clear the hstatus lock */
+ if (gru_get_amo_value(cb) != avalue) {
+ STAT(mesq_qf_switch_head_failed);
+ gru_gamir(cb, EOP_IR_INC, HSTATUS(mqd->mq_gpa, half), XTYPE_DW,
+ IMA);
+ if (gru_wait(cb) != CBS_IDLE)
+ goto cberr;
+ }
+ return MQIE_AGAIN;
+cberr:
+ STAT(mesq_qf_unexpected_error);
+ return MQE_UNEXPECTED_CB_ERR;
+}
+
+/*
+ * Handle a PUT failure. Note: if message was a 2-line message, one of the
+ * lines might have successfully have been written. Before sending the
+ * message, "present" must be cleared in BOTH lines to prevent the receiver
+ * from prematurely seeing the full message.
+ */
+static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd,
+ void *mesg, int lines)
+{
+ unsigned long m;
+ int ret, loops = 200; /* experimentally determined */
+
+ m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6);
+ if (lines == 2) {
+ gru_vset(cb, m, 0, XTYPE_CL, lines, 1, IMA);
+ if (gru_wait(cb) != CBS_IDLE)
+ return MQE_UNEXPECTED_CB_ERR;
+ }
+ gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
+ if (gru_wait(cb) != CBS_IDLE)
+ return MQE_UNEXPECTED_CB_ERR;
+
+ if (!mqd->interrupt_vector)
+ return MQE_OK;
+
+ /*
+ * Send a noop message in order to deliver a cross-partition interrupt
+ * to the SSI that contains the target message queue. Normally, the
+ * interrupt is automatically delivered by hardware following mesq
+ * operations, but some error conditions require explicit delivery.
+ * The noop message will trigger delivery. Otherwise partition failures
+ * could cause unrecovered errors.
+ */
+ do {
+ ret = send_noop_message(cb, mqd, mesg);
+ } while ((ret == MQIE_AGAIN || ret == MQE_CONGESTION) && (loops-- > 0));
+
+ if (ret == MQIE_AGAIN || ret == MQE_CONGESTION) {
+ /*
+ * Don't indicate to the app to resend the message, as it's
+ * already been successfully sent. We simply send an OK
+ * (rather than fail the send with MQE_UNEXPECTED_CB_ERR),
+ * assuming that the other side is receiving enough
+ * interrupts to get this message processed anyway.
+ */
+ ret = MQE_OK;
+ }
+ return ret;
+}
+
+/*
+ * Handle a gru_mesq failure. Some of these failures are software recoverable
+ * or retryable.
+ */
+static int send_message_failure(void *cb, struct gru_message_queue_desc *mqd,
+ void *mesg, int lines)
+{
+ int substatus, ret = 0;
+
+ substatus = gru_get_cb_message_queue_substatus(cb);
+ switch (substatus) {
+ case CBSS_NO_ERROR:
+ STAT(mesq_send_unexpected_error);
+ ret = MQE_UNEXPECTED_CB_ERR;
+ break;
+ case CBSS_LB_OVERFLOWED:
+ STAT(mesq_send_lb_overflow);
+ ret = MQE_CONGESTION;
+ break;
+ case CBSS_QLIMIT_REACHED:
+ STAT(mesq_send_qlimit_reached);
+ ret = send_message_queue_full(cb, mqd, mesg, lines);
+ break;
+ case CBSS_AMO_NACKED:
+ STAT(mesq_send_amo_nacked);
+ ret = MQE_CONGESTION;
+ break;
+ case CBSS_PUT_NACKED:
+ STAT(mesq_send_put_nacked);
+ ret = send_message_put_nacked(cb, mqd, mesg, lines);
+ break;
+ case CBSS_PAGE_OVERFLOW:
+ STAT(mesq_page_overflow);
+ /* fallthru */
+ default:
+ BUG();
+ }
+ return ret;
+}
+
+/*
+ * Send a message to a message queue
+ * mqd message queue descriptor
+ * mesg message. ust be vaddr within a GSEG
+ * bytes message size (<= 2 CL)
+ */
+int gru_send_message_gpa(struct gru_message_queue_desc *mqd, void *mesg,
+ unsigned int bytes)
+{
+ struct message_header *mhdr;
+ void *cb;
+ void *dsr;
+ int istatus, clines, ret;
+
+ STAT(mesq_send);
+ BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);
+
+ clines = DIV_ROUND_UP(bytes, GRU_CACHE_LINE_BYTES);
+ if (gru_get_cpu_resources(bytes, &cb, &dsr))
+ return MQE_BUG_NO_RESOURCES;
+ memcpy(dsr, mesg, bytes);
+ mhdr = dsr;
+ mhdr->present = MQS_FULL;
+ mhdr->lines = clines;
+ if (clines == 2) {
+ mhdr->present2 = get_present2(mhdr);
+ restore_present2(mhdr, MQS_FULL);
+ }
+
+ do {
+ ret = MQE_OK;
+ gru_mesq(cb, mqd->mq_gpa, gru_get_tri(mhdr), clines, IMA);
+ istatus = gru_wait(cb);
+ if (istatus != CBS_IDLE)
+ ret = send_message_failure(cb, mqd, dsr, clines);
+ } while (ret == MQIE_AGAIN);
+ gru_free_cpu_resources(cb, dsr);
+
+ if (ret)
+ STAT(mesq_send_failed);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gru_send_message_gpa);
+
+/*
+ * Advance the receive pointer for the queue to the next message.
+ */
+void gru_free_message(struct gru_message_queue_desc *mqd, void *mesg)
+{
+ struct message_queue *mq = mqd->mq;
+ struct message_header *mhdr = mq->next;
+ void *next, *pnext;
+ int half = -1;
+ int lines = mhdr->lines;
+
+ if (lines == 2)
+ restore_present2(mhdr, MQS_EMPTY);
+ mhdr->present = MQS_EMPTY;
+
+ pnext = mq->next;
+ next = pnext + GRU_CACHE_LINE_BYTES * lines;
+ if (next == mq->limit) {
+ next = mq->start;
+ half = 1;
+ } else if (pnext < mq->start2 && next >= mq->start2) {
+ half = 0;
+ }
+
+ if (half >= 0)
+ mq->hstatus[half] = 1;
+ mq->next = next;
+}
+EXPORT_SYMBOL_GPL(gru_free_message);
+
+/*
+ * Get next message from message queue. Return NULL if no message
+ * present. User must call next_message() to move to next message.
+ * rmq message queue
+ */
+void *gru_get_next_message(struct gru_message_queue_desc *mqd)
+{
+ struct message_queue *mq = mqd->mq;
+ struct message_header *mhdr = mq->next;
+ int present = mhdr->present;
+
+ /* skip NOOP messages */
+ while (present == MQS_NOOP) {
+ gru_free_message(mqd, mhdr);
+ mhdr = mq->next;
+ present = mhdr->present;
+ }
+
+ /* Wait for both halves of 2 line messages */
+ if (present == MQS_FULL && mhdr->lines == 2 &&
+ get_present2(mhdr) == MQS_EMPTY)
+ present = MQS_EMPTY;
+
+ if (!present) {
+ STAT(mesq_receive_none);
+ return NULL;
+ }
+
+ if (mhdr->lines == 2)
+ restore_present2(mhdr, mhdr->present2);
+
+ STAT(mesq_receive);
+ return mhdr;
+}
+EXPORT_SYMBOL_GPL(gru_get_next_message);
+
+/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/
+
+/*
+ * Load a DW from a global GPA. The GPA can be a memory or MMR address.
+ */
+int gru_read_gpa(unsigned long *value, unsigned long gpa)
+{
+ void *cb;
+ void *dsr;
+ int ret, iaa;
+
+ STAT(read_gpa);
+ if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
+ return MQE_BUG_NO_RESOURCES;
+ iaa = gpa >> 62;
+ gru_vload_phys(cb, gpa, gru_get_tri(dsr), iaa, IMA);
+ ret = gru_wait(cb);
+ if (ret == CBS_IDLE)
+ *value = *(unsigned long *)dsr;
+ gru_free_cpu_resources(cb, dsr);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gru_read_gpa);
+
+
+/*
+ * Copy a block of data using the GRU resources
+ */
+int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
+ unsigned int bytes)
+{
+ void *cb;
+ void *dsr;
+ int ret;
+
+ STAT(copy_gpa);
+ if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
+ return MQE_BUG_NO_RESOURCES;
+ gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
+ XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_CL, IMA);
+ ret = gru_wait(cb);
+ gru_free_cpu_resources(cb, dsr);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gru_copy_gpa);
+
+/* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
+/* Temp - will delete after we gain confidence in the GRU */
+
+static int quicktest0(unsigned long arg)
+{
+ unsigned long word0;
+ unsigned long word1;
+ void *cb;
+ void *dsr;
+ unsigned long *p;
+ int ret = -EIO;
+
+ if (gru_get_cpu_resources(GRU_CACHE_LINE_BYTES, &cb, &dsr))
+ return MQE_BUG_NO_RESOURCES;
+ p = dsr;
+ word0 = MAGIC;
+ word1 = 0;
+
+ gru_vload(cb, uv_gpa(&word0), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
+ if (gru_wait(cb) != CBS_IDLE) {
+ printk(KERN_DEBUG "GRU:%d quicktest0: CBR failure 1\n", smp_processor_id());
+ goto done;
+ }
+
+ if (*p != MAGIC) {
+ printk(KERN_DEBUG "GRU:%d quicktest0 bad magic 0x%lx\n", smp_processor_id(), *p);
+ goto done;
+ }
+ gru_vstore(cb, uv_gpa(&word1), gru_get_tri(dsr), XTYPE_DW, 1, 1, IMA);
+ if (gru_wait(cb) != CBS_IDLE) {
+ printk(KERN_DEBUG "GRU:%d quicktest0: CBR failure 2\n", smp_processor_id());
+ goto done;
+ }
+
+ if (word0 != word1 || word1 != MAGIC) {
+ printk(KERN_DEBUG
+ "GRU:%d quicktest0 err: found 0x%lx, expected 0x%lx\n",
+ smp_processor_id(), word1, MAGIC);
+ goto done;
+ }
+ ret = 0;
+
+done:
+ gru_free_cpu_resources(cb, dsr);
+ return ret;
+}
+
+#define ALIGNUP(p, q) ((void *)(((unsigned long)(p) + (q) - 1) & ~(q - 1)))
+
+static int quicktest1(unsigned long arg)
+{
+ struct gru_message_queue_desc mqd;
+ void *p, *mq;
+ int i, ret = -EIO;
+ char mes[GRU_CACHE_LINE_BYTES], *m;
+
+ /* Need 1K cacheline aligned that does not cross page boundary */
+ p = kmalloc(4096, 0);
+ if (p == NULL)
+ return -ENOMEM;
+ mq = ALIGNUP(p, 1024);
+ memset(mes, 0xee, sizeof(mes));
+
+ gru_create_message_queue(&mqd, mq, 8 * GRU_CACHE_LINE_BYTES, 0, 0, 0);
+ for (i = 0; i < 6; i++) {
+ mes[8] = i;
+ do {
+ ret = gru_send_message_gpa(&mqd, mes, sizeof(mes));
+ } while (ret == MQE_CONGESTION);
+ if (ret)
+ break;
+ }
+ if (ret != MQE_QUEUE_FULL || i != 4) {
+ printk(KERN_DEBUG "GRU:%d quicktest1: unexpect status %d, i %d\n",
+ smp_processor_id(), ret, i);
+ goto done;
+ }
+
+ for (i = 0; i < 6; i++) {
+ m = gru_get_next_message(&mqd);
+ if (!m || m[8] != i)
+ break;
+ gru_free_message(&mqd, m);
+ }
+ if (i != 4) {
+ printk(KERN_DEBUG "GRU:%d quicktest2: bad message, i %d, m %p, m8 %d\n",
+ smp_processor_id(), i, m, m ? m[8] : -1);
+ goto done;
+ }
+ ret = 0;
+
+done:
+ kfree(p);
+ return ret;
+}
+
+static int quicktest2(unsigned long arg)
+{
+ static DECLARE_COMPLETION(cmp);
+ unsigned long han;
+ int blade_id = 0;
+ int numcb = 4;
+ int ret = 0;
+ unsigned long *buf;
+ void *cb0, *cb;
+ struct gru_control_block_status *gen;
+ int i, k, istatus, bytes;
+
+ bytes = numcb * 4 * 8;
+ buf = kmalloc(bytes, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ ret = -EBUSY;
+ han = gru_reserve_async_resources(blade_id, numcb, 0, &cmp);
+ if (!han)
+ goto done;
+
+ gru_lock_async_resource(han, &cb0, NULL);
+ memset(buf, 0xee, bytes);
+ for (i = 0; i < numcb; i++)
+ gru_vset(cb0 + i * GRU_HANDLE_STRIDE, uv_gpa(&buf[i * 4]), 0,
+ XTYPE_DW, 4, 1, IMA_INTERRUPT);
+
+ ret = 0;
+ k = numcb;
+ do {
+ gru_wait_async_cbr(han);
+ for (i = 0; i < numcb; i++) {
+ cb = cb0 + i * GRU_HANDLE_STRIDE;
+ istatus = gru_check_status(cb);
+ if (istatus != CBS_ACTIVE && istatus != CBS_CALL_OS)
+ break;
+ }
+ if (i == numcb)
+ continue;
+ if (istatus != CBS_IDLE) {
+ printk(KERN_DEBUG "GRU:%d quicktest2: cb %d, exception\n", smp_processor_id(), i);
+ ret = -EFAULT;
+ } else if (buf[4 * i] || buf[4 * i + 1] || buf[4 * i + 2] ||
+ buf[4 * i + 3]) {
+ printk(KERN_DEBUG "GRU:%d quicktest2:cb %d, buf 0x%lx, 0x%lx, 0x%lx, 0x%lx\n",
+ smp_processor_id(), i, buf[4 * i], buf[4 * i + 1], buf[4 * i + 2], buf[4 * i + 3]);
+ ret = -EIO;
+ }
+ k--;
+ gen = cb;
+ gen->istatus = CBS_CALL_OS; /* don't handle this CBR again */
+ } while (k);
+ BUG_ON(cmp.done);
+
+ gru_unlock_async_resource(han);
+ gru_release_async_resources(han);
+done:
+ kfree(buf);
+ return ret;
+}
+
+#define BUFSIZE 200
+static int quicktest3(unsigned long arg)
+{
+ char buf1[BUFSIZE], buf2[BUFSIZE];
+ int ret = 0;
+
+ memset(buf2, 0, sizeof(buf2));
+ memset(buf1, get_cycles() & 255, sizeof(buf1));
+ gru_copy_gpa(uv_gpa(buf2), uv_gpa(buf1), BUFSIZE);
+ if (memcmp(buf1, buf2, BUFSIZE)) {
+ printk(KERN_DEBUG "GRU:%d quicktest3 error\n", smp_processor_id());
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/*
+ * Debugging only. User hook for various kernel tests
+ * of driver & gru.
+ */
+int gru_ktest(unsigned long arg)
+{
+ int ret = -EINVAL;
+
+ switch (arg & 0xff) {
+ case 0:
+ ret = quicktest0(arg);
+ break;
+ case 1:
+ ret = quicktest1(arg);
+ break;
+ case 2:
+ ret = quicktest2(arg);
+ break;
+ case 3:
+ ret = quicktest3(arg);
+ break;
+ case 99:
+ ret = gru_free_kernel_contexts();
+ break;
+ }
+ return ret;
+
+}
+
+int gru_kservices_init(void)
+{
+ return 0;
+}
+
+void gru_kservices_exit(void)
+{
+ if (gru_free_kernel_contexts())
+ BUG();
+}
+
diff --git a/drivers/misc/sgi-gru/grukservices.h b/drivers/misc/sgi-gru/grukservices.h
new file mode 100644
index 000000000..02aa94d84
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukservices.h
@@ -0,0 +1,214 @@
+
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef __GRU_KSERVICES_H_
+#define __GRU_KSERVICES_H_
+
+
+/*
+ * Message queues using the GRU to send/receive messages.
+ *
+ * These function allow the user to create a message queue for
+ * sending/receiving 1 or 2 cacheline messages using the GRU.
+ *
+ * Processes SENDING messages will use a kernel CBR/DSR to send
+ * the message. This is transparent to the caller.
+ *
+ * The receiver does not use any GRU resources.
+ *
+ * The functions support:
+ * - single receiver
+ * - multiple senders
+ * - cross partition message
+ *
+ * Missing features ZZZ:
+ * - user options for dealing with timeouts, queue full, etc.
+ * - gru_create_message_queue() needs interrupt vector info
+ */
+
+struct gru_message_queue_desc {
+ void *mq; /* message queue vaddress */
+ unsigned long mq_gpa; /* global address of mq */
+ int qlines; /* queue size in CL */
+ int interrupt_vector; /* interrupt vector */
+ int interrupt_pnode; /* pnode for interrupt */
+ int interrupt_apicid; /* lapicid for interrupt */
+};
+
+/*
+ * Initialize a user allocated chunk of memory to be used as
+ * a message queue. The caller must ensure that the queue is
+ * in contiguous physical memory and is cacheline aligned.
+ *
+ * Message queue size is the total number of bytes allocated
+ * to the queue including a 2 cacheline header that is used
+ * to manage the queue.
+ *
+ * Input:
+ * mqd pointer to message queue descriptor
+ * p pointer to user allocated mesq memory.
+ * bytes size of message queue in bytes
+ * vector interrupt vector (zero if no interrupts)
+ * nasid nasid of blade where interrupt is delivered
+ * apicid apicid of cpu for interrupt
+ *
+ * Errors:
+ * 0 OK
+ * >0 error
+ */
+extern int gru_create_message_queue(struct gru_message_queue_desc *mqd,
+ void *p, unsigned int bytes, int nasid, int vector, int apicid);
+
+/*
+ * Send a message to a message queue.
+ *
+ * Note: The message queue transport mechanism uses the first 32
+ * bits of the message. Users should avoid using these bits.
+ *
+ *
+ * Input:
+ * mqd pointer to message queue descriptor
+ * mesg pointer to message. Must be 64-bit aligned
+ * bytes size of message in bytes
+ *
+ * Output:
+ * 0 message sent
+ * >0 Send failure - see error codes below
+ *
+ */
+extern int gru_send_message_gpa(struct gru_message_queue_desc *mqd,
+ void *mesg, unsigned int bytes);
+
+/* Status values for gru_send_message() */
+#define MQE_OK 0 /* message sent successfully */
+#define MQE_CONGESTION 1 /* temporary congestion, try again */
+#define MQE_QUEUE_FULL 2 /* queue is full */
+#define MQE_UNEXPECTED_CB_ERR 3 /* unexpected CB error */
+#define MQE_PAGE_OVERFLOW 10 /* BUG - queue overflowed a page */
+#define MQE_BUG_NO_RESOURCES 11 /* BUG - could not alloc GRU cb/dsr */
+
+/*
+ * Advance the receive pointer for the message queue to the next message.
+ * Note: current API requires messages to be gotten & freed in order. Future
+ * API extensions may allow for out-of-order freeing.
+ *
+ * Input
+ * mqd pointer to message queue descriptor
+ * mesq message being freed
+ */
+extern void gru_free_message(struct gru_message_queue_desc *mqd,
+ void *mesq);
+
+/*
+ * Get next message from message queue. Returns pointer to
+ * message OR NULL if no message present.
+ * User must call gru_free_message() after message is processed
+ * in order to move the queue pointers to next message.
+ *
+ * Input
+ * mqd pointer to message queue descriptor
+ *
+ * Output:
+ * p pointer to message
+ * NULL no message available
+ */
+extern void *gru_get_next_message(struct gru_message_queue_desc *mqd);
+
+
+/*
+ * Read a GRU global GPA. Source can be located in a remote partition.
+ *
+ * Input:
+ * value memory address where MMR value is returned
+ * gpa source numalink physical address of GPA
+ *
+ * Output:
+ * 0 OK
+ * >0 error
+ */
+int gru_read_gpa(unsigned long *value, unsigned long gpa);
+
+
+/*
+ * Copy data using the GRU. Source or destination can be located in a remote
+ * partition.
+ *
+ * Input:
+ * dest_gpa destination global physical address
+ * src_gpa source global physical address
+ * bytes number of bytes to copy
+ *
+ * Output:
+ * 0 OK
+ * >0 error
+ */
+extern int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
+ unsigned int bytes);
+
+/*
+ * Reserve GRU resources to be used asynchronously.
+ *
+ * input:
+ * blade_id - blade on which resources should be reserved
+ * cbrs - number of CBRs
+ * dsr_bytes - number of DSR bytes needed
+ * cmp - completion structure for waiting for
+ * async completions
+ * output:
+ * handle to identify resource
+ * (0 = no resources)
+ */
+extern unsigned long gru_reserve_async_resources(int blade_id, int cbrs, int dsr_bytes,
+ struct completion *cmp);
+
+/*
+ * Release async resources previously reserved.
+ *
+ * input:
+ * han - handle to identify resources
+ */
+extern void gru_release_async_resources(unsigned long han);
+
+/*
+ * Wait for async GRU instructions to complete.
+ *
+ * input:
+ * han - handle to identify resources
+ */
+extern void gru_wait_async_cbr(unsigned long han);
+
+/*
+ * Lock previous reserved async GRU resources
+ *
+ * input:
+ * han - handle to identify resources
+ * output:
+ * cb - pointer to first CBR
+ * dsr - pointer to first DSR
+ */
+extern void gru_lock_async_resource(unsigned long han, void **cb, void **dsr);
+
+/*
+ * Unlock previous reserved async GRU resources
+ *
+ * input:
+ * han - handle to identify resources
+ */
+extern void gru_unlock_async_resource(unsigned long han);
+
+#endif /* __GRU_KSERVICES_H_ */
diff --git a/drivers/misc/sgi-gru/grulib.h b/drivers/misc/sgi-gru/grulib.h
new file mode 100644
index 000000000..e77d1b1f9
--- /dev/null
+++ b/drivers/misc/sgi-gru/grulib.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __GRULIB_H__
+#define __GRULIB_H__
+
+#define GRU_BASENAME "gru"
+#define GRU_FULLNAME "/dev/gru"
+#define GRU_IOCTL_NUM 'G'
+
+/*
+ * Maximum number of GRU segments that a user can have open
+ * ZZZ temp - set high for testing. Revisit.
+ */
+#define GRU_MAX_OPEN_CONTEXTS 32
+
+/* Set Number of Request Blocks */
+#define GRU_CREATE_CONTEXT _IOWR(GRU_IOCTL_NUM, 1, void *)
+
+/* Set Context Options */
+#define GRU_SET_CONTEXT_OPTION _IOWR(GRU_IOCTL_NUM, 4, void *)
+
+/* Fetch exception detail */
+#define GRU_USER_GET_EXCEPTION_DETAIL _IOWR(GRU_IOCTL_NUM, 6, void *)
+
+/* For user call_os handling - normally a TLB fault */
+#define GRU_USER_CALL_OS _IOWR(GRU_IOCTL_NUM, 8, void *)
+
+/* For user unload context */
+#define GRU_USER_UNLOAD_CONTEXT _IOWR(GRU_IOCTL_NUM, 9, void *)
+
+/* For dumpping GRU chiplet state */
+#define GRU_DUMP_CHIPLET_STATE _IOWR(GRU_IOCTL_NUM, 11, void *)
+
+/* For getting gseg statistics */
+#define GRU_GET_GSEG_STATISTICS _IOWR(GRU_IOCTL_NUM, 12, void *)
+
+/* For user TLB flushing (primarily for tests) */
+#define GRU_USER_FLUSH_TLB _IOWR(GRU_IOCTL_NUM, 50, void *)
+
+/* Get some config options (primarily for tests & emulator) */
+#define GRU_GET_CONFIG_INFO _IOWR(GRU_IOCTL_NUM, 51, void *)
+
+/* Various kernel self-tests */
+#define GRU_KTEST _IOWR(GRU_IOCTL_NUM, 52, void *)
+
+#define CONTEXT_WINDOW_BYTES(th) (GRU_GSEG_PAGESIZE * (th))
+#define THREAD_POINTER(p, th) (p + GRU_GSEG_PAGESIZE * (th))
+#define GSEG_START(cb) ((void *)((unsigned long)(cb) & ~(GRU_GSEG_PAGESIZE - 1)))
+
+struct gru_get_gseg_statistics_req {
+ unsigned long gseg;
+ struct gru_gseg_statistics stats;
+};
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+struct gru_create_context_req {
+ unsigned long gseg;
+ unsigned int data_segment_bytes;
+ unsigned int control_blocks;
+ unsigned int maximum_thread_count;
+ unsigned int options;
+ unsigned char tlb_preload_count;
+};
+
+/*
+ * Structure used to pass unload context parameters to the driver
+ */
+struct gru_unload_context_req {
+ unsigned long gseg;
+};
+
+/*
+ * Structure used to set context options
+ */
+enum {sco_gseg_owner, sco_cch_req_slice, sco_blade_chiplet};
+struct gru_set_context_option_req {
+ unsigned long gseg;
+ int op;
+ int val0;
+ long val1;
+};
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+struct gru_flush_tlb_req {
+ unsigned long gseg;
+ unsigned long vaddr;
+ size_t len;
+};
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+enum {dcs_pid, dcs_gid};
+struct gru_dump_chiplet_state_req {
+ unsigned int op;
+ unsigned int gid;
+ int ctxnum;
+ char data_opt;
+ char lock_cch;
+ char flush_cbrs;
+ char fill[10];
+ pid_t pid;
+ void *buf;
+ size_t buflen;
+ /* ---- output --- */
+ unsigned int num_contexts;
+};
+
+#define GRU_DUMP_MAGIC 0x3474ab6c
+struct gru_dump_context_header {
+ unsigned int magic;
+ unsigned int gid;
+ unsigned char ctxnum;
+ unsigned char cbrcnt;
+ unsigned char dsrcnt;
+ pid_t pid;
+ unsigned long vaddr;
+ int cch_locked;
+ unsigned long data[0];
+};
+
+/*
+ * GRU configuration info (temp - for testing)
+ */
+struct gru_config_info {
+ int cpus;
+ int blades;
+ int nodes;
+ int chiplets;
+ int fill[16];
+};
+
+#endif /* __GRULIB_H__ */
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
new file mode 100644
index 000000000..ab174f28e
--- /dev/null
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -0,0 +1,976 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * DRIVER TABLE MANAGER + GRU CONTEXT LOAD/UNLOAD
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/prefetch.h>
+#include <asm/uv/uv_hub.h>
+#include "gru.h"
+#include "grutables.h"
+#include "gruhandles.h"
+
+unsigned long gru_options __read_mostly;
+
+static struct device_driver gru_driver = {
+ .name = "gru"
+};
+
+static struct device gru_device = {
+ .init_name = "",
+ .driver = &gru_driver,
+};
+
+struct device *grudev = &gru_device;
+
+/*
+ * Select a gru fault map to be used by the current cpu. Note that
+ * multiple cpus may be using the same map.
+ * ZZZ should be inline but did not work on emulator
+ */
+int gru_cpu_fault_map_id(void)
+{
+#ifdef CONFIG_IA64
+ return uv_blade_processor_id() % GRU_NUM_TFM;
+#else
+ int cpu = smp_processor_id();
+ int id, core;
+
+ core = uv_cpu_core_number(cpu);
+ id = core + UV_MAX_INT_CORES * uv_cpu_socket_number(cpu);
+ return id;
+#endif
+}
+
+/*--------- ASID Management -------------------------------------------
+ *
+ * Initially, assign asids sequentially from MIN_ASID .. MAX_ASID.
+ * Once MAX is reached, flush the TLB & start over. However,
+ * some asids may still be in use. There won't be many (percentage wise) still
+ * in use. Search active contexts & determine the value of the first
+ * asid in use ("x"s below). Set "limit" to this value.
+ * This defines a block of assignable asids.
+ *
+ * When "limit" is reached, search forward from limit+1 and determine the
+ * next block of assignable asids.
+ *
+ * Repeat until MAX_ASID is reached, then start over again.
+ *
+ * Each time MAX_ASID is reached, increment the asid generation. Since
+ * the search for in-use asids only checks contexts with GRUs currently
+ * assigned, asids in some contexts will be missed. Prior to loading
+ * a context, the asid generation of the GTS asid is rechecked. If it
+ * doesn't match the current generation, a new asid will be assigned.
+ *
+ * 0---------------x------------x---------------------x----|
+ * ^-next ^-limit ^-MAX_ASID
+ *
+ * All asid manipulation & context loading/unloading is protected by the
+ * gs_lock.
+ */
+
+/* Hit the asid limit. Start over */
+static int gru_wrap_asid(struct gru_state *gru)
+{
+ gru_dbg(grudev, "gid %d\n", gru->gs_gid);
+ STAT(asid_wrap);
+ gru->gs_asid_gen++;
+ return MIN_ASID;
+}
+
+/* Find the next chunk of unused asids */
+static int gru_reset_asid_limit(struct gru_state *gru, int asid)
+{
+ int i, gid, inuse_asid, limit;
+
+ gru_dbg(grudev, "gid %d, asid 0x%x\n", gru->gs_gid, asid);
+ STAT(asid_next);
+ limit = MAX_ASID;
+ if (asid >= limit)
+ asid = gru_wrap_asid(gru);
+ gru_flush_all_tlb(gru);
+ gid = gru->gs_gid;
+again:
+ for (i = 0; i < GRU_NUM_CCH; i++) {
+ if (!gru->gs_gts[i] || is_kernel_context(gru->gs_gts[i]))
+ continue;
+ inuse_asid = gru->gs_gts[i]->ts_gms->ms_asids[gid].mt_asid;
+ gru_dbg(grudev, "gid %d, gts %p, gms %p, inuse 0x%x, cxt %d\n",
+ gru->gs_gid, gru->gs_gts[i], gru->gs_gts[i]->ts_gms,
+ inuse_asid, i);
+ if (inuse_asid == asid) {
+ asid += ASID_INC;
+ if (asid >= limit) {
+ /*
+ * empty range: reset the range limit and
+ * start over
+ */
+ limit = MAX_ASID;
+ if (asid >= MAX_ASID)
+ asid = gru_wrap_asid(gru);
+ goto again;
+ }
+ }
+
+ if ((inuse_asid > asid) && (inuse_asid < limit))
+ limit = inuse_asid;
+ }
+ gru->gs_asid_limit = limit;
+ gru->gs_asid = asid;
+ gru_dbg(grudev, "gid %d, new asid 0x%x, new_limit 0x%x\n", gru->gs_gid,
+ asid, limit);
+ return asid;
+}
+
+/* Assign a new ASID to a thread context. */
+static int gru_assign_asid(struct gru_state *gru)
+{
+ int asid;
+
+ gru->gs_asid += ASID_INC;
+ asid = gru->gs_asid;
+ if (asid >= gru->gs_asid_limit)
+ asid = gru_reset_asid_limit(gru, asid);
+
+ gru_dbg(grudev, "gid %d, asid 0x%x\n", gru->gs_gid, asid);
+ return asid;
+}
+
+/*
+ * Clear n bits in a word. Return a word indicating the bits that were cleared.
+ * Optionally, build an array of chars that contain the bit numbers allocated.
+ */
+static unsigned long reserve_resources(unsigned long *p, int n, int mmax,
+ char *idx)
+{
+ unsigned long bits = 0;
+ int i;
+
+ while (n--) {
+ i = find_first_bit(p, mmax);
+ if (i == mmax)
+ BUG();
+ __clear_bit(i, p);
+ __set_bit(i, &bits);
+ if (idx)
+ *idx++ = i;
+ }
+ return bits;
+}
+
+unsigned long gru_reserve_cb_resources(struct gru_state *gru, int cbr_au_count,
+ char *cbmap)
+{
+ return reserve_resources(&gru->gs_cbr_map, cbr_au_count, GRU_CBR_AU,
+ cbmap);
+}
+
+unsigned long gru_reserve_ds_resources(struct gru_state *gru, int dsr_au_count,
+ char *dsmap)
+{
+ return reserve_resources(&gru->gs_dsr_map, dsr_au_count, GRU_DSR_AU,
+ dsmap);
+}
+
+static void reserve_gru_resources(struct gru_state *gru,
+ struct gru_thread_state *gts)
+{
+ gru->gs_active_contexts++;
+ gts->ts_cbr_map =
+ gru_reserve_cb_resources(gru, gts->ts_cbr_au_count,
+ gts->ts_cbr_idx);
+ gts->ts_dsr_map =
+ gru_reserve_ds_resources(gru, gts->ts_dsr_au_count, NULL);
+}
+
+static void free_gru_resources(struct gru_state *gru,
+ struct gru_thread_state *gts)
+{
+ gru->gs_active_contexts--;
+ gru->gs_cbr_map |= gts->ts_cbr_map;
+ gru->gs_dsr_map |= gts->ts_dsr_map;
+}
+
+/*
+ * Check if a GRU has sufficient free resources to satisfy an allocation
+ * request. Note: GRU locks may or may not be held when this is called. If
+ * not held, recheck after acquiring the appropriate locks.
+ *
+ * Returns 1 if sufficient resources, 0 if not
+ */
+static int check_gru_resources(struct gru_state *gru, int cbr_au_count,
+ int dsr_au_count, int max_active_contexts)
+{
+ return hweight64(gru->gs_cbr_map) >= cbr_au_count
+ && hweight64(gru->gs_dsr_map) >= dsr_au_count
+ && gru->gs_active_contexts < max_active_contexts;
+}
+
+/*
+ * TLB manangment requires tracking all GRU chiplets that have loaded a GSEG
+ * context.
+ */
+static int gru_load_mm_tracker(struct gru_state *gru,
+ struct gru_thread_state *gts)
+{
+ struct gru_mm_struct *gms = gts->ts_gms;
+ struct gru_mm_tracker *asids = &gms->ms_asids[gru->gs_gid];
+ unsigned short ctxbitmap = (1 << gts->ts_ctxnum);
+ int asid;
+
+ spin_lock(&gms->ms_asid_lock);
+ asid = asids->mt_asid;
+
+ spin_lock(&gru->gs_asid_lock);
+ if (asid == 0 || (asids->mt_ctxbitmap == 0 && asids->mt_asid_gen !=
+ gru->gs_asid_gen)) {
+ asid = gru_assign_asid(gru);
+ asids->mt_asid = asid;
+ asids->mt_asid_gen = gru->gs_asid_gen;
+ STAT(asid_new);
+ } else {
+ STAT(asid_reuse);
+ }
+ spin_unlock(&gru->gs_asid_lock);
+
+ BUG_ON(asids->mt_ctxbitmap & ctxbitmap);
+ asids->mt_ctxbitmap |= ctxbitmap;
+ if (!test_bit(gru->gs_gid, gms->ms_asidmap))
+ __set_bit(gru->gs_gid, gms->ms_asidmap);
+ spin_unlock(&gms->ms_asid_lock);
+
+ gru_dbg(grudev,
+ "gid %d, gts %p, gms %p, ctxnum %d, asid 0x%x, asidmap 0x%lx\n",
+ gru->gs_gid, gts, gms, gts->ts_ctxnum, asid,
+ gms->ms_asidmap[0]);
+ return asid;
+}
+
+static void gru_unload_mm_tracker(struct gru_state *gru,
+ struct gru_thread_state *gts)
+{
+ struct gru_mm_struct *gms = gts->ts_gms;
+ struct gru_mm_tracker *asids;
+ unsigned short ctxbitmap;
+
+ asids = &gms->ms_asids[gru->gs_gid];
+ ctxbitmap = (1 << gts->ts_ctxnum);
+ spin_lock(&gms->ms_asid_lock);
+ spin_lock(&gru->gs_asid_lock);
+ BUG_ON((asids->mt_ctxbitmap & ctxbitmap) != ctxbitmap);
+ asids->mt_ctxbitmap ^= ctxbitmap;
+ gru_dbg(grudev, "gid %d, gts %p, gms %p, ctxnum %d, asidmap 0x%lx\n",
+ gru->gs_gid, gts, gms, gts->ts_ctxnum, gms->ms_asidmap[0]);
+ spin_unlock(&gru->gs_asid_lock);
+ spin_unlock(&gms->ms_asid_lock);
+}
+
+/*
+ * Decrement the reference count on a GTS structure. Free the structure
+ * if the reference count goes to zero.
+ */
+void gts_drop(struct gru_thread_state *gts)
+{
+ if (gts && atomic_dec_return(&gts->ts_refcnt) == 0) {
+ if (gts->ts_gms)
+ gru_drop_mmu_notifier(gts->ts_gms);
+ kfree(gts);
+ STAT(gts_free);
+ }
+}
+
+/*
+ * Locate the GTS structure for the current thread.
+ */
+static struct gru_thread_state *gru_find_current_gts_nolock(struct gru_vma_data
+ *vdata, int tsid)
+{
+ struct gru_thread_state *gts;
+
+ list_for_each_entry(gts, &vdata->vd_head, ts_next)
+ if (gts->ts_tsid == tsid)
+ return gts;
+ return NULL;
+}
+
+/*
+ * Allocate a thread state structure.
+ */
+struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
+ int cbr_au_count, int dsr_au_count,
+ unsigned char tlb_preload_count, int options, int tsid)
+{
+ struct gru_thread_state *gts;
+ struct gru_mm_struct *gms;
+ int bytes;
+
+ bytes = DSR_BYTES(dsr_au_count) + CBR_BYTES(cbr_au_count);
+ bytes += sizeof(struct gru_thread_state);
+ gts = kmalloc(bytes, GFP_KERNEL);
+ if (!gts)
+ return ERR_PTR(-ENOMEM);
+
+ STAT(gts_alloc);
+ memset(gts, 0, sizeof(struct gru_thread_state)); /* zero out header */
+ atomic_set(&gts->ts_refcnt, 1);
+ mutex_init(&gts->ts_ctxlock);
+ gts->ts_cbr_au_count = cbr_au_count;
+ gts->ts_dsr_au_count = dsr_au_count;
+ gts->ts_tlb_preload_count = tlb_preload_count;
+ gts->ts_user_options = options;
+ gts->ts_user_blade_id = -1;
+ gts->ts_user_chiplet_id = -1;
+ gts->ts_tsid = tsid;
+ gts->ts_ctxnum = NULLCTX;
+ gts->ts_tlb_int_select = -1;
+ gts->ts_cch_req_slice = -1;
+ gts->ts_sizeavail = GRU_SIZEAVAIL(PAGE_SHIFT);
+ if (vma) {
+ gts->ts_mm = current->mm;
+ gts->ts_vma = vma;
+ gms = gru_register_mmu_notifier();
+ if (IS_ERR(gms))
+ goto err;
+ gts->ts_gms = gms;
+ }
+
+ gru_dbg(grudev, "alloc gts %p\n", gts);
+ return gts;
+
+err:
+ gts_drop(gts);
+ return ERR_CAST(gms);
+}
+
+/*
+ * Allocate a vma private data structure.
+ */
+struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma, int tsid)
+{
+ struct gru_vma_data *vdata = NULL;
+
+ vdata = kmalloc(sizeof(*vdata), GFP_KERNEL);
+ if (!vdata)
+ return NULL;
+
+ STAT(vdata_alloc);
+ INIT_LIST_HEAD(&vdata->vd_head);
+ spin_lock_init(&vdata->vd_lock);
+ gru_dbg(grudev, "alloc vdata %p\n", vdata);
+ return vdata;
+}
+
+/*
+ * Find the thread state structure for the current thread.
+ */
+struct gru_thread_state *gru_find_thread_state(struct vm_area_struct *vma,
+ int tsid)
+{
+ struct gru_vma_data *vdata = vma->vm_private_data;
+ struct gru_thread_state *gts;
+
+ spin_lock(&vdata->vd_lock);
+ gts = gru_find_current_gts_nolock(vdata, tsid);
+ spin_unlock(&vdata->vd_lock);
+ gru_dbg(grudev, "vma %p, gts %p\n", vma, gts);
+ return gts;
+}
+
+/*
+ * Allocate a new thread state for a GSEG. Note that races may allow
+ * another thread to race to create a gts.
+ */
+struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct *vma,
+ int tsid)
+{
+ struct gru_vma_data *vdata = vma->vm_private_data;
+ struct gru_thread_state *gts, *ngts;
+
+ gts = gru_alloc_gts(vma, vdata->vd_cbr_au_count,
+ vdata->vd_dsr_au_count,
+ vdata->vd_tlb_preload_count,
+ vdata->vd_user_options, tsid);
+ if (IS_ERR(gts))
+ return gts;
+
+ spin_lock(&vdata->vd_lock);
+ ngts = gru_find_current_gts_nolock(vdata, tsid);
+ if (ngts) {
+ gts_drop(gts);
+ gts = ngts;
+ STAT(gts_double_allocate);
+ } else {
+ list_add(&gts->ts_next, &vdata->vd_head);
+ }
+ spin_unlock(&vdata->vd_lock);
+ gru_dbg(grudev, "vma %p, gts %p\n", vma, gts);
+ return gts;
+}
+
+/*
+ * Free the GRU context assigned to the thread state.
+ */
+static void gru_free_gru_context(struct gru_thread_state *gts)
+{
+ struct gru_state *gru;
+
+ gru = gts->ts_gru;
+ gru_dbg(grudev, "gts %p, gid %d\n", gts, gru->gs_gid);
+
+ spin_lock(&gru->gs_lock);
+ gru->gs_gts[gts->ts_ctxnum] = NULL;
+ free_gru_resources(gru, gts);
+ BUG_ON(test_bit(gts->ts_ctxnum, &gru->gs_context_map) == 0);
+ __clear_bit(gts->ts_ctxnum, &gru->gs_context_map);
+ gts->ts_ctxnum = NULLCTX;
+ gts->ts_gru = NULL;
+ gts->ts_blade = -1;
+ spin_unlock(&gru->gs_lock);
+
+ gts_drop(gts);
+ STAT(free_context);
+}
+
+/*
+ * Prefetching cachelines help hardware performance.
+ * (Strictly a performance enhancement. Not functionally required).
+ */
+static void prefetch_data(void *p, int num, int stride)
+{
+ while (num-- > 0) {
+ prefetchw(p);
+ p += stride;
+ }
+}
+
+static inline long gru_copy_handle(void *d, void *s)
+{
+ memcpy(d, s, GRU_HANDLE_BYTES);
+ return GRU_HANDLE_BYTES;
+}
+
+static void gru_prefetch_context(void *gseg, void *cb, void *cbe,
+ unsigned long cbrmap, unsigned long length)
+{
+ int i, scr;
+
+ prefetch_data(gseg + GRU_DS_BASE, length / GRU_CACHE_LINE_BYTES,
+ GRU_CACHE_LINE_BYTES);
+
+ for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+ prefetch_data(cb, 1, GRU_CACHE_LINE_BYTES);
+ prefetch_data(cbe + i * GRU_HANDLE_STRIDE, 1,
+ GRU_CACHE_LINE_BYTES);
+ cb += GRU_HANDLE_STRIDE;
+ }
+}
+
+static void gru_load_context_data(void *save, void *grubase, int ctxnum,
+ unsigned long cbrmap, unsigned long dsrmap,
+ int data_valid)
+{
+ void *gseg, *cb, *cbe;
+ unsigned long length;
+ int i, scr;
+
+ gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+ cb = gseg + GRU_CB_BASE;
+ cbe = grubase + GRU_CBE_BASE;
+ length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
+ gru_prefetch_context(gseg, cb, cbe, cbrmap, length);
+
+ for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+ if (data_valid) {
+ save += gru_copy_handle(cb, save);
+ save += gru_copy_handle(cbe + i * GRU_HANDLE_STRIDE,
+ save);
+ } else {
+ memset(cb, 0, GRU_CACHE_LINE_BYTES);
+ memset(cbe + i * GRU_HANDLE_STRIDE, 0,
+ GRU_CACHE_LINE_BYTES);
+ }
+ /* Flush CBE to hide race in context restart */
+ mb();
+ gru_flush_cache(cbe + i * GRU_HANDLE_STRIDE);
+ cb += GRU_HANDLE_STRIDE;
+ }
+
+ if (data_valid)
+ memcpy(gseg + GRU_DS_BASE, save, length);
+ else
+ memset(gseg + GRU_DS_BASE, 0, length);
+}
+
+static void gru_unload_context_data(void *save, void *grubase, int ctxnum,
+ unsigned long cbrmap, unsigned long dsrmap)
+{
+ void *gseg, *cb, *cbe;
+ unsigned long length;
+ int i, scr;
+
+ gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+ cb = gseg + GRU_CB_BASE;
+ cbe = grubase + GRU_CBE_BASE;
+ length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
+
+ /* CBEs may not be coherent. Flush them from cache */
+ for_each_cbr_in_allocation_map(i, &cbrmap, scr)
+ gru_flush_cache(cbe + i * GRU_HANDLE_STRIDE);
+ mb(); /* Let the CL flush complete */
+
+ gru_prefetch_context(gseg, cb, cbe, cbrmap, length);
+
+ for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+ save += gru_copy_handle(save, cb);
+ save += gru_copy_handle(save, cbe + i * GRU_HANDLE_STRIDE);
+ cb += GRU_HANDLE_STRIDE;
+ }
+ memcpy(save, gseg + GRU_DS_BASE, length);
+}
+
+void gru_unload_context(struct gru_thread_state *gts, int savestate)
+{
+ struct gru_state *gru = gts->ts_gru;
+ struct gru_context_configuration_handle *cch;
+ int ctxnum = gts->ts_ctxnum;
+
+ if (!is_kernel_context(gts))
+ zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE);
+ cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
+
+ gru_dbg(grudev, "gts %p, cbrmap 0x%lx, dsrmap 0x%lx\n",
+ gts, gts->ts_cbr_map, gts->ts_dsr_map);
+ lock_cch_handle(cch);
+ if (cch_interrupt_sync(cch))
+ BUG();
+
+ if (!is_kernel_context(gts))
+ gru_unload_mm_tracker(gru, gts);
+ if (savestate) {
+ gru_unload_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr,
+ ctxnum, gts->ts_cbr_map,
+ gts->ts_dsr_map);
+ gts->ts_data_valid = 1;
+ }
+
+ if (cch_deallocate(cch))
+ BUG();
+ unlock_cch_handle(cch);
+
+ gru_free_gru_context(gts);
+}
+
+/*
+ * Load a GRU context by copying it from the thread data structure in memory
+ * to the GRU.
+ */
+void gru_load_context(struct gru_thread_state *gts)
+{
+ struct gru_state *gru = gts->ts_gru;
+ struct gru_context_configuration_handle *cch;
+ int i, err, asid, ctxnum = gts->ts_ctxnum;
+
+ cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
+ lock_cch_handle(cch);
+ cch->tfm_fault_bit_enable =
+ (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
+ || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+ cch->tlb_int_enable = (gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+ if (cch->tlb_int_enable) {
+ gts->ts_tlb_int_select = gru_cpu_fault_map_id();
+ cch->tlb_int_select = gts->ts_tlb_int_select;
+ }
+ if (gts->ts_cch_req_slice >= 0) {
+ cch->req_slice_set_enable = 1;
+ cch->req_slice = gts->ts_cch_req_slice;
+ } else {
+ cch->req_slice_set_enable =0;
+ }
+ cch->tfm_done_bit_enable = 0;
+ cch->dsr_allocation_map = gts->ts_dsr_map;
+ cch->cbr_allocation_map = gts->ts_cbr_map;
+
+ if (is_kernel_context(gts)) {
+ cch->unmap_enable = 1;
+ cch->tfm_done_bit_enable = 1;
+ cch->cb_int_enable = 1;
+ cch->tlb_int_select = 0; /* For now, ints go to cpu 0 */
+ } else {
+ cch->unmap_enable = 0;
+ cch->tfm_done_bit_enable = 0;
+ cch->cb_int_enable = 0;
+ asid = gru_load_mm_tracker(gru, gts);
+ for (i = 0; i < 8; i++) {
+ cch->asid[i] = asid + i;
+ cch->sizeavail[i] = gts->ts_sizeavail;
+ }
+ }
+
+ err = cch_allocate(cch);
+ if (err) {
+ gru_dbg(grudev,
+ "err %d: cch %p, gts %p, cbr 0x%lx, dsr 0x%lx\n",
+ err, cch, gts, gts->ts_cbr_map, gts->ts_dsr_map);
+ BUG();
+ }
+
+ gru_load_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr, ctxnum,
+ gts->ts_cbr_map, gts->ts_dsr_map, gts->ts_data_valid);
+
+ if (cch_start(cch))
+ BUG();
+ unlock_cch_handle(cch);
+
+ gru_dbg(grudev, "gid %d, gts %p, cbrmap 0x%lx, dsrmap 0x%lx, tie %d, tis %d\n",
+ gts->ts_gru->gs_gid, gts, gts->ts_cbr_map, gts->ts_dsr_map,
+ (gts->ts_user_options == GRU_OPT_MISS_FMM_INTR), gts->ts_tlb_int_select);
+}
+
+/*
+ * Update fields in an active CCH:
+ * - retarget interrupts on local blade
+ * - update sizeavail mask
+ */
+int gru_update_cch(struct gru_thread_state *gts)
+{
+ struct gru_context_configuration_handle *cch;
+ struct gru_state *gru = gts->ts_gru;
+ int i, ctxnum = gts->ts_ctxnum, ret = 0;
+
+ cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
+
+ lock_cch_handle(cch);
+ if (cch->state == CCHSTATE_ACTIVE) {
+ if (gru->gs_gts[gts->ts_ctxnum] != gts)
+ goto exit;
+ if (cch_interrupt(cch))
+ BUG();
+ for (i = 0; i < 8; i++)
+ cch->sizeavail[i] = gts->ts_sizeavail;
+ gts->ts_tlb_int_select = gru_cpu_fault_map_id();
+ cch->tlb_int_select = gru_cpu_fault_map_id();
+ cch->tfm_fault_bit_enable =
+ (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
+ || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+ if (cch_start(cch))
+ BUG();
+ ret = 1;
+ }
+exit:
+ unlock_cch_handle(cch);
+ return ret;
+}
+
+/*
+ * Update CCH tlb interrupt select. Required when all the following is true:
+ * - task's GRU context is loaded into a GRU
+ * - task is using interrupt notification for TLB faults
+ * - task has migrated to a different cpu on the same blade where
+ * it was previously running.
+ */
+static int gru_retarget_intr(struct gru_thread_state *gts)
+{
+ if (gts->ts_tlb_int_select < 0
+ || gts->ts_tlb_int_select == gru_cpu_fault_map_id())
+ return 0;
+
+ gru_dbg(grudev, "retarget from %d to %d\n", gts->ts_tlb_int_select,
+ gru_cpu_fault_map_id());
+ return gru_update_cch(gts);
+}
+
+/*
+ * Check if a GRU context is allowed to use a specific chiplet. By default
+ * a context is assigned to any blade-local chiplet. However, users can
+ * override this.
+ * Returns 1 if assignment allowed, 0 otherwise
+ */
+static int gru_check_chiplet_assignment(struct gru_state *gru,
+ struct gru_thread_state *gts)
+{
+ int blade_id;
+ int chiplet_id;
+
+ blade_id = gts->ts_user_blade_id;
+ if (blade_id < 0)
+ blade_id = uv_numa_blade_id();
+
+ chiplet_id = gts->ts_user_chiplet_id;
+ return gru->gs_blade_id == blade_id &&
+ (chiplet_id < 0 || chiplet_id == gru->gs_chiplet_id);
+}
+
+/*
+ * Unload the gru context if it is not assigned to the correct blade or
+ * chiplet. Misassignment can occur if the process migrates to a different
+ * blade or if the user changes the selected blade/chiplet.
+ */
+void gru_check_context_placement(struct gru_thread_state *gts)
+{
+ struct gru_state *gru;
+
+ /*
+ * If the current task is the context owner, verify that the
+ * context is correctly placed. This test is skipped for non-owner
+ * references. Pthread apps use non-owner references to the CBRs.
+ */
+ gru = gts->ts_gru;
+ if (!gru || gts->ts_tgid_owner != current->tgid)
+ return;
+
+ if (!gru_check_chiplet_assignment(gru, gts)) {
+ STAT(check_context_unload);
+ gru_unload_context(gts, 1);
+ } else if (gru_retarget_intr(gts)) {
+ STAT(check_context_retarget_intr);
+ }
+}
+
+
+/*
+ * Insufficient GRU resources available on the local blade. Steal a context from
+ * a process. This is a hack until a _real_ resource scheduler is written....
+ */
+#define next_ctxnum(n) ((n) < GRU_NUM_CCH - 2 ? (n) + 1 : 0)
+#define next_gru(b, g) (((g) < &(b)->bs_grus[GRU_CHIPLETS_PER_BLADE - 1]) ? \
+ ((g)+1) : &(b)->bs_grus[0])
+
+static int is_gts_stealable(struct gru_thread_state *gts,
+ struct gru_blade_state *bs)
+{
+ if (is_kernel_context(gts))
+ return down_write_trylock(&bs->bs_kgts_sema);
+ else
+ return mutex_trylock(&gts->ts_ctxlock);
+}
+
+static void gts_stolen(struct gru_thread_state *gts,
+ struct gru_blade_state *bs)
+{
+ if (is_kernel_context(gts)) {
+ up_write(&bs->bs_kgts_sema);
+ STAT(steal_kernel_context);
+ } else {
+ mutex_unlock(&gts->ts_ctxlock);
+ STAT(steal_user_context);
+ }
+}
+
+void gru_steal_context(struct gru_thread_state *gts)
+{
+ struct gru_blade_state *blade;
+ struct gru_state *gru, *gru0;
+ struct gru_thread_state *ngts = NULL;
+ int ctxnum, ctxnum0, flag = 0, cbr, dsr;
+ int blade_id;
+
+ blade_id = gts->ts_user_blade_id;
+ if (blade_id < 0)
+ blade_id = uv_numa_blade_id();
+ cbr = gts->ts_cbr_au_count;
+ dsr = gts->ts_dsr_au_count;
+
+ blade = gru_base[blade_id];
+ spin_lock(&blade->bs_lock);
+
+ ctxnum = next_ctxnum(blade->bs_lru_ctxnum);
+ gru = blade->bs_lru_gru;
+ if (ctxnum == 0)
+ gru = next_gru(blade, gru);
+ blade->bs_lru_gru = gru;
+ blade->bs_lru_ctxnum = ctxnum;
+ ctxnum0 = ctxnum;
+ gru0 = gru;
+ while (1) {
+ if (gru_check_chiplet_assignment(gru, gts)) {
+ if (check_gru_resources(gru, cbr, dsr, GRU_NUM_CCH))
+ break;
+ spin_lock(&gru->gs_lock);
+ for (; ctxnum < GRU_NUM_CCH; ctxnum++) {
+ if (flag && gru == gru0 && ctxnum == ctxnum0)
+ break;
+ ngts = gru->gs_gts[ctxnum];
+ /*
+ * We are grabbing locks out of order, so trylock is
+ * needed. GTSs are usually not locked, so the odds of
+ * success are high. If trylock fails, try to steal a
+ * different GSEG.
+ */
+ if (ngts && is_gts_stealable(ngts, blade))
+ break;
+ ngts = NULL;
+ }
+ spin_unlock(&gru->gs_lock);
+ if (ngts || (flag && gru == gru0 && ctxnum == ctxnum0))
+ break;
+ }
+ if (flag && gru == gru0)
+ break;
+ flag = 1;
+ ctxnum = 0;
+ gru = next_gru(blade, gru);
+ }
+ spin_unlock(&blade->bs_lock);
+
+ if (ngts) {
+ gts->ustats.context_stolen++;
+ ngts->ts_steal_jiffies = jiffies;
+ gru_unload_context(ngts, is_kernel_context(ngts) ? 0 : 1);
+ gts_stolen(ngts, blade);
+ } else {
+ STAT(steal_context_failed);
+ }
+ gru_dbg(grudev,
+ "stole gid %d, ctxnum %d from gts %p. Need cb %d, ds %d;"
+ " avail cb %ld, ds %ld\n",
+ gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
+ hweight64(gru->gs_dsr_map));
+}
+
+/*
+ * Assign a gru context.
+ */
+static int gru_assign_context_number(struct gru_state *gru)
+{
+ int ctxnum;
+
+ ctxnum = find_first_zero_bit(&gru->gs_context_map, GRU_NUM_CCH);
+ __set_bit(ctxnum, &gru->gs_context_map);
+ return ctxnum;
+}
+
+/*
+ * Scan the GRUs on the local blade & assign a GRU context.
+ */
+struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts)
+{
+ struct gru_state *gru, *grux;
+ int i, max_active_contexts;
+ int blade_id = gts->ts_user_blade_id;
+
+ if (blade_id < 0)
+ blade_id = uv_numa_blade_id();
+again:
+ gru = NULL;
+ max_active_contexts = GRU_NUM_CCH;
+ for_each_gru_on_blade(grux, blade_id, i) {
+ if (!gru_check_chiplet_assignment(grux, gts))
+ continue;
+ if (check_gru_resources(grux, gts->ts_cbr_au_count,
+ gts->ts_dsr_au_count,
+ max_active_contexts)) {
+ gru = grux;
+ max_active_contexts = grux->gs_active_contexts;
+ if (max_active_contexts == 0)
+ break;
+ }
+ }
+
+ if (gru) {
+ spin_lock(&gru->gs_lock);
+ if (!check_gru_resources(gru, gts->ts_cbr_au_count,
+ gts->ts_dsr_au_count, GRU_NUM_CCH)) {
+ spin_unlock(&gru->gs_lock);
+ goto again;
+ }
+ reserve_gru_resources(gru, gts);
+ gts->ts_gru = gru;
+ gts->ts_blade = gru->gs_blade_id;
+ gts->ts_ctxnum = gru_assign_context_number(gru);
+ atomic_inc(&gts->ts_refcnt);
+ gru->gs_gts[gts->ts_ctxnum] = gts;
+ spin_unlock(&gru->gs_lock);
+
+ STAT(assign_context);
+ gru_dbg(grudev,
+ "gseg %p, gts %p, gid %d, ctx %d, cbr %d, dsr %d\n",
+ gseg_virtual_address(gts->ts_gru, gts->ts_ctxnum), gts,
+ gts->ts_gru->gs_gid, gts->ts_ctxnum,
+ gts->ts_cbr_au_count, gts->ts_dsr_au_count);
+ } else {
+ gru_dbg(grudev, "failed to allocate a GTS %s\n", "");
+ STAT(assign_context_failed);
+ }
+
+ return gru;
+}
+
+/*
+ * gru_nopage
+ *
+ * Map the user's GRU segment
+ *
+ * Note: gru segments alway mmaped on GRU_GSEG_PAGESIZE boundaries.
+ */
+vm_fault_t gru_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct gru_thread_state *gts;
+ unsigned long paddr, vaddr;
+ unsigned long expires;
+
+ vaddr = vmf->address;
+ gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
+ vma, vaddr, GSEG_BASE(vaddr));
+ STAT(nopfn);
+
+ /* The following check ensures vaddr is a valid address in the VMA */
+ gts = gru_find_thread_state(vma, TSID(vaddr, vma));
+ if (!gts)
+ return VM_FAULT_SIGBUS;
+
+again:
+ mutex_lock(&gts->ts_ctxlock);
+ preempt_disable();
+
+ gru_check_context_placement(gts);
+
+ if (!gts->ts_gru) {
+ STAT(load_user_context);
+ if (!gru_assign_gru_context(gts)) {
+ preempt_enable();
+ mutex_unlock(&gts->ts_ctxlock);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(GRU_ASSIGN_DELAY); /* true hack ZZZ */
+ expires = gts->ts_steal_jiffies + GRU_STEAL_DELAY;
+ if (time_before(expires, jiffies))
+ gru_steal_context(gts);
+ goto again;
+ }
+ gru_load_context(gts);
+ paddr = gseg_physical_address(gts->ts_gru, gts->ts_ctxnum);
+ remap_pfn_range(vma, vaddr & ~(GRU_GSEG_PAGESIZE - 1),
+ paddr >> PAGE_SHIFT, GRU_GSEG_PAGESIZE,
+ vma->vm_page_prot);
+ }
+
+ preempt_enable();
+ mutex_unlock(&gts->ts_ctxlock);
+
+ return VM_FAULT_NOPAGE;
+}
+
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
new file mode 100644
index 000000000..42ea2ecca
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -0,0 +1,324 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * PROC INTERFACES
+ *
+ * This file supports the /proc interfaces for the GRU driver
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/device.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+
+#define printstat(s, f) printstat_val(s, &gru_stats.f, #f)
+
+static void printstat_val(struct seq_file *s, atomic_long_t *v, char *id)
+{
+ unsigned long val = atomic_long_read(v);
+
+ seq_printf(s, "%16lu %s\n", val, id);
+}
+
+static int statistics_show(struct seq_file *s, void *p)
+{
+ printstat(s, vdata_alloc);
+ printstat(s, vdata_free);
+ printstat(s, gts_alloc);
+ printstat(s, gts_free);
+ printstat(s, gms_alloc);
+ printstat(s, gms_free);
+ printstat(s, gts_double_allocate);
+ printstat(s, assign_context);
+ printstat(s, assign_context_failed);
+ printstat(s, free_context);
+ printstat(s, load_user_context);
+ printstat(s, load_kernel_context);
+ printstat(s, lock_kernel_context);
+ printstat(s, unlock_kernel_context);
+ printstat(s, steal_user_context);
+ printstat(s, steal_kernel_context);
+ printstat(s, steal_context_failed);
+ printstat(s, nopfn);
+ printstat(s, asid_new);
+ printstat(s, asid_next);
+ printstat(s, asid_wrap);
+ printstat(s, asid_reuse);
+ printstat(s, intr);
+ printstat(s, intr_cbr);
+ printstat(s, intr_tfh);
+ printstat(s, intr_spurious);
+ printstat(s, intr_mm_lock_failed);
+ printstat(s, call_os);
+ printstat(s, call_os_wait_queue);
+ printstat(s, user_flush_tlb);
+ printstat(s, user_unload_context);
+ printstat(s, user_exception);
+ printstat(s, set_context_option);
+ printstat(s, check_context_retarget_intr);
+ printstat(s, check_context_unload);
+ printstat(s, tlb_dropin);
+ printstat(s, tlb_preload_page);
+ printstat(s, tlb_dropin_fail_no_asid);
+ printstat(s, tlb_dropin_fail_upm);
+ printstat(s, tlb_dropin_fail_invalid);
+ printstat(s, tlb_dropin_fail_range_active);
+ printstat(s, tlb_dropin_fail_idle);
+ printstat(s, tlb_dropin_fail_fmm);
+ printstat(s, tlb_dropin_fail_no_exception);
+ printstat(s, tfh_stale_on_fault);
+ printstat(s, mmu_invalidate_range);
+ printstat(s, mmu_invalidate_page);
+ printstat(s, flush_tlb);
+ printstat(s, flush_tlb_gru);
+ printstat(s, flush_tlb_gru_tgh);
+ printstat(s, flush_tlb_gru_zero_asid);
+ printstat(s, copy_gpa);
+ printstat(s, read_gpa);
+ printstat(s, mesq_receive);
+ printstat(s, mesq_receive_none);
+ printstat(s, mesq_send);
+ printstat(s, mesq_send_failed);
+ printstat(s, mesq_noop);
+ printstat(s, mesq_send_unexpected_error);
+ printstat(s, mesq_send_lb_overflow);
+ printstat(s, mesq_send_qlimit_reached);
+ printstat(s, mesq_send_amo_nacked);
+ printstat(s, mesq_send_put_nacked);
+ printstat(s, mesq_qf_locked);
+ printstat(s, mesq_qf_noop_not_full);
+ printstat(s, mesq_qf_switch_head_failed);
+ printstat(s, mesq_qf_unexpected_error);
+ printstat(s, mesq_noop_unexpected_error);
+ printstat(s, mesq_noop_lb_overflow);
+ printstat(s, mesq_noop_qlimit_reached);
+ printstat(s, mesq_noop_amo_nacked);
+ printstat(s, mesq_noop_put_nacked);
+ printstat(s, mesq_noop_page_overflow);
+ return 0;
+}
+
+static ssize_t statistics_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *data)
+{
+ memset(&gru_stats, 0, sizeof(gru_stats));
+ return count;
+}
+
+static int mcs_statistics_show(struct seq_file *s, void *p)
+{
+ int op;
+ unsigned long total, count, max;
+ static char *id[] = {"cch_allocate", "cch_start", "cch_interrupt",
+ "cch_interrupt_sync", "cch_deallocate", "tfh_write_only",
+ "tfh_write_restart", "tgh_invalidate"};
+
+ seq_printf(s, "%-20s%12s%12s%12s\n", "#id", "count", "aver-clks", "max-clks");
+ for (op = 0; op < mcsop_last; op++) {
+ count = atomic_long_read(&mcs_op_statistics[op].count);
+ total = atomic_long_read(&mcs_op_statistics[op].total);
+ max = mcs_op_statistics[op].max;
+ seq_printf(s, "%-20s%12ld%12ld%12ld\n", id[op], count,
+ count ? total / count : 0, max);
+ }
+ return 0;
+}
+
+static ssize_t mcs_statistics_write(struct file *file,
+ const char __user *userbuf, size_t count, loff_t *data)
+{
+ memset(mcs_op_statistics, 0, sizeof(mcs_op_statistics));
+ return count;
+}
+
+static int options_show(struct seq_file *s, void *p)
+{
+ seq_printf(s, "#bitmask: 1=trace, 2=statistics\n");
+ seq_printf(s, "0x%lx\n", gru_options);
+ return 0;
+}
+
+static ssize_t options_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *data)
+{
+ int ret;
+
+ ret = kstrtoul_from_user(userbuf, count, 0, &gru_options);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static int cch_seq_show(struct seq_file *file, void *data)
+{
+ long gid = *(long *)data;
+ int i;
+ struct gru_state *gru = GID_TO_GRU(gid);
+ struct gru_thread_state *ts;
+ const char *mode[] = { "??", "UPM", "INTR", "OS_POLL" };
+
+ if (gid == 0)
+ seq_printf(file, "#%5s%5s%6s%7s%9s%6s%8s%8s\n", "gid", "bid",
+ "ctx#", "asid", "pid", "cbrs", "dsbytes", "mode");
+ if (gru)
+ for (i = 0; i < GRU_NUM_CCH; i++) {
+ ts = gru->gs_gts[i];
+ if (!ts)
+ continue;
+ seq_printf(file, " %5d%5d%6d%7d%9d%6d%8d%8s\n",
+ gru->gs_gid, gru->gs_blade_id, i,
+ is_kernel_context(ts) ? 0 : ts->ts_gms->ms_asids[gid].mt_asid,
+ is_kernel_context(ts) ? 0 : ts->ts_tgid_owner,
+ ts->ts_cbr_au_count * GRU_CBR_AU_SIZE,
+ ts->ts_cbr_au_count * GRU_DSR_AU_BYTES,
+ mode[ts->ts_user_options &
+ GRU_OPT_MISS_MASK]);
+ }
+
+ return 0;
+}
+
+static int gru_seq_show(struct seq_file *file, void *data)
+{
+ long gid = *(long *)data, ctxfree, cbrfree, dsrfree;
+ struct gru_state *gru = GID_TO_GRU(gid);
+
+ if (gid == 0) {
+ seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "gid", "nid",
+ "ctx", "cbr", "dsr", "ctx", "cbr", "dsr");
+ seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "", "", "busy",
+ "busy", "busy", "free", "free", "free");
+ }
+ if (gru) {
+ ctxfree = GRU_NUM_CCH - gru->gs_active_contexts;
+ cbrfree = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
+ dsrfree = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
+ seq_printf(file, " %5d%5d%7ld%6ld%6ld%8ld%6ld%6ld\n",
+ gru->gs_gid, gru->gs_blade_id, GRU_NUM_CCH - ctxfree,
+ GRU_NUM_CBE - cbrfree, GRU_NUM_DSR_BYTES - dsrfree,
+ ctxfree, cbrfree, dsrfree);
+ }
+
+ return 0;
+}
+
+static void seq_stop(struct seq_file *file, void *data)
+{
+}
+
+static void *seq_start(struct seq_file *file, loff_t *gid)
+{
+ if (*gid < gru_max_gids)
+ return gid;
+ return NULL;
+}
+
+static void *seq_next(struct seq_file *file, void *data, loff_t *gid)
+{
+ (*gid)++;
+ if (*gid < gru_max_gids)
+ return gid;
+ return NULL;
+}
+
+static const struct seq_operations cch_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = cch_seq_show
+};
+
+static const struct seq_operations gru_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = gru_seq_show
+};
+
+static int statistics_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, statistics_show, NULL);
+}
+
+static int mcs_statistics_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, mcs_statistics_show, NULL);
+}
+
+static int options_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, options_show, NULL);
+}
+
+/* *INDENT-OFF* */
+static const struct file_operations statistics_fops = {
+ .open = statistics_open,
+ .read = seq_read,
+ .write = statistics_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations mcs_statistics_fops = {
+ .open = mcs_statistics_open,
+ .read = seq_read,
+ .write = mcs_statistics_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations options_fops = {
+ .open = options_open,
+ .read = seq_read,
+ .write = options_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct proc_dir_entry *proc_gru __read_mostly;
+
+int gru_proc_init(void)
+{
+ proc_gru = proc_mkdir("sgi_uv/gru", NULL);
+ if (!proc_gru)
+ return -1;
+ if (!proc_create("statistics", 0644, proc_gru, &statistics_fops))
+ goto err;
+ if (!proc_create("mcs_statistics", 0644, proc_gru, &mcs_statistics_fops))
+ goto err;
+ if (!proc_create("debug_options", 0644, proc_gru, &options_fops))
+ goto err;
+ if (!proc_create_seq("cch_status", 0444, proc_gru, &cch_seq_ops))
+ goto err;
+ if (!proc_create_seq("gru_status", 0444, proc_gru, &gru_seq_ops))
+ goto err;
+ return 0;
+err:
+ remove_proc_subtree("sgi_uv/gru", NULL);
+ return -1;
+}
+
+void gru_proc_exit(void)
+{
+ remove_proc_subtree("sgi_uv/gru", NULL);
+}
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h
new file mode 100644
index 000000000..3e041b6f7
--- /dev/null
+++ b/drivers/misc/sgi-gru/grutables.h
@@ -0,0 +1,679 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * GRU DRIVER TABLES, MACROS, externs, etc
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __GRUTABLES_H__
+#define __GRUTABLES_H__
+
+/*
+ * GRU Chiplet:
+ * The GRU is a user addressible memory accelerator. It provides
+ * several forms of load, store, memset, bcopy instructions. In addition, it
+ * contains special instructions for AMOs, sending messages to message
+ * queues, etc.
+ *
+ * The GRU is an integral part of the node controller. It connects
+ * directly to the cpu socket. In its current implementation, there are 2
+ * GRU chiplets in the node controller on each blade (~node).
+ *
+ * The entire GRU memory space is fully coherent and cacheable by the cpus.
+ *
+ * Each GRU chiplet has a physical memory map that looks like the following:
+ *
+ * +-----------------+
+ * |/////////////////|
+ * |/////////////////|
+ * |/////////////////|
+ * |/////////////////|
+ * |/////////////////|
+ * |/////////////////|
+ * |/////////////////|
+ * |/////////////////|
+ * +-----------------+
+ * | system control |
+ * +-----------------+ _______ +-------------+
+ * |/////////////////| / | |
+ * |/////////////////| / | |
+ * |/////////////////| / | instructions|
+ * |/////////////////| / | |
+ * |/////////////////| / | |
+ * |/////////////////| / |-------------|
+ * |/////////////////| / | |
+ * +-----------------+ | |
+ * | context 15 | | data |
+ * +-----------------+ | |
+ * | ...... | \ | |
+ * +-----------------+ \____________ +-------------+
+ * | context 1 |
+ * +-----------------+
+ * | context 0 |
+ * +-----------------+
+ *
+ * Each of the "contexts" is a chunk of memory that can be mmaped into user
+ * space. The context consists of 2 parts:
+ *
+ * - an instruction space that can be directly accessed by the user
+ * to issue GRU instructions and to check instruction status.
+ *
+ * - a data area that acts as normal RAM.
+ *
+ * User instructions contain virtual addresses of data to be accessed by the
+ * GRU. The GRU contains a TLB that is used to convert these user virtual
+ * addresses to physical addresses.
+ *
+ * The "system control" area of the GRU chiplet is used by the kernel driver
+ * to manage user contexts and to perform functions such as TLB dropin and
+ * purging.
+ *
+ * One context may be reserved for the kernel and used for cross-partition
+ * communication. The GRU will also be used to asynchronously zero out
+ * large blocks of memory (not currently implemented).
+ *
+ *
+ * Tables:
+ *
+ * VDATA-VMA Data - Holds a few parameters. Head of linked list of
+ * GTS tables for threads using the GSEG
+ * GTS - Gru Thread State - contains info for managing a GSEG context. A
+ * GTS is allocated for each thread accessing a
+ * GSEG.
+ * GTD - GRU Thread Data - contains shadow copy of GRU data when GSEG is
+ * not loaded into a GRU
+ * GMS - GRU Memory Struct - Used to manage TLB shootdowns. Tracks GRUs
+ * where a GSEG has been loaded. Similar to
+ * an mm_struct but for GRU.
+ *
+ * GS - GRU State - Used to manage the state of a GRU chiplet
+ * BS - Blade State - Used to manage state of all GRU chiplets
+ * on a blade
+ *
+ *
+ * Normal task tables for task using GRU.
+ * - 2 threads in process
+ * - 2 GSEGs open in process
+ * - GSEG1 is being used by both threads
+ * - GSEG2 is used only by thread 2
+ *
+ * task -->|
+ * task ---+---> mm ->------ (notifier) -------+-> gms
+ * | |
+ * |--> vma -> vdata ---> gts--->| GSEG1 (thread1)
+ * | | |
+ * | +-> gts--->| GSEG1 (thread2)
+ * | |
+ * |--> vma -> vdata ---> gts--->| GSEG2 (thread2)
+ * .
+ * .
+ *
+ * GSEGs are marked DONTCOPY on fork
+ *
+ * At open
+ * file.private_data -> NULL
+ *
+ * At mmap,
+ * vma -> vdata
+ *
+ * After gseg reference
+ * vma -> vdata ->gts
+ *
+ * After fork
+ * parent
+ * vma -> vdata -> gts
+ * child
+ * (vma is not copied)
+ *
+ */
+
+#include <linux/rmap.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mm_types.h>
+#include "gru.h"
+#include "grulib.h"
+#include "gruhandles.h"
+
+extern struct gru_stats_s gru_stats;
+extern struct gru_blade_state *gru_base[];
+extern unsigned long gru_start_paddr, gru_end_paddr;
+extern void *gru_start_vaddr;
+extern unsigned int gru_max_gids;
+
+#define GRU_MAX_BLADES MAX_NUMNODES
+#define GRU_MAX_GRUS (GRU_MAX_BLADES * GRU_CHIPLETS_PER_BLADE)
+
+#define GRU_DRIVER_ID_STR "SGI GRU Device Driver"
+#define GRU_DRIVER_VERSION_STR "0.85"
+
+/*
+ * GRU statistics.
+ */
+struct gru_stats_s {
+ atomic_long_t vdata_alloc;
+ atomic_long_t vdata_free;
+ atomic_long_t gts_alloc;
+ atomic_long_t gts_free;
+ atomic_long_t gms_alloc;
+ atomic_long_t gms_free;
+ atomic_long_t gts_double_allocate;
+ atomic_long_t assign_context;
+ atomic_long_t assign_context_failed;
+ atomic_long_t free_context;
+ atomic_long_t load_user_context;
+ atomic_long_t load_kernel_context;
+ atomic_long_t lock_kernel_context;
+ atomic_long_t unlock_kernel_context;
+ atomic_long_t steal_user_context;
+ atomic_long_t steal_kernel_context;
+ atomic_long_t steal_context_failed;
+ atomic_long_t nopfn;
+ atomic_long_t asid_new;
+ atomic_long_t asid_next;
+ atomic_long_t asid_wrap;
+ atomic_long_t asid_reuse;
+ atomic_long_t intr;
+ atomic_long_t intr_cbr;
+ atomic_long_t intr_tfh;
+ atomic_long_t intr_spurious;
+ atomic_long_t intr_mm_lock_failed;
+ atomic_long_t call_os;
+ atomic_long_t call_os_wait_queue;
+ atomic_long_t user_flush_tlb;
+ atomic_long_t user_unload_context;
+ atomic_long_t user_exception;
+ atomic_long_t set_context_option;
+ atomic_long_t check_context_retarget_intr;
+ atomic_long_t check_context_unload;
+ atomic_long_t tlb_dropin;
+ atomic_long_t tlb_preload_page;
+ atomic_long_t tlb_dropin_fail_no_asid;
+ atomic_long_t tlb_dropin_fail_upm;
+ atomic_long_t tlb_dropin_fail_invalid;
+ atomic_long_t tlb_dropin_fail_range_active;
+ atomic_long_t tlb_dropin_fail_idle;
+ atomic_long_t tlb_dropin_fail_fmm;
+ atomic_long_t tlb_dropin_fail_no_exception;
+ atomic_long_t tfh_stale_on_fault;
+ atomic_long_t mmu_invalidate_range;
+ atomic_long_t mmu_invalidate_page;
+ atomic_long_t flush_tlb;
+ atomic_long_t flush_tlb_gru;
+ atomic_long_t flush_tlb_gru_tgh;
+ atomic_long_t flush_tlb_gru_zero_asid;
+
+ atomic_long_t copy_gpa;
+ atomic_long_t read_gpa;
+
+ atomic_long_t mesq_receive;
+ atomic_long_t mesq_receive_none;
+ atomic_long_t mesq_send;
+ atomic_long_t mesq_send_failed;
+ atomic_long_t mesq_noop;
+ atomic_long_t mesq_send_unexpected_error;
+ atomic_long_t mesq_send_lb_overflow;
+ atomic_long_t mesq_send_qlimit_reached;
+ atomic_long_t mesq_send_amo_nacked;
+ atomic_long_t mesq_send_put_nacked;
+ atomic_long_t mesq_page_overflow;
+ atomic_long_t mesq_qf_locked;
+ atomic_long_t mesq_qf_noop_not_full;
+ atomic_long_t mesq_qf_switch_head_failed;
+ atomic_long_t mesq_qf_unexpected_error;
+ atomic_long_t mesq_noop_unexpected_error;
+ atomic_long_t mesq_noop_lb_overflow;
+ atomic_long_t mesq_noop_qlimit_reached;
+ atomic_long_t mesq_noop_amo_nacked;
+ atomic_long_t mesq_noop_put_nacked;
+ atomic_long_t mesq_noop_page_overflow;
+
+};
+
+enum mcs_op {cchop_allocate, cchop_start, cchop_interrupt, cchop_interrupt_sync,
+ cchop_deallocate, tfhop_write_only, tfhop_write_restart,
+ tghop_invalidate, mcsop_last};
+
+struct mcs_op_statistic {
+ atomic_long_t count;
+ atomic_long_t total;
+ unsigned long max;
+};
+
+extern struct mcs_op_statistic mcs_op_statistics[mcsop_last];
+
+#define OPT_DPRINT 1
+#define OPT_STATS 2
+
+
+#define IRQ_GRU 110 /* Starting IRQ number for interrupts */
+
+/* Delay in jiffies between attempts to assign a GRU context */
+#define GRU_ASSIGN_DELAY ((HZ * 20) / 1000)
+
+/*
+ * If a process has it's context stolen, min delay in jiffies before trying to
+ * steal a context from another process.
+ */
+#define GRU_STEAL_DELAY ((HZ * 200) / 1000)
+
+#define STAT(id) do { \
+ if (gru_options & OPT_STATS) \
+ atomic_long_inc(&gru_stats.id); \
+ } while (0)
+
+#ifdef CONFIG_SGI_GRU_DEBUG
+#define gru_dbg(dev, fmt, x...) \
+ do { \
+ if (gru_options & OPT_DPRINT) \
+ printk(KERN_DEBUG "GRU:%d %s: " fmt, smp_processor_id(), __func__, x);\
+ } while (0)
+#else
+#define gru_dbg(x...)
+#endif
+
+/*-----------------------------------------------------------------------------
+ * ASID management
+ */
+#define MAX_ASID 0xfffff0
+#define MIN_ASID 8
+#define ASID_INC 8 /* number of regions */
+
+/* Generate a GRU asid value from a GRU base asid & a virtual address. */
+#define VADDR_HI_BIT 64
+#define GRUREGION(addr) ((addr) >> (VADDR_HI_BIT - 3) & 3)
+#define GRUASID(asid, addr) ((asid) + GRUREGION(addr))
+
+/*------------------------------------------------------------------------------
+ * File & VMS Tables
+ */
+
+struct gru_state;
+
+/*
+ * This structure is pointed to from the mmstruct via the notifier pointer.
+ * There is one of these per address space.
+ */
+struct gru_mm_tracker { /* pack to reduce size */
+ unsigned int mt_asid_gen:24; /* ASID wrap count */
+ unsigned int mt_asid:24; /* current base ASID for gru */
+ unsigned short mt_ctxbitmap:16;/* bitmap of contexts using
+ asid */
+} __attribute__ ((packed));
+
+struct gru_mm_struct {
+ struct mmu_notifier ms_notifier;
+ atomic_t ms_refcnt;
+ spinlock_t ms_asid_lock; /* protects ASID assignment */
+ atomic_t ms_range_active;/* num range_invals active */
+ char ms_released;
+ wait_queue_head_t ms_wait_queue;
+ DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS);
+ struct gru_mm_tracker ms_asids[GRU_MAX_GRUS];
+};
+
+/*
+ * One of these structures is allocated when a GSEG is mmaped. The
+ * structure is pointed to by the vma->vm_private_data field in the vma struct.
+ */
+struct gru_vma_data {
+ spinlock_t vd_lock; /* Serialize access to vma */
+ struct list_head vd_head; /* head of linked list of gts */
+ long vd_user_options;/* misc user option flags */
+ int vd_cbr_au_count;
+ int vd_dsr_au_count;
+ unsigned char vd_tlb_preload_count;
+};
+
+/*
+ * One of these is allocated for each thread accessing a mmaped GRU. A linked
+ * list of these structure is hung off the struct gru_vma_data in the mm_struct.
+ */
+struct gru_thread_state {
+ struct list_head ts_next; /* list - head at vma-private */
+ struct mutex ts_ctxlock; /* load/unload CTX lock */
+ struct mm_struct *ts_mm; /* mm currently mapped to
+ context */
+ struct vm_area_struct *ts_vma; /* vma of GRU context */
+ struct gru_state *ts_gru; /* GRU where the context is
+ loaded */
+ struct gru_mm_struct *ts_gms; /* asid & ioproc struct */
+ unsigned char ts_tlb_preload_count; /* TLB preload pages */
+ unsigned long ts_cbr_map; /* map of allocated CBRs */
+ unsigned long ts_dsr_map; /* map of allocated DATA
+ resources */
+ unsigned long ts_steal_jiffies;/* jiffies when context last
+ stolen */
+ long ts_user_options;/* misc user option flags */
+ pid_t ts_tgid_owner; /* task that is using the
+ context - for migration */
+ short ts_user_blade_id;/* user selected blade */
+ char ts_user_chiplet_id;/* user selected chiplet */
+ unsigned short ts_sizeavail; /* Pagesizes in use */
+ int ts_tsid; /* thread that owns the
+ structure */
+ int ts_tlb_int_select;/* target cpu if interrupts
+ enabled */
+ int ts_ctxnum; /* context number where the
+ context is loaded */
+ atomic_t ts_refcnt; /* reference count GTS */
+ unsigned char ts_dsr_au_count;/* Number of DSR resources
+ required for contest */
+ unsigned char ts_cbr_au_count;/* Number of CBR resources
+ required for contest */
+ char ts_cch_req_slice;/* CCH packet slice */
+ char ts_blade; /* If >= 0, migrate context if
+ ref from different blade */
+ char ts_force_cch_reload;
+ char ts_cbr_idx[GRU_CBR_AU];/* CBR numbers of each
+ allocated CB */
+ int ts_data_valid; /* Indicates if ts_gdata has
+ valid data */
+ struct gru_gseg_statistics ustats; /* User statistics */
+ unsigned long ts_gdata[0]; /* save area for GRU data (CB,
+ DS, CBE) */
+};
+
+/*
+ * Threaded programs actually allocate an array of GSEGs when a context is
+ * created. Each thread uses a separate GSEG. TSID is the index into the GSEG
+ * array.
+ */
+#define TSID(a, v) (((a) - (v)->vm_start) / GRU_GSEG_PAGESIZE)
+#define UGRUADDR(gts) ((gts)->ts_vma->vm_start + \
+ (gts)->ts_tsid * GRU_GSEG_PAGESIZE)
+
+#define NULLCTX (-1) /* if context not loaded into GRU */
+
+/*-----------------------------------------------------------------------------
+ * GRU State Tables
+ */
+
+/*
+ * One of these exists for each GRU chiplet.
+ */
+struct gru_state {
+ struct gru_blade_state *gs_blade; /* GRU state for entire
+ blade */
+ unsigned long gs_gru_base_paddr; /* Physical address of
+ gru segments (64) */
+ void *gs_gru_base_vaddr; /* Virtual address of
+ gru segments (64) */
+ unsigned short gs_gid; /* unique GRU number */
+ unsigned short gs_blade_id; /* blade of GRU */
+ unsigned char gs_chiplet_id; /* blade chiplet of GRU */
+ unsigned char gs_tgh_local_shift; /* used to pick TGH for
+ local flush */
+ unsigned char gs_tgh_first_remote; /* starting TGH# for
+ remote flush */
+ spinlock_t gs_asid_lock; /* lock used for
+ assigning asids */
+ spinlock_t gs_lock; /* lock used for
+ assigning contexts */
+
+ /* -- the following are protected by the gs_asid_lock spinlock ---- */
+ unsigned int gs_asid; /* Next availe ASID */
+ unsigned int gs_asid_limit; /* Limit of available
+ ASIDs */
+ unsigned int gs_asid_gen; /* asid generation.
+ Inc on wrap */
+
+ /* --- the following fields are protected by the gs_lock spinlock --- */
+ unsigned long gs_context_map; /* bitmap to manage
+ contexts in use */
+ unsigned long gs_cbr_map; /* bitmap to manage CB
+ resources */
+ unsigned long gs_dsr_map; /* bitmap used to manage
+ DATA resources */
+ unsigned int gs_reserved_cbrs; /* Number of kernel-
+ reserved cbrs */
+ unsigned int gs_reserved_dsr_bytes; /* Bytes of kernel-
+ reserved dsrs */
+ unsigned short gs_active_contexts; /* number of contexts
+ in use */
+ struct gru_thread_state *gs_gts[GRU_NUM_CCH]; /* GTS currently using
+ the context */
+ int gs_irq[GRU_NUM_TFM]; /* Interrupt irqs */
+};
+
+/*
+ * This structure contains the GRU state for all the GRUs on a blade.
+ */
+struct gru_blade_state {
+ void *kernel_cb; /* First kernel
+ reserved cb */
+ void *kernel_dsr; /* First kernel
+ reserved DSR */
+ struct rw_semaphore bs_kgts_sema; /* lock for kgts */
+ struct gru_thread_state *bs_kgts; /* GTS for kernel use */
+
+ /* ---- the following are used for managing kernel async GRU CBRs --- */
+ int bs_async_dsr_bytes; /* DSRs for async */
+ int bs_async_cbrs; /* CBRs AU for async */
+ struct completion *bs_async_wq;
+
+ /* ---- the following are protected by the bs_lock spinlock ---- */
+ spinlock_t bs_lock; /* lock used for
+ stealing contexts */
+ int bs_lru_ctxnum; /* STEAL - last context
+ stolen */
+ struct gru_state *bs_lru_gru; /* STEAL - last gru
+ stolen */
+
+ struct gru_state bs_grus[GRU_CHIPLETS_PER_BLADE];
+};
+
+/*-----------------------------------------------------------------------------
+ * Address Primitives
+ */
+#define get_tfm_for_cpu(g, c) \
+ ((struct gru_tlb_fault_map *)get_tfm((g)->gs_gru_base_vaddr, (c)))
+#define get_tfh_by_index(g, i) \
+ ((struct gru_tlb_fault_handle *)get_tfh((g)->gs_gru_base_vaddr, (i)))
+#define get_tgh_by_index(g, i) \
+ ((struct gru_tlb_global_handle *)get_tgh((g)->gs_gru_base_vaddr, (i)))
+#define get_cbe_by_index(g, i) \
+ ((struct gru_control_block_extended *)get_cbe((g)->gs_gru_base_vaddr,\
+ (i)))
+
+/*-----------------------------------------------------------------------------
+ * Useful Macros
+ */
+
+/* Given a blade# & chiplet#, get a pointer to the GRU */
+#define get_gru(b, c) (&gru_base[b]->bs_grus[c])
+
+/* Number of bytes to save/restore when unloading/loading GRU contexts */
+#define DSR_BYTES(dsr) ((dsr) * GRU_DSR_AU_BYTES)
+#define CBR_BYTES(cbr) ((cbr) * GRU_HANDLE_BYTES * GRU_CBR_AU_SIZE * 2)
+
+/* Convert a user CB number to the actual CBRNUM */
+#define thread_cbr_number(gts, n) ((gts)->ts_cbr_idx[(n) / GRU_CBR_AU_SIZE] \
+ * GRU_CBR_AU_SIZE + (n) % GRU_CBR_AU_SIZE)
+
+/* Convert a gid to a pointer to the GRU */
+#define GID_TO_GRU(gid) \
+ (gru_base[(gid) / GRU_CHIPLETS_PER_BLADE] ? \
+ (&gru_base[(gid) / GRU_CHIPLETS_PER_BLADE]-> \
+ bs_grus[(gid) % GRU_CHIPLETS_PER_BLADE]) : \
+ NULL)
+
+/* Scan all active GRUs in a GRU bitmap */
+#define for_each_gru_in_bitmap(gid, map) \
+ for_each_set_bit((gid), (map), GRU_MAX_GRUS)
+
+/* Scan all active GRUs on a specific blade */
+#define for_each_gru_on_blade(gru, nid, i) \
+ for ((gru) = gru_base[nid]->bs_grus, (i) = 0; \
+ (i) < GRU_CHIPLETS_PER_BLADE; \
+ (i)++, (gru)++)
+
+/* Scan all GRUs */
+#define foreach_gid(gid) \
+ for ((gid) = 0; (gid) < gru_max_gids; (gid)++)
+
+/* Scan all active GTSs on a gru. Note: must hold ss_lock to use this macro. */
+#define for_each_gts_on_gru(gts, gru, ctxnum) \
+ for ((ctxnum) = 0; (ctxnum) < GRU_NUM_CCH; (ctxnum)++) \
+ if (((gts) = (gru)->gs_gts[ctxnum]))
+
+/* Scan each CBR whose bit is set in a TFM (or copy of) */
+#define for_each_cbr_in_tfm(i, map) \
+ for_each_set_bit((i), (map), GRU_NUM_CBE)
+
+/* Scan each CBR in a CBR bitmap. Note: multiple CBRs in an allocation unit */
+#define for_each_cbr_in_allocation_map(i, map, k) \
+ for_each_set_bit((k), (map), GRU_CBR_AU) \
+ for ((i) = (k)*GRU_CBR_AU_SIZE; \
+ (i) < ((k) + 1) * GRU_CBR_AU_SIZE; (i)++)
+
+/* Scan each DSR in a DSR bitmap. Note: multiple DSRs in an allocation unit */
+#define for_each_dsr_in_allocation_map(i, map, k) \
+ for_each_set_bit((k), (const unsigned long *)(map), GRU_DSR_AU) \
+ for ((i) = (k) * GRU_DSR_AU_CL; \
+ (i) < ((k) + 1) * GRU_DSR_AU_CL; (i)++)
+
+#define gseg_physical_address(gru, ctxnum) \
+ ((gru)->gs_gru_base_paddr + ctxnum * GRU_GSEG_STRIDE)
+#define gseg_virtual_address(gru, ctxnum) \
+ ((gru)->gs_gru_base_vaddr + ctxnum * GRU_GSEG_STRIDE)
+
+/*-----------------------------------------------------------------------------
+ * Lock / Unlock GRU handles
+ * Use the "delresp" bit in the handle as a "lock" bit.
+ */
+
+/* Lock hierarchy checking enabled only in emulator */
+
+/* 0 = lock failed, 1 = locked */
+static inline int __trylock_handle(void *h)
+{
+ return !test_and_set_bit(1, h);
+}
+
+static inline void __lock_handle(void *h)
+{
+ while (test_and_set_bit(1, h))
+ cpu_relax();
+}
+
+static inline void __unlock_handle(void *h)
+{
+ clear_bit(1, h);
+}
+
+static inline int trylock_cch_handle(struct gru_context_configuration_handle *cch)
+{
+ return __trylock_handle(cch);
+}
+
+static inline void lock_cch_handle(struct gru_context_configuration_handle *cch)
+{
+ __lock_handle(cch);
+}
+
+static inline void unlock_cch_handle(struct gru_context_configuration_handle
+ *cch)
+{
+ __unlock_handle(cch);
+}
+
+static inline void lock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+ __lock_handle(tgh);
+}
+
+static inline void unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+ __unlock_handle(tgh);
+}
+
+static inline int is_kernel_context(struct gru_thread_state *gts)
+{
+ return !gts->ts_mm;
+}
+
+/*
+ * The following are for Nehelem-EX. A more general scheme is needed for
+ * future processors.
+ */
+#define UV_MAX_INT_CORES 8
+#define uv_cpu_socket_number(p) ((cpu_physical_id(p) >> 5) & 1)
+#define uv_cpu_ht_number(p) (cpu_physical_id(p) & 1)
+#define uv_cpu_core_number(p) (((cpu_physical_id(p) >> 2) & 4) | \
+ ((cpu_physical_id(p) >> 1) & 3))
+/*-----------------------------------------------------------------------------
+ * Function prototypes & externs
+ */
+struct gru_unload_context_req;
+
+extern const struct vm_operations_struct gru_vm_ops;
+extern struct device *grudev;
+
+extern struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma,
+ int tsid);
+extern struct gru_thread_state *gru_find_thread_state(struct vm_area_struct
+ *vma, int tsid);
+extern struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct
+ *vma, int tsid);
+extern struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts);
+extern void gru_load_context(struct gru_thread_state *gts);
+extern void gru_steal_context(struct gru_thread_state *gts);
+extern void gru_unload_context(struct gru_thread_state *gts, int savestate);
+extern int gru_update_cch(struct gru_thread_state *gts);
+extern void gts_drop(struct gru_thread_state *gts);
+extern void gru_tgh_flush_init(struct gru_state *gru);
+extern int gru_kservices_init(void);
+extern void gru_kservices_exit(void);
+extern irqreturn_t gru0_intr(int irq, void *dev_id);
+extern irqreturn_t gru1_intr(int irq, void *dev_id);
+extern irqreturn_t gru_intr_mblade(int irq, void *dev_id);
+extern int gru_dump_chiplet_request(unsigned long arg);
+extern long gru_get_gseg_statistics(unsigned long arg);
+extern int gru_handle_user_call_os(unsigned long address);
+extern int gru_user_flush_tlb(unsigned long arg);
+extern int gru_user_unload_context(unsigned long arg);
+extern int gru_get_exception_detail(unsigned long arg);
+extern int gru_set_context_option(unsigned long address);
+extern void gru_check_context_placement(struct gru_thread_state *gts);
+extern int gru_cpu_fault_map_id(void);
+extern struct vm_area_struct *gru_find_vma(unsigned long vaddr);
+extern void gru_flush_all_tlb(struct gru_state *gru);
+extern int gru_proc_init(void);
+extern void gru_proc_exit(void);
+
+extern struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
+ int cbr_au_count, int dsr_au_count,
+ unsigned char tlb_preload_count, int options, int tsid);
+extern unsigned long gru_reserve_cb_resources(struct gru_state *gru,
+ int cbr_au_count, char *cbmap);
+extern unsigned long gru_reserve_ds_resources(struct gru_state *gru,
+ int dsr_au_count, char *dsmap);
+extern vm_fault_t gru_fault(struct vm_fault *vmf);
+extern struct gru_mm_struct *gru_register_mmu_notifier(void);
+extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms);
+
+extern int gru_ktest(unsigned long arg);
+extern void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
+ unsigned long len);
+
+extern unsigned long gru_options;
+
+#endif /* __GRUTABLES_H__ */
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
new file mode 100644
index 000000000..be28f05bf
--- /dev/null
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -0,0 +1,370 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * MMUOPS callbacks + TLB flushing
+ *
+ * This file handles emu notifier callbacks from the core kernel. The callbacks
+ * are used to update the TLB in the GRU as a result of changes in the
+ * state of a process address space. This file also handles TLB invalidates
+ * from the GRU driver.
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/hugetlb.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/srcu.h>
+#include <asm/processor.h>
+#include "gru.h"
+#include "grutables.h"
+#include <asm/uv/uv_hub.h>
+
+#define gru_random() get_cycles()
+
+/* ---------------------------------- TLB Invalidation functions --------
+ * get_tgh_handle
+ *
+ * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
+ * local blade, use a fixed TGH that is a function of the blade-local cpu
+ * number. Normally, this TGH is private to the cpu & no contention occurs for
+ * the TGH. For offblade GRUs, select a random TGH in the range above the
+ * private TGHs. A spinlock is required to access this TGH & the lock must be
+ * released when the invalidate is completes. This sucks, but it is the best we
+ * can do.
+ *
+ * Note that the spinlock is IN the TGH handle so locking does not involve
+ * additional cache lines.
+ *
+ */
+static inline int get_off_blade_tgh(struct gru_state *gru)
+{
+ int n;
+
+ n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
+ n = gru_random() % n;
+ n += gru->gs_tgh_first_remote;
+ return n;
+}
+
+static inline int get_on_blade_tgh(struct gru_state *gru)
+{
+ return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
+}
+
+static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
+ *gru)
+{
+ struct gru_tlb_global_handle *tgh;
+ int n;
+
+ preempt_disable();
+ if (uv_numa_blade_id() == gru->gs_blade_id)
+ n = get_on_blade_tgh(gru);
+ else
+ n = get_off_blade_tgh(gru);
+ tgh = get_tgh_by_index(gru, n);
+ lock_tgh_handle(tgh);
+
+ return tgh;
+}
+
+static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+ unlock_tgh_handle(tgh);
+ preempt_enable();
+}
+
+/*
+ * gru_flush_tlb_range
+ *
+ * General purpose TLB invalidation function. This function scans every GRU in
+ * the ENTIRE system (partition) looking for GRUs where the specified MM has
+ * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
+ * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
+ * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
+ * cost of (possibly) a large number of future TLBmisses.
+ *
+ * The current algorithm is optimized based on the following (somewhat true)
+ * assumptions:
+ * - GRU contexts are not loaded into a GRU unless a reference is made to
+ * the data segment or control block (this is true, not an assumption).
+ * If a DS/CB is referenced, the user will also issue instructions that
+ * cause TLBmisses. It is not necessary to optimize for the case where
+ * contexts are loaded but no instructions cause TLB misses. (I know
+ * this will happen but I'm not optimizing for it).
+ * - GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
+ * a few usec but in unusual cases, it could be longer. Avoid if
+ * possible.
+ * - intrablade process migration between cpus is not frequent but is
+ * common.
+ * - a GRU context is not typically migrated to a different GRU on the
+ * blade because of intrablade migration
+ * - interblade migration is rare. Processes migrate their GRU context to
+ * the new blade.
+ * - if interblade migration occurs, migration back to the original blade
+ * is very very rare (ie., no optimization for this case)
+ * - most GRU instruction operate on a subset of the user REGIONS. Code
+ * & shared library regions are not likely targets of GRU instructions.
+ *
+ * To help improve the efficiency of TLB invalidation, the GMS data
+ * structure is maintained for EACH address space (MM struct). The GMS is
+ * also the structure that contains the pointer to the mmu callout
+ * functions. This structure is linked to the mm_struct for the address space
+ * using the mmu "register" function. The mmu interfaces are used to
+ * provide the callbacks for TLB invalidation. The GMS contains:
+ *
+ * - asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
+ * loaded into the GRU.
+ * - asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
+ * the above array
+ * - ctxbitmap[maxgrus]. Indicates the contexts that are currently active
+ * in the GRU for the address space. This bitmap must be passed to the
+ * GRU to do an invalidate.
+ *
+ * The current algorithm for invalidating TLBs is:
+ * - scan the asidmap for GRUs where the context has been loaded, ie,
+ * asid is non-zero.
+ * - for each gru found:
+ * - if the ctxtmap is non-zero, there are active contexts in the
+ * GRU. TLB invalidate instructions must be issued to the GRU.
+ * - if the ctxtmap is zero, no context is active. Set the ASID to
+ * zero to force a full TLB invalidation. This is fast but will
+ * cause a lot of TLB misses if the context is reloaded onto the
+ * GRU
+ *
+ */
+
+void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
+ unsigned long len)
+{
+ struct gru_state *gru;
+ struct gru_mm_tracker *asids;
+ struct gru_tlb_global_handle *tgh;
+ unsigned long num;
+ int grupagesize, pagesize, pageshift, gid, asid;
+
+ /* ZZZ TODO - handle huge pages */
+ pageshift = PAGE_SHIFT;
+ pagesize = (1UL << pageshift);
+ grupagesize = GRU_PAGESIZE(pageshift);
+ num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);
+
+ STAT(flush_tlb);
+ gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
+ start, len, gms->ms_asidmap[0]);
+
+ spin_lock(&gms->ms_asid_lock);
+ for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
+ STAT(flush_tlb_gru);
+ gru = GID_TO_GRU(gid);
+ asids = gms->ms_asids + gid;
+ asid = asids->mt_asid;
+ if (asids->mt_ctxbitmap && asid) {
+ STAT(flush_tlb_gru_tgh);
+ asid = GRUASID(asid, start);
+ gru_dbg(grudev,
+ " FLUSH gruid %d, asid 0x%x, vaddr 0x%lx, vamask 0x%x, num %ld, cbmap 0x%x\n",
+ gid, asid, start, grupagesize, num, asids->mt_ctxbitmap);
+ tgh = get_lock_tgh_handle(gru);
+ tgh_invalidate(tgh, start, ~0, asid, grupagesize, 0,
+ num - 1, asids->mt_ctxbitmap);
+ get_unlock_tgh_handle(tgh);
+ } else {
+ STAT(flush_tlb_gru_zero_asid);
+ asids->mt_asid = 0;
+ __clear_bit(gru->gs_gid, gms->ms_asidmap);
+ gru_dbg(grudev,
+ " CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
+ gid, asid, asids->mt_ctxbitmap,
+ gms->ms_asidmap[0]);
+ }
+ }
+ spin_unlock(&gms->ms_asid_lock);
+}
+
+/*
+ * Flush the entire TLB on a chiplet.
+ */
+void gru_flush_all_tlb(struct gru_state *gru)
+{
+ struct gru_tlb_global_handle *tgh;
+
+ gru_dbg(grudev, "gid %d\n", gru->gs_gid);
+ tgh = get_lock_tgh_handle(gru);
+ tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0xffff);
+ get_unlock_tgh_handle(tgh);
+}
+
+/*
+ * MMUOPS notifier callout functions
+ */
+static int gru_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool blockable)
+{
+ struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+ ms_notifier);
+
+ STAT(mmu_invalidate_range);
+ atomic_inc(&gms->ms_range_active);
+ gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
+ start, end, atomic_read(&gms->ms_range_active));
+ gru_flush_tlb_range(gms, start, end - start);
+
+ return 0;
+}
+
+static void gru_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+ ms_notifier);
+
+ /* ..._and_test() provides needed barrier */
+ (void)atomic_dec_and_test(&gms->ms_range_active);
+
+ wake_up_all(&gms->ms_wait_queue);
+ gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end);
+}
+
+static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+ ms_notifier);
+
+ gms->ms_released = 1;
+ gru_dbg(grudev, "gms %p\n", gms);
+}
+
+
+static const struct mmu_notifier_ops gru_mmuops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
+ .invalidate_range_start = gru_invalidate_range_start,
+ .invalidate_range_end = gru_invalidate_range_end,
+ .release = gru_release,
+};
+
+/* Move this to the basic mmu_notifier file. But for now... */
+static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm,
+ const struct mmu_notifier_ops *ops)
+{
+ struct mmu_notifier *mn, *gru_mn = NULL;
+
+ if (mm->mmu_notifier_mm) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list,
+ hlist)
+ if (mn->ops == ops) {
+ gru_mn = mn;
+ break;
+ }
+ rcu_read_unlock();
+ }
+ return gru_mn;
+}
+
+struct gru_mm_struct *gru_register_mmu_notifier(void)
+{
+ struct gru_mm_struct *gms;
+ struct mmu_notifier *mn;
+ int err;
+
+ mn = mmu_find_ops(current->mm, &gru_mmuops);
+ if (mn) {
+ gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+ atomic_inc(&gms->ms_refcnt);
+ } else {
+ gms = kzalloc(sizeof(*gms), GFP_KERNEL);
+ if (!gms)
+ return ERR_PTR(-ENOMEM);
+ STAT(gms_alloc);
+ spin_lock_init(&gms->ms_asid_lock);
+ gms->ms_notifier.ops = &gru_mmuops;
+ atomic_set(&gms->ms_refcnt, 1);
+ init_waitqueue_head(&gms->ms_wait_queue);
+ err = __mmu_notifier_register(&gms->ms_notifier, current->mm);
+ if (err)
+ goto error;
+ }
+ if (gms)
+ gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
+ atomic_read(&gms->ms_refcnt));
+ return gms;
+error:
+ kfree(gms);
+ return ERR_PTR(err);
+}
+
+void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
+{
+ gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms,
+ atomic_read(&gms->ms_refcnt), gms->ms_released);
+ if (atomic_dec_return(&gms->ms_refcnt) == 0) {
+ if (!gms->ms_released)
+ mmu_notifier_unregister(&gms->ms_notifier, current->mm);
+ kfree(gms);
+ STAT(gms_free);
+ }
+}
+
+/*
+ * Setup TGH parameters. There are:
+ * - 24 TGH handles per GRU chiplet
+ * - a portion (MAX_LOCAL_TGH) of the handles are reserved for
+ * use by blade-local cpus
+ * - the rest are used by off-blade cpus. This usage is
+ * less frequent than blade-local usage.
+ *
+ * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
+ * has less tan or equal to 16 cpus, each cpu has a unique handle that it can
+ * use.
+ */
+#define MAX_LOCAL_TGH 16
+
+void gru_tgh_flush_init(struct gru_state *gru)
+{
+ int cpus, shift = 0, n;
+
+ cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);
+
+ /* n = cpus rounded up to next power of 2 */
+ if (cpus) {
+ n = 1 << fls(cpus - 1);
+
+ /*
+ * shift count for converting local cpu# to TGH index
+ * 0 if cpus <= MAX_LOCAL_TGH,
+ * 1 if cpus <= 2*MAX_LOCAL_TGH,
+ * etc
+ */
+ shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
+ }
+ gru->gs_tgh_local_shift = shift;
+
+ /* first starting TGH index to use for remote purges */
+ gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
+
+}