diff options
Diffstat (limited to 'drivers/misc/sgi-xp')
-rw-r--r-- | drivers/misc/sgi-xp/Makefile | 20 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xp.h | 368 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xp_main.c | 264 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xp_nofault.S | 35 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xp_sn2.c | 190 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xp_uv.c | 171 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpc.h | 1004 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpc_channel.c | 1011 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpc_main.c | 1373 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpc_partition.c | 540 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpc_sn2.c | 2459 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpc_uv.c | 1813 | ||||
-rw-r--r-- | drivers/misc/sgi-xp/xpnet.c | 596 |
13 files changed, 9844 insertions, 0 deletions
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile new file mode 100644 index 000000000..bbb622c19 --- /dev/null +++ b/drivers/misc/sgi-xp/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for SGI's XP devices. +# + +obj-$(CONFIG_SGI_XP) += xp.o +xp-y := xp_main.o +xp-$(CONFIG_IA64_SGI_SN2) += xp_sn2.o xp_nofault.o +xp-$(CONFIG_IA64_GENERIC) += xp_sn2.o xp_nofault.o +xp-$(CONFIG_IA64_SGI_UV) += xp_uv.o +xp-$(CONFIG_X86_64) += xp_uv.o + +obj-$(CONFIG_SGI_XP) += xpc.o +xpc-y := xpc_main.o xpc_channel.o xpc_partition.o +xpc-$(CONFIG_IA64_SGI_SN2) += xpc_sn2.o +xpc-$(CONFIG_IA64_GENERIC) += xpc_sn2.o +xpc-$(CONFIG_IA64_SGI_UV) += xpc_uv.o +xpc-$(CONFIG_X86_64) += xpc_uv.o + +obj-$(CONFIG_SGI_XP) += xpnet.o diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h new file mode 100644 index 000000000..b8069eec1 --- /dev/null +++ b/drivers/misc/sgi-xp/xp.h @@ -0,0 +1,368 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved. + */ + +/* + * External Cross Partition (XP) structures and defines. + */ + +#ifndef _DRIVERS_MISC_SGIXP_XP_H +#define _DRIVERS_MISC_SGIXP_XP_H + +#include <linux/mutex.h> + +#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV +#include <asm/uv/uv.h> +#define is_uv() is_uv_system() +#endif + +#ifndef is_uv +#define is_uv() 0 +#endif + +#if defined CONFIG_IA64 +#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */ +#define is_shub() ia64_platform_is("sn2") +#endif + +#ifndef is_shub1 +#define is_shub1() 0 +#endif + +#ifndef is_shub2 +#define is_shub2() 0 +#endif + +#ifndef is_shub +#define is_shub() 0 +#endif + +#ifdef USE_DBUG_ON +#define DBUG_ON(condition) BUG_ON(condition) +#else +#define DBUG_ON(condition) +#endif + +/* + * Define the maximum number of partitions the system can possibly support. + * It is based on the maximum number of hardware partitionable regions. The + * term 'region' in this context refers to the minimum number of nodes that + * can comprise an access protection grouping. The access protection is in + * regards to memory, IPI and IOI. + * + * The maximum number of hardware partitionable regions is equal to the + * maximum number of nodes in the entire system divided by the minimum number + * of nodes that comprise an access protection grouping. + */ +#define XP_MAX_NPARTITIONS_SN2 64 +#define XP_MAX_NPARTITIONS_UV 256 + +/* + * XPC establishes channel connections between the local partition and any + * other partition that is currently up. Over these channels, kernel-level + * `users' can communicate with their counterparts on the other partitions. + * + * If the need for additional channels arises, one can simply increase + * XPC_MAX_NCHANNELS accordingly. If the day should come where that number + * exceeds the absolute MAXIMUM number of channels possible (eight), then one + * will need to make changes to the XPC code to accommodate for this. + * + * The absolute maximum number of channels possible is limited to eight for + * performance reasons on sn2 hardware. The internal cross partition structures + * require sixteen bytes per channel, and eight allows all of this + * interface-shared info to fit in one 128-byte cacheline. + */ +#define XPC_MEM_CHANNEL 0 /* memory channel number */ +#define XPC_NET_CHANNEL 1 /* network channel number */ + +#define XPC_MAX_NCHANNELS 2 /* max #of channels allowed */ + +#if XPC_MAX_NCHANNELS > 8 +#error XPC_MAX_NCHANNELS exceeds absolute MAXIMUM possible. +#endif + +/* + * Define macro, XPC_MSG_SIZE(), is provided for the user + * that wants to fit as many msg entries as possible in a given memory size + * (e.g. a memory page). + */ +#define XPC_MSG_MAX_SIZE 128 +#define XPC_MSG_HDR_MAX_SIZE 16 +#define XPC_MSG_PAYLOAD_MAX_SIZE (XPC_MSG_MAX_SIZE - XPC_MSG_HDR_MAX_SIZE) + +#define XPC_MSG_SIZE(_payload_size) \ + ALIGN(XPC_MSG_HDR_MAX_SIZE + (_payload_size), \ + is_uv() ? 64 : 128) + + +/* + * Define the return values and values passed to user's callout functions. + * (It is important to add new value codes at the end just preceding + * xpUnknownReason, which must have the highest numerical value.) + */ +enum xp_retval { + xpSuccess = 0, + + xpNotConnected, /* 1: channel is not connected */ + xpConnected, /* 2: channel connected (opened) */ + xpRETIRED1, /* 3: (formerly xpDisconnected) */ + + xpMsgReceived, /* 4: message received */ + xpMsgDelivered, /* 5: message delivered and acknowledged */ + + xpRETIRED2, /* 6: (formerly xpTransferFailed) */ + + xpNoWait, /* 7: operation would require wait */ + xpRetry, /* 8: retry operation */ + xpTimeout, /* 9: timeout in xpc_allocate_msg_wait() */ + xpInterrupted, /* 10: interrupted wait */ + + xpUnequalMsgSizes, /* 11: message size disparity between sides */ + xpInvalidAddress, /* 12: invalid address */ + + xpNoMemory, /* 13: no memory available for XPC structures */ + xpLackOfResources, /* 14: insufficient resources for operation */ + xpUnregistered, /* 15: channel is not registered */ + xpAlreadyRegistered, /* 16: channel is already registered */ + + xpPartitionDown, /* 17: remote partition is down */ + xpNotLoaded, /* 18: XPC module is not loaded */ + xpUnloading, /* 19: this side is unloading XPC module */ + + xpBadMagic, /* 20: XPC MAGIC string not found */ + + xpReactivating, /* 21: remote partition was reactivated */ + + xpUnregistering, /* 22: this side is unregistering channel */ + xpOtherUnregistering, /* 23: other side is unregistering channel */ + + xpCloneKThread, /* 24: cloning kernel thread */ + xpCloneKThreadFailed, /* 25: cloning kernel thread failed */ + + xpNoHeartbeat, /* 26: remote partition has no heartbeat */ + + xpPioReadError, /* 27: PIO read error */ + xpPhysAddrRegFailed, /* 28: registration of phys addr range failed */ + + xpRETIRED3, /* 29: (formerly xpBteDirectoryError) */ + xpRETIRED4, /* 30: (formerly xpBtePoisonError) */ + xpRETIRED5, /* 31: (formerly xpBteWriteError) */ + xpRETIRED6, /* 32: (formerly xpBteAccessError) */ + xpRETIRED7, /* 33: (formerly xpBtePWriteError) */ + xpRETIRED8, /* 34: (formerly xpBtePReadError) */ + xpRETIRED9, /* 35: (formerly xpBteTimeOutError) */ + xpRETIRED10, /* 36: (formerly xpBteXtalkError) */ + xpRETIRED11, /* 37: (formerly xpBteNotAvailable) */ + xpRETIRED12, /* 38: (formerly xpBteUnmappedError) */ + + xpBadVersion, /* 39: bad version number */ + xpVarsNotSet, /* 40: the XPC variables are not set up */ + xpNoRsvdPageAddr, /* 41: unable to get rsvd page's phys addr */ + xpInvalidPartid, /* 42: invalid partition ID */ + xpLocalPartid, /* 43: local partition ID */ + + xpOtherGoingDown, /* 44: other side going down, reason unknown */ + xpSystemGoingDown, /* 45: system is going down, reason unknown */ + xpSystemHalt, /* 46: system is being halted */ + xpSystemReboot, /* 47: system is being rebooted */ + xpSystemPoweroff, /* 48: system is being powered off */ + + xpDisconnecting, /* 49: channel disconnecting (closing) */ + + xpOpenCloseError, /* 50: channel open/close protocol error */ + + xpDisconnected, /* 51: channel disconnected (closed) */ + + xpBteCopyError, /* 52: bte_copy() returned error */ + xpSalError, /* 53: sn SAL error */ + xpRsvdPageNotSet, /* 54: the reserved page is not set up */ + xpPayloadTooBig, /* 55: payload too large for message slot */ + + xpUnsupported, /* 56: unsupported functionality or resource */ + xpNeedMoreInfo, /* 57: more info is needed by SAL */ + + xpGruCopyError, /* 58: gru_copy_gru() returned error */ + xpGruSendMqError, /* 59: gru send message queue related error */ + + xpBadChannelNumber, /* 60: invalid channel number */ + xpBadMsgType, /* 61: invalid message type */ + xpBiosError, /* 62: BIOS error */ + + xpUnknownReason /* 63: unknown reason - must be last in enum */ +}; + +/* + * Define the callout function type used by XPC to update the user on + * connection activity and state changes via the user function registered + * by xpc_connect(). + * + * Arguments: + * + * reason - reason code. + * partid - partition ID associated with condition. + * ch_number - channel # associated with condition. + * data - pointer to optional data. + * key - pointer to optional user-defined value provided as the "key" + * argument to xpc_connect(). + * + * A reason code of xpConnected indicates that a connection has been + * established to the specified partition on the specified channel. The data + * argument indicates the max number of entries allowed in the message queue. + * + * A reason code of xpMsgReceived indicates that a XPC message arrived from + * the specified partition on the specified channel. The data argument + * specifies the address of the message's payload. The user must call + * xpc_received() when finished with the payload. + * + * All other reason codes indicate failure. The data argmument is NULL. + * When a failure reason code is received, one can assume that the channel + * is not connected. + */ +typedef void (*xpc_channel_func) (enum xp_retval reason, short partid, + int ch_number, void *data, void *key); + +/* + * Define the callout function type used by XPC to notify the user of + * messages received and delivered via the user function registered by + * xpc_send_notify(). + * + * Arguments: + * + * reason - reason code. + * partid - partition ID associated with condition. + * ch_number - channel # associated with condition. + * key - pointer to optional user-defined value provided as the "key" + * argument to xpc_send_notify(). + * + * A reason code of xpMsgDelivered indicates that the message was delivered + * to the intended recipient and that they have acknowledged its receipt by + * calling xpc_received(). + * + * All other reason codes indicate failure. + * + * NOTE: The user defined function must be callable by an interrupt handler + * and thus cannot block. + */ +typedef void (*xpc_notify_func) (enum xp_retval reason, short partid, + int ch_number, void *key); + +/* + * The following is a registration entry. There is a global array of these, + * one per channel. It is used to record the connection registration made + * by the users of XPC. As long as a registration entry exists, for any + * partition that comes up, XPC will attempt to establish a connection on + * that channel. Notification that a connection has been made will occur via + * the xpc_channel_func function. + * + * The 'func' field points to the function to call when aynchronous + * notification is required for such events as: a connection established/lost, + * or an incoming message received, or an error condition encountered. A + * non-NULL 'func' field indicates that there is an active registration for + * the channel. + */ +struct xpc_registration { + struct mutex mutex; + xpc_channel_func func; /* function to call */ + void *key; /* pointer to user's key */ + u16 nentries; /* #of msg entries in local msg queue */ + u16 entry_size; /* message queue's message entry size */ + u32 assigned_limit; /* limit on #of assigned kthreads */ + u32 idle_limit; /* limit on #of idle kthreads */ +} ____cacheline_aligned; + +#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL) + +/* the following are valid xpc_send() or xpc_send_notify() flags */ +#define XPC_WAIT 0 /* wait flag */ +#define XPC_NOWAIT 1 /* no wait flag */ + +struct xpc_interface { + void (*connect) (int); + void (*disconnect) (int); + enum xp_retval (*send) (short, int, u32, void *, u16); + enum xp_retval (*send_notify) (short, int, u32, void *, u16, + xpc_notify_func, void *); + void (*received) (short, int, void *); + enum xp_retval (*partid_to_nasids) (short, void *); +}; + +extern struct xpc_interface xpc_interface; + +extern void xpc_set_interface(void (*)(int), + void (*)(int), + enum xp_retval (*)(short, int, u32, void *, u16), + enum xp_retval (*)(short, int, u32, void *, u16, + xpc_notify_func, void *), + void (*)(short, int, void *), + enum xp_retval (*)(short, void *)); +extern void xpc_clear_interface(void); + +extern enum xp_retval xpc_connect(int, xpc_channel_func, void *, u16, + u16, u32, u32); +extern void xpc_disconnect(int); + +static inline enum xp_retval +xpc_send(short partid, int ch_number, u32 flags, void *payload, + u16 payload_size) +{ + if (!xpc_interface.send) + return xpNotLoaded; + + return xpc_interface.send(partid, ch_number, flags, payload, + payload_size); +} + +static inline enum xp_retval +xpc_send_notify(short partid, int ch_number, u32 flags, void *payload, + u16 payload_size, xpc_notify_func func, void *key) +{ + if (!xpc_interface.send_notify) + return xpNotLoaded; + + return xpc_interface.send_notify(partid, ch_number, flags, payload, + payload_size, func, key); +} + +static inline void +xpc_received(short partid, int ch_number, void *payload) +{ + if (xpc_interface.received) + xpc_interface.received(partid, ch_number, payload); +} + +static inline enum xp_retval +xpc_partid_to_nasids(short partid, void *nasids) +{ + if (!xpc_interface.partid_to_nasids) + return xpNotLoaded; + + return xpc_interface.partid_to_nasids(partid, nasids); +} + +extern short xp_max_npartitions; +extern short xp_partition_id; +extern u8 xp_region_size; + +extern unsigned long (*xp_pa) (void *); +extern unsigned long (*xp_socket_pa) (unsigned long); +extern enum xp_retval (*xp_remote_memcpy) (unsigned long, const unsigned long, + size_t); +extern int (*xp_cpu_to_nasid) (int); +extern enum xp_retval (*xp_expand_memprotect) (unsigned long, unsigned long); +extern enum xp_retval (*xp_restrict_memprotect) (unsigned long, unsigned long); + +extern u64 xp_nofault_PIOR_target; +extern int xp_nofault_PIOR(void *); +extern int xp_error_PIOR(void); + +extern struct device *xp; +extern enum xp_retval xp_init_sn2(void); +extern enum xp_retval xp_init_uv(void); +extern void xp_exit_sn2(void); +extern void xp_exit_uv(void); + +#endif /* _DRIVERS_MISC_SGIXP_XP_H */ diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c new file mode 100644 index 000000000..6d7f557fd --- /dev/null +++ b/drivers/misc/sgi-xp/xp_main.c @@ -0,0 +1,264 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition (XP) base. + * + * XP provides a base from which its users can interact + * with XPC, yet not be dependent on XPC. + * + */ + +#include <linux/module.h> +#include <linux/device.h> +#include "xp.h" + +/* define the XP debug device structures to be used with dev_dbg() et al */ + +struct device_driver xp_dbg_name = { + .name = "xp" +}; + +struct device xp_dbg_subname = { + .init_name = "", /* set to "" */ + .driver = &xp_dbg_name +}; + +struct device *xp = &xp_dbg_subname; + +/* max #of partitions possible */ +short xp_max_npartitions; +EXPORT_SYMBOL_GPL(xp_max_npartitions); + +short xp_partition_id; +EXPORT_SYMBOL_GPL(xp_partition_id); + +u8 xp_region_size; +EXPORT_SYMBOL_GPL(xp_region_size); + +unsigned long (*xp_pa) (void *addr); +EXPORT_SYMBOL_GPL(xp_pa); + +unsigned long (*xp_socket_pa) (unsigned long gpa); +EXPORT_SYMBOL_GPL(xp_socket_pa); + +enum xp_retval (*xp_remote_memcpy) (unsigned long dst_gpa, + const unsigned long src_gpa, size_t len); +EXPORT_SYMBOL_GPL(xp_remote_memcpy); + +int (*xp_cpu_to_nasid) (int cpuid); +EXPORT_SYMBOL_GPL(xp_cpu_to_nasid); + +enum xp_retval (*xp_expand_memprotect) (unsigned long phys_addr, + unsigned long size); +EXPORT_SYMBOL_GPL(xp_expand_memprotect); +enum xp_retval (*xp_restrict_memprotect) (unsigned long phys_addr, + unsigned long size); +EXPORT_SYMBOL_GPL(xp_restrict_memprotect); + +/* + * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level + * users of XPC. + */ +struct xpc_registration xpc_registrations[XPC_MAX_NCHANNELS]; +EXPORT_SYMBOL_GPL(xpc_registrations); + +/* + * Initialize the XPC interface to NULL to indicate that XPC isn't loaded. + */ +struct xpc_interface xpc_interface = { }; +EXPORT_SYMBOL_GPL(xpc_interface); + +/* + * XPC calls this when it (the XPC module) has been loaded. + */ +void +xpc_set_interface(void (*connect) (int), + void (*disconnect) (int), + enum xp_retval (*send) (short, int, u32, void *, u16), + enum xp_retval (*send_notify) (short, int, u32, void *, u16, + xpc_notify_func, void *), + void (*received) (short, int, void *), + enum xp_retval (*partid_to_nasids) (short, void *)) +{ + xpc_interface.connect = connect; + xpc_interface.disconnect = disconnect; + xpc_interface.send = send; + xpc_interface.send_notify = send_notify; + xpc_interface.received = received; + xpc_interface.partid_to_nasids = partid_to_nasids; +} +EXPORT_SYMBOL_GPL(xpc_set_interface); + +/* + * XPC calls this when it (the XPC module) is being unloaded. + */ +void +xpc_clear_interface(void) +{ + memset(&xpc_interface, 0, sizeof(xpc_interface)); +} +EXPORT_SYMBOL_GPL(xpc_clear_interface); + +/* + * Register for automatic establishment of a channel connection whenever + * a partition comes up. + * + * Arguments: + * + * ch_number - channel # to register for connection. + * func - function to call for asynchronous notification of channel + * state changes (i.e., connection, disconnection, error) and + * the arrival of incoming messages. + * key - pointer to optional user-defined value that gets passed back + * to the user on any callouts made to func. + * payload_size - size in bytes of the XPC message's payload area which + * contains a user-defined message. The user should make + * this large enough to hold their largest message. + * nentries - max #of XPC message entries a message queue can contain. + * The actual number, which is determined when a connection + * is established and may be less then requested, will be + * passed to the user via the xpConnected callout. + * assigned_limit - max number of kthreads allowed to be processing + * messages (per connection) at any given instant. + * idle_limit - max number of kthreads allowed to be idle at any given + * instant. + */ +enum xp_retval +xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size, + u16 nentries, u32 assigned_limit, u32 idle_limit) +{ + struct xpc_registration *registration; + + DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS); + DBUG_ON(payload_size == 0 || nentries == 0); + DBUG_ON(func == NULL); + DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit); + + if (XPC_MSG_SIZE(payload_size) > XPC_MSG_MAX_SIZE) + return xpPayloadTooBig; + + registration = &xpc_registrations[ch_number]; + + if (mutex_lock_interruptible(®istration->mutex) != 0) + return xpInterrupted; + + /* if XPC_CHANNEL_REGISTERED(ch_number) */ + if (registration->func != NULL) { + mutex_unlock(®istration->mutex); + return xpAlreadyRegistered; + } + + /* register the channel for connection */ + registration->entry_size = XPC_MSG_SIZE(payload_size); + registration->nentries = nentries; + registration->assigned_limit = assigned_limit; + registration->idle_limit = idle_limit; + registration->key = key; + registration->func = func; + + mutex_unlock(®istration->mutex); + + if (xpc_interface.connect) + xpc_interface.connect(ch_number); + + return xpSuccess; +} +EXPORT_SYMBOL_GPL(xpc_connect); + +/* + * Remove the registration for automatic connection of the specified channel + * when a partition comes up. + * + * Before returning this xpc_disconnect() will wait for all connections on the + * specified channel have been closed/torndown. So the caller can be assured + * that they will not be receiving any more callouts from XPC to their + * function registered via xpc_connect(). + * + * Arguments: + * + * ch_number - channel # to unregister. + */ +void +xpc_disconnect(int ch_number) +{ + struct xpc_registration *registration; + + DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS); + + registration = &xpc_registrations[ch_number]; + + /* + * We've decided not to make this a down_interruptible(), since we + * figured XPC's users will just turn around and call xpc_disconnect() + * again anyways, so we might as well wait, if need be. + */ + mutex_lock(®istration->mutex); + + /* if !XPC_CHANNEL_REGISTERED(ch_number) */ + if (registration->func == NULL) { + mutex_unlock(®istration->mutex); + return; + } + + /* remove the connection registration for the specified channel */ + registration->func = NULL; + registration->key = NULL; + registration->nentries = 0; + registration->entry_size = 0; + registration->assigned_limit = 0; + registration->idle_limit = 0; + + if (xpc_interface.disconnect) + xpc_interface.disconnect(ch_number); + + mutex_unlock(®istration->mutex); + + return; +} +EXPORT_SYMBOL_GPL(xpc_disconnect); + +int __init +xp_init(void) +{ + enum xp_retval ret; + int ch_number; + + /* initialize the connection registration mutex */ + for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) + mutex_init(&xpc_registrations[ch_number].mutex); + + if (is_shub()) + ret = xp_init_sn2(); + else if (is_uv()) + ret = xp_init_uv(); + else + ret = 0; + + if (ret != xpSuccess) + return ret; + + return 0; +} + +module_init(xp_init); + +void __exit +xp_exit(void) +{ + if (is_shub()) + xp_exit_sn2(); + else if (is_uv()) + xp_exit_uv(); +} + +module_exit(xp_exit); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION("Cross Partition (XP) base"); +MODULE_LICENSE("GPL"); diff --git a/drivers/misc/sgi-xp/xp_nofault.S b/drivers/misc/sgi-xp/xp_nofault.S new file mode 100644 index 000000000..e38d43319 --- /dev/null +++ b/drivers/misc/sgi-xp/xp_nofault.S @@ -0,0 +1,35 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * The xp_nofault_PIOR function takes a pointer to a remote PIO register + * and attempts to load and consume a value from it. This function + * will be registered as a nofault code block. In the event that the + * PIO read fails, the MCA handler will force the error to look + * corrected and vector to the xp_error_PIOR which will return an error. + * + * The definition of "consumption" and the time it takes for an MCA + * to surface is processor implementation specific. This code + * is sufficient on Itanium through the Montvale processor family. + * It may need to be adjusted for future processor implementations. + * + * extern int xp_nofault_PIOR(void *remote_register); + */ + + .global xp_nofault_PIOR +xp_nofault_PIOR: + mov r8=r0 // Stage a success return value + ld8.acq r9=[r32];; // PIO Read the specified register + adds r9=1,r9;; // Add to force consumption + srlz.i;; // Allow time for MCA to surface + br.ret.sptk.many b0;; // Return success + + .global xp_error_PIOR +xp_error_PIOR: + mov r8=1 // Return value of 1 + br.ret.sptk.many b0;; // Return failure diff --git a/drivers/misc/sgi-xp/xp_sn2.c b/drivers/misc/sgi-xp/xp_sn2.c new file mode 100644 index 000000000..d8e463f87 --- /dev/null +++ b/drivers/misc/sgi-xp/xp_sn2.c @@ -0,0 +1,190 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition (XP) sn2-based functions. + * + * Architecture specific implementation of common functions. + */ + +#include <linux/module.h> +#include <linux/device.h> +#include <asm/sn/bte.h> +#include <asm/sn/sn_sal.h> +#include "xp.h" + +/* + * The export of xp_nofault_PIOR needs to happen here since it is defined + * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is + * defined here. + */ +EXPORT_SYMBOL_GPL(xp_nofault_PIOR); + +u64 xp_nofault_PIOR_target; +EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target); + +/* + * Register a nofault code region which performs a cross-partition PIO read. + * If the PIO read times out, the MCA handler will consume the error and + * return to a kernel-provided instruction to indicate an error. This PIO read + * exists because it is guaranteed to timeout if the destination is down + * (amo operations do not timeout on at least some CPUs on Shubs <= v1.2, + * which unfortunately we have to work around). + */ +static enum xp_retval +xp_register_nofault_code_sn2(void) +{ + int ret; + u64 func_addr; + u64 err_func_addr; + + func_addr = *(u64 *)xp_nofault_PIOR; + err_func_addr = *(u64 *)xp_error_PIOR; + ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr, + 1, 1); + if (ret != 0) { + dev_err(xp, "can't register nofault code, error=%d\n", ret); + return xpSalError; + } + /* + * Setup the nofault PIO read target. (There is no special reason why + * SH_IPI_ACCESS was selected.) + */ + if (is_shub1()) + xp_nofault_PIOR_target = SH1_IPI_ACCESS; + else if (is_shub2()) + xp_nofault_PIOR_target = SH2_IPI_ACCESS0; + + return xpSuccess; +} + +static void +xp_unregister_nofault_code_sn2(void) +{ + u64 func_addr = *(u64 *)xp_nofault_PIOR; + u64 err_func_addr = *(u64 *)xp_error_PIOR; + + /* unregister the PIO read nofault code region */ + (void)sn_register_nofault_code(func_addr, err_func_addr, + err_func_addr, 1, 0); +} + +/* + * Convert a virtual memory address to a physical memory address. + */ +static unsigned long +xp_pa_sn2(void *addr) +{ + return __pa(addr); +} + +/* + * Convert a global physical to a socket physical address. + */ +static unsigned long +xp_socket_pa_sn2(unsigned long gpa) +{ + return gpa; +} + +/* + * Wrapper for bte_copy(). + * + * dst_pa - physical address of the destination of the transfer. + * src_pa - physical address of the source of the transfer. + * len - number of bytes to transfer from source to destination. + * + * Note: xp_remote_memcpy_sn2() should never be called while holding a spinlock. + */ +static enum xp_retval +xp_remote_memcpy_sn2(unsigned long dst_pa, const unsigned long src_pa, + size_t len) +{ + bte_result_t ret; + + ret = bte_copy(src_pa, dst_pa, len, (BTE_NOTIFY | BTE_WACQUIRE), NULL); + if (ret == BTE_SUCCESS) + return xpSuccess; + + if (is_shub2()) { + dev_err(xp, "bte_copy() on shub2 failed, error=0x%x dst_pa=" + "0x%016lx src_pa=0x%016lx len=%ld\\n", ret, dst_pa, + src_pa, len); + } else { + dev_err(xp, "bte_copy() failed, error=%d dst_pa=0x%016lx " + "src_pa=0x%016lx len=%ld\\n", ret, dst_pa, src_pa, len); + } + + return xpBteCopyError; +} + +static int +xp_cpu_to_nasid_sn2(int cpuid) +{ + return cpuid_to_nasid(cpuid); +} + +static enum xp_retval +xp_expand_memprotect_sn2(unsigned long phys_addr, unsigned long size) +{ + u64 nasid_array = 0; + int ret; + + ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1, + &nasid_array); + if (ret != 0) { + dev_err(xp, "sn_change_memprotect(,, " + "SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret); + return xpSalError; + } + return xpSuccess; +} + +static enum xp_retval +xp_restrict_memprotect_sn2(unsigned long phys_addr, unsigned long size) +{ + u64 nasid_array = 0; + int ret; + + ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0, + &nasid_array); + if (ret != 0) { + dev_err(xp, "sn_change_memprotect(,, " + "SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret); + return xpSalError; + } + return xpSuccess; +} + +enum xp_retval +xp_init_sn2(void) +{ + BUG_ON(!is_shub()); + + xp_max_npartitions = XP_MAX_NPARTITIONS_SN2; + xp_partition_id = sn_partition_id; + xp_region_size = sn_region_size; + + xp_pa = xp_pa_sn2; + xp_socket_pa = xp_socket_pa_sn2; + xp_remote_memcpy = xp_remote_memcpy_sn2; + xp_cpu_to_nasid = xp_cpu_to_nasid_sn2; + xp_expand_memprotect = xp_expand_memprotect_sn2; + xp_restrict_memprotect = xp_restrict_memprotect_sn2; + + return xp_register_nofault_code_sn2(); +} + +void +xp_exit_sn2(void) +{ + BUG_ON(!is_shub()); + + xp_unregister_nofault_code_sn2(); +} + diff --git a/drivers/misc/sgi-xp/xp_uv.c b/drivers/misc/sgi-xp/xp_uv.c new file mode 100644 index 000000000..a0d093274 --- /dev/null +++ b/drivers/misc/sgi-xp/xp_uv.c @@ -0,0 +1,171 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition (XP) uv-based functions. + * + * Architecture specific implementation of common functions. + * + */ + +#include <linux/device.h> +#include <asm/uv/uv_hub.h> +#if defined CONFIG_X86_64 +#include <asm/uv/bios.h> +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV +#include <asm/sn/sn_sal.h> +#endif +#include "../sgi-gru/grukservices.h" +#include "xp.h" + +/* + * Convert a virtual memory address to a physical memory address. + */ +static unsigned long +xp_pa_uv(void *addr) +{ + return uv_gpa(addr); +} + +/* + * Convert a global physical to socket physical address. + */ +static unsigned long +xp_socket_pa_uv(unsigned long gpa) +{ + return uv_gpa_to_soc_phys_ram(gpa); +} + +static enum xp_retval +xp_remote_mmr_read(unsigned long dst_gpa, const unsigned long src_gpa, + size_t len) +{ + int ret; + unsigned long *dst_va = __va(uv_gpa_to_soc_phys_ram(dst_gpa)); + + BUG_ON(!uv_gpa_in_mmr_space(src_gpa)); + BUG_ON(len != 8); + + ret = gru_read_gpa(dst_va, src_gpa); + if (ret == 0) + return xpSuccess; + + dev_err(xp, "gru_read_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx " + "len=%ld\n", dst_gpa, src_gpa, len); + return xpGruCopyError; +} + + +static enum xp_retval +xp_remote_memcpy_uv(unsigned long dst_gpa, const unsigned long src_gpa, + size_t len) +{ + int ret; + + if (uv_gpa_in_mmr_space(src_gpa)) + return xp_remote_mmr_read(dst_gpa, src_gpa, len); + + ret = gru_copy_gpa(dst_gpa, src_gpa, len); + if (ret == 0) + return xpSuccess; + + dev_err(xp, "gru_copy_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx " + "len=%ld\n", dst_gpa, src_gpa, len); + return xpGruCopyError; +} + +static int +xp_cpu_to_nasid_uv(int cpuid) +{ + /* ??? Is this same as sn2 nasid in mach/part bitmaps set up by SAL? */ + return UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpuid)); +} + +static enum xp_retval +xp_expand_memprotect_uv(unsigned long phys_addr, unsigned long size) +{ + int ret; + +#if defined CONFIG_X86_64 + ret = uv_bios_change_memprotect(phys_addr, size, UV_MEMPROT_ALLOW_RW); + if (ret != BIOS_STATUS_SUCCESS) { + dev_err(xp, "uv_bios_change_memprotect(,, " + "UV_MEMPROT_ALLOW_RW) failed, ret=%d\n", ret); + return xpBiosError; + } + +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + u64 nasid_array; + + ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1, + &nasid_array); + if (ret != 0) { + dev_err(xp, "sn_change_memprotect(,, " + "SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret); + return xpSalError; + } +#else + #error not a supported configuration +#endif + return xpSuccess; +} + +static enum xp_retval +xp_restrict_memprotect_uv(unsigned long phys_addr, unsigned long size) +{ + int ret; + +#if defined CONFIG_X86_64 + ret = uv_bios_change_memprotect(phys_addr, size, + UV_MEMPROT_RESTRICT_ACCESS); + if (ret != BIOS_STATUS_SUCCESS) { + dev_err(xp, "uv_bios_change_memprotect(,, " + "UV_MEMPROT_RESTRICT_ACCESS) failed, ret=%d\n", ret); + return xpBiosError; + } + +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + u64 nasid_array; + + ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0, + &nasid_array); + if (ret != 0) { + dev_err(xp, "sn_change_memprotect(,, " + "SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret); + return xpSalError; + } +#else + #error not a supported configuration +#endif + return xpSuccess; +} + +enum xp_retval +xp_init_uv(void) +{ + BUG_ON(!is_uv()); + + xp_max_npartitions = XP_MAX_NPARTITIONS_UV; + xp_partition_id = sn_partition_id; + xp_region_size = sn_region_size; + + xp_pa = xp_pa_uv; + xp_socket_pa = xp_socket_pa_uv; + xp_remote_memcpy = xp_remote_memcpy_uv; + xp_cpu_to_nasid = xp_cpu_to_nasid_uv; + xp_expand_memprotect = xp_expand_memprotect_uv; + xp_restrict_memprotect = xp_restrict_memprotect_uv; + + return xpSuccess; +} + +void +xp_exit_uv(void) +{ + BUG_ON(!is_uv()); +} diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h new file mode 100644 index 000000000..b94d5f767 --- /dev/null +++ b/drivers/misc/sgi-xp/xpc.h @@ -0,0 +1,1004 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition Communication (XPC) structures and macros. + */ + +#ifndef _DRIVERS_MISC_SGIXP_XPC_H +#define _DRIVERS_MISC_SGIXP_XPC_H + +#include <linux/wait.h> +#include <linux/completion.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include "xp.h" + +/* + * XPC Version numbers consist of a major and minor number. XPC can always + * talk to versions with same major #, and never talk to versions with a + * different major #. + */ +#define _XPC_VERSION(_maj, _min) (((_maj) << 4) | ((_min) & 0xf)) +#define XPC_VERSION_MAJOR(_v) ((_v) >> 4) +#define XPC_VERSION_MINOR(_v) ((_v) & 0xf) + +/* define frequency of the heartbeat and frequency how often it's checked */ +#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */ +#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */ + +/* define the process name of HB checker and the CPU it is pinned to */ +#define XPC_HB_CHECK_THREAD_NAME "xpc_hb" +#define XPC_HB_CHECK_CPU 0 + +/* define the process name of the discovery thread */ +#define XPC_DISCOVERY_THREAD_NAME "xpc_discovery" + +/* + * the reserved page + * + * SAL reserves one page of memory per partition for XPC. Though a full page + * in length (16384 bytes), its starting address is not page aligned, but it + * is cacheline aligned. The reserved page consists of the following: + * + * reserved page header + * + * The first two 64-byte cachelines of the reserved page contain the + * header (struct xpc_rsvd_page). Before SAL initialization has completed, + * SAL has set up the following fields of the reserved page header: + * SAL_signature, SAL_version, SAL_partid, and SAL_nasids_size. The + * other fields are set up by XPC. (xpc_rsvd_page points to the local + * partition's reserved page.) + * + * part_nasids mask + * mach_nasids mask + * + * SAL also sets up two bitmaps (or masks), one that reflects the actual + * nasids in this partition (part_nasids), and the other that reflects + * the actual nasids in the entire machine (mach_nasids). We're only + * interested in the even numbered nasids (which contain the processors + * and/or memory), so we only need half as many bits to represent the + * nasids. When mapping nasid to bit in a mask (or bit to nasid) be sure + * to either divide or multiply by 2. The part_nasids mask is located + * starting at the first cacheline following the reserved page header. The + * mach_nasids mask follows right after the part_nasids mask. The size in + * bytes of each mask is reflected by the reserved page header field + * 'SAL_nasids_size'. (Local partition's mask pointers are xpc_part_nasids + * and xpc_mach_nasids.) + * + * vars (ia64-sn2 only) + * vars part (ia64-sn2 only) + * + * Immediately following the mach_nasids mask are the XPC variables + * required by other partitions. First are those that are generic to all + * partitions (vars), followed on the next available cacheline by those + * which are partition specific (vars part). These are setup by XPC. + * (Local partition's vars pointers are xpc_vars and xpc_vars_part.) + * + * Note: Until 'ts_jiffies' is set non-zero, the partition XPC code has not been + * initialized. + */ +struct xpc_rsvd_page { + u64 SAL_signature; /* SAL: unique signature */ + u64 SAL_version; /* SAL: version */ + short SAL_partid; /* SAL: partition ID */ + short max_npartitions; /* value of XPC_MAX_PARTITIONS */ + u8 version; + u8 pad1[3]; /* align to next u64 in 1st 64-byte cacheline */ + unsigned long ts_jiffies; /* timestamp when rsvd pg was setup by XPC */ + union { + struct { + unsigned long vars_pa; /* phys addr */ + } sn2; + struct { + unsigned long heartbeat_gpa; /* phys addr */ + unsigned long activate_gru_mq_desc_gpa; /* phys addr */ + } uv; + } sn; + u64 pad2[9]; /* align to last u64 in 2nd 64-byte cacheline */ + u64 SAL_nasids_size; /* SAL: size of each nasid mask in bytes */ +}; + +#define XPC_RP_VERSION _XPC_VERSION(3, 0) /* version 3.0 of the reserved page */ + +/* + * Define the structures by which XPC variables can be exported to other + * partitions. (There are two: struct xpc_vars and struct xpc_vars_part) + */ + +/* + * The following structure describes the partition generic variables + * needed by other partitions in order to properly initialize. + * + * struct xpc_vars version number also applies to struct xpc_vars_part. + * Changes to either structure and/or related functionality should be + * reflected by incrementing either the major or minor version numbers + * of struct xpc_vars. + */ +struct xpc_vars_sn2 { + u8 version; + u64 heartbeat; + DECLARE_BITMAP(heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2); + u64 heartbeat_offline; /* if 0, heartbeat should be changing */ + int activate_IRQ_nasid; + int activate_IRQ_phys_cpuid; + unsigned long vars_part_pa; + unsigned long amos_page_pa;/* paddr of page of amos from MSPEC driver */ + struct amo *amos_page; /* vaddr of page of amos from MSPEC driver */ +}; + +#define XPC_V_VERSION _XPC_VERSION(3, 1) /* version 3.1 of the cross vars */ + +/* + * The following structure describes the per partition specific variables. + * + * An array of these structures, one per partition, will be defined. As a + * partition becomes active XPC will copy the array entry corresponding to + * itself from that partition. It is desirable that the size of this structure + * evenly divides into a 128-byte cacheline, such that none of the entries in + * this array crosses a 128-byte cacheline boundary. As it is now, each entry + * occupies 64-bytes. + */ +struct xpc_vars_part_sn2 { + u64 magic; + + unsigned long openclose_args_pa; /* phys addr of open and close args */ + unsigned long GPs_pa; /* physical address of Get/Put values */ + + unsigned long chctl_amo_pa; /* physical address of chctl flags' amo */ + + int notify_IRQ_nasid; /* nasid of where to send notify IRQs */ + int notify_IRQ_phys_cpuid; /* CPUID of where to send notify IRQs */ + + u8 nchannels; /* #of defined channels supported */ + + u8 reserved[23]; /* pad to a full 64 bytes */ +}; + +/* + * The vars_part MAGIC numbers play a part in the first contact protocol. + * + * MAGIC1 indicates that the per partition specific variables for a remote + * partition have been initialized by this partition. + * + * MAGIC2 indicates that this partition has pulled the remote partititions + * per partition variables that pertain to this partition. + */ +#define XPC_VP_MAGIC1_SN2 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */ +#define XPC_VP_MAGIC2_SN2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */ + +/* the reserved page sizes and offsets */ + +#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page)) +#define XPC_RP_VARS_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_vars_sn2)) + +#define XPC_RP_PART_NASIDS(_rp) ((unsigned long *)((u8 *)(_rp) + \ + XPC_RP_HEADER_SIZE)) +#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + \ + xpc_nasid_mask_nlongs) +#define XPC_RP_VARS(_rp) ((struct xpc_vars_sn2 *) \ + (XPC_RP_MACH_NASIDS(_rp) + \ + xpc_nasid_mask_nlongs)) + + +/* + * The following structure describes the partition's heartbeat info which + * will be periodically read by other partitions to determine whether this + * XPC is still 'alive'. + */ +struct xpc_heartbeat_uv { + unsigned long value; + unsigned long offline; /* if 0, heartbeat should be changing */ +}; + +/* + * Info pertinent to a GRU message queue using a watch list for irq generation. + */ +struct xpc_gru_mq_uv { + void *address; /* address of GRU message queue */ + unsigned int order; /* size of GRU message queue as a power of 2 */ + int irq; /* irq raised when message is received in mq */ + int mmr_blade; /* blade where watchlist was allocated from */ + unsigned long mmr_offset; /* offset of irq mmr located on mmr_blade */ + unsigned long mmr_value; /* value of irq mmr located on mmr_blade */ + int watchlist_num; /* number of watchlist allocatd by BIOS */ + void *gru_mq_desc; /* opaque structure used by the GRU driver */ +}; + +/* + * The activate_mq is used to send/receive GRU messages that affect XPC's + * partition active state and channel state. This is uv only. + */ +struct xpc_activate_mq_msghdr_uv { + unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */ + short partid; /* sender's partid */ + u8 act_state; /* sender's act_state at time msg sent */ + u8 type; /* message's type */ + unsigned long rp_ts_jiffies; /* timestamp of sender's rp setup by XPC */ +}; + +/* activate_mq defined message types */ +#define XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV 0 + +#define XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV 1 +#define XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV 2 + +#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV 3 +#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV 4 +#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV 5 +#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV 6 +#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV 7 + +#define XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV 8 +#define XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV 9 + +struct xpc_activate_mq_msg_uv { + struct xpc_activate_mq_msghdr_uv hdr; +}; + +struct xpc_activate_mq_msg_activate_req_uv { + struct xpc_activate_mq_msghdr_uv hdr; + unsigned long rp_gpa; + unsigned long heartbeat_gpa; + unsigned long activate_gru_mq_desc_gpa; +}; + +struct xpc_activate_mq_msg_deactivate_req_uv { + struct xpc_activate_mq_msghdr_uv hdr; + enum xp_retval reason; +}; + +struct xpc_activate_mq_msg_chctl_closerequest_uv { + struct xpc_activate_mq_msghdr_uv hdr; + short ch_number; + enum xp_retval reason; +}; + +struct xpc_activate_mq_msg_chctl_closereply_uv { + struct xpc_activate_mq_msghdr_uv hdr; + short ch_number; +}; + +struct xpc_activate_mq_msg_chctl_openrequest_uv { + struct xpc_activate_mq_msghdr_uv hdr; + short ch_number; + short entry_size; /* size of notify_mq's GRU messages */ + short local_nentries; /* ??? Is this needed? What is? */ +}; + +struct xpc_activate_mq_msg_chctl_openreply_uv { + struct xpc_activate_mq_msghdr_uv hdr; + short ch_number; + short remote_nentries; /* ??? Is this needed? What is? */ + short local_nentries; /* ??? Is this needed? What is? */ + unsigned long notify_gru_mq_desc_gpa; +}; + +struct xpc_activate_mq_msg_chctl_opencomplete_uv { + struct xpc_activate_mq_msghdr_uv hdr; + short ch_number; +}; + +/* + * Functions registered by add_timer() or called by kernel_thread() only + * allow for a single 64-bit argument. The following macros can be used to + * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from + * the passed argument. + */ +#define XPC_PACK_ARGS(_arg1, _arg2) \ + ((((u64)_arg1) & 0xffffffff) | \ + ((((u64)_arg2) & 0xffffffff) << 32)) + +#define XPC_UNPACK_ARG1(_args) (((u64)_args) & 0xffffffff) +#define XPC_UNPACK_ARG2(_args) ((((u64)_args) >> 32) & 0xffffffff) + +/* + * Define a Get/Put value pair (pointers) used with a message queue. + */ +struct xpc_gp_sn2 { + s64 get; /* Get value */ + s64 put; /* Put value */ +}; + +#define XPC_GP_SIZE \ + L1_CACHE_ALIGN(sizeof(struct xpc_gp_sn2) * XPC_MAX_NCHANNELS) + +/* + * Define a structure that contains arguments associated with opening and + * closing a channel. + */ +struct xpc_openclose_args { + u16 reason; /* reason why channel is closing */ + u16 entry_size; /* sizeof each message entry */ + u16 remote_nentries; /* #of message entries in remote msg queue */ + u16 local_nentries; /* #of message entries in local msg queue */ + unsigned long local_msgqueue_pa; /* phys addr of local message queue */ +}; + +#define XPC_OPENCLOSE_ARGS_SIZE \ + L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * \ + XPC_MAX_NCHANNELS) + + +/* + * Structures to define a fifo singly-linked list. + */ + +struct xpc_fifo_entry_uv { + struct xpc_fifo_entry_uv *next; +}; + +struct xpc_fifo_head_uv { + struct xpc_fifo_entry_uv *first; + struct xpc_fifo_entry_uv *last; + spinlock_t lock; + int n_entries; +}; + +/* + * Define a sn2 styled message. + * + * A user-defined message resides in the payload area. The max size of the + * payload is defined by the user via xpc_connect(). + * + * The size of a message entry (within a message queue) must be a 128-byte + * cacheline sized multiple in order to facilitate the BTE transfer of messages + * from one message queue to another. + */ +struct xpc_msg_sn2 { + u8 flags; /* FOR XPC INTERNAL USE ONLY */ + u8 reserved[7]; /* FOR XPC INTERNAL USE ONLY */ + s64 number; /* FOR XPC INTERNAL USE ONLY */ + + u64 payload; /* user defined portion of message */ +}; + +/* struct xpc_msg_sn2 flags */ + +#define XPC_M_SN2_DONE 0x01 /* msg has been received/consumed */ +#define XPC_M_SN2_READY 0x02 /* msg is ready to be sent */ +#define XPC_M_SN2_INTERRUPT 0x04 /* send interrupt when msg consumed */ + +/* + * The format of a uv XPC notify_mq GRU message is as follows: + * + * A user-defined message resides in the payload area. The max size of the + * payload is defined by the user via xpc_connect(). + * + * The size of a message (payload and header) sent via the GRU must be either 1 + * or 2 GRU_CACHE_LINE_BYTES in length. + */ + +struct xpc_notify_mq_msghdr_uv { + union { + unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */ + struct xpc_fifo_entry_uv next; /* FOR XPC INTERNAL USE ONLY */ + } u; + short partid; /* FOR XPC INTERNAL USE ONLY */ + u8 ch_number; /* FOR XPC INTERNAL USE ONLY */ + u8 size; /* FOR XPC INTERNAL USE ONLY */ + unsigned int msg_slot_number; /* FOR XPC INTERNAL USE ONLY */ +}; + +struct xpc_notify_mq_msg_uv { + struct xpc_notify_mq_msghdr_uv hdr; + unsigned long payload; +}; + +/* + * Define sn2's notify entry. + * + * This is used to notify a message's sender that their message was received + * and consumed by the intended recipient. + */ +struct xpc_notify_sn2 { + u8 type; /* type of notification */ + + /* the following two fields are only used if type == XPC_N_CALL */ + xpc_notify_func func; /* user's notify function */ + void *key; /* pointer to user's key */ +}; + +/* struct xpc_notify_sn2 type of notification */ + +#define XPC_N_CALL 0x01 /* notify function provided by user */ + +/* + * Define uv's version of the notify entry. It additionally is used to allocate + * a msg slot on the remote partition into which is copied a sent message. + */ +struct xpc_send_msg_slot_uv { + struct xpc_fifo_entry_uv next; + unsigned int msg_slot_number; + xpc_notify_func func; /* user's notify function */ + void *key; /* pointer to user's key */ +}; + +/* + * Define the structure that manages all the stuff required by a channel. In + * particular, they are used to manage the messages sent across the channel. + * + * This structure is private to a partition, and is NOT shared across the + * partition boundary. + * + * There is an array of these structures for each remote partition. It is + * allocated at the time a partition becomes active. The array contains one + * of these structures for each potential channel connection to that partition. + */ + +/* + * The following is sn2 only. + * + * Each channel structure manages two message queues (circular buffers). + * They are allocated at the time a channel connection is made. One of + * these message queues (local_msgqueue) holds the locally created messages + * that are destined for the remote partition. The other of these message + * queues (remote_msgqueue) is a locally cached copy of the remote partition's + * own local_msgqueue. + * + * The following is a description of the Get/Put pointers used to manage these + * two message queues. Consider the local_msgqueue to be on one partition + * and the remote_msgqueue to be its cached copy on another partition. A + * description of what each of the lettered areas contains is included. + * + * + * local_msgqueue remote_msgqueue + * + * |/////////| |/////////| + * w_remote_GP.get --> +---------+ |/////////| + * | F | |/////////| + * remote_GP.get --> +---------+ +---------+ <-- local_GP->get + * | | | | + * | | | E | + * | | | | + * | | +---------+ <-- w_local_GP.get + * | B | |/////////| + * | | |////D////| + * | | |/////////| + * | | +---------+ <-- w_remote_GP.put + * | | |////C////| + * local_GP->put --> +---------+ +---------+ <-- remote_GP.put + * | | |/////////| + * | A | |/////////| + * | | |/////////| + * w_local_GP.put --> +---------+ |/////////| + * |/////////| |/////////| + * + * + * ( remote_GP.[get|put] are cached copies of the remote + * partition's local_GP->[get|put], and thus their values can + * lag behind their counterparts on the remote partition. ) + * + * + * A - Messages that have been allocated, but have not yet been sent to the + * remote partition. + * + * B - Messages that have been sent, but have not yet been acknowledged by the + * remote partition as having been received. + * + * C - Area that needs to be prepared for the copying of sent messages, by + * the clearing of the message flags of any previously received messages. + * + * D - Area into which sent messages are to be copied from the remote + * partition's local_msgqueue and then delivered to their intended + * recipients. [ To allow for a multi-message copy, another pointer + * (next_msg_to_pull) has been added to keep track of the next message + * number needing to be copied (pulled). It chases after w_remote_GP.put. + * Any messages lying between w_local_GP.get and next_msg_to_pull have + * been copied and are ready to be delivered. ] + * + * E - Messages that have been copied and delivered, but have not yet been + * acknowledged by the recipient as having been received. + * + * F - Messages that have been acknowledged, but XPC has not yet notified the + * sender that the message was received by its intended recipient. + * This is also an area that needs to be prepared for the allocating of + * new messages, by the clearing of the message flags of the acknowledged + * messages. + */ + +struct xpc_channel_sn2 { + struct xpc_openclose_args *local_openclose_args; /* args passed on */ + /* opening or closing of channel */ + + void *local_msgqueue_base; /* base address of kmalloc'd space */ + struct xpc_msg_sn2 *local_msgqueue; /* local message queue */ + void *remote_msgqueue_base; /* base address of kmalloc'd space */ + struct xpc_msg_sn2 *remote_msgqueue; /* cached copy of remote */ + /* partition's local message queue */ + unsigned long remote_msgqueue_pa; /* phys addr of remote partition's */ + /* local message queue */ + + struct xpc_notify_sn2 *notify_queue;/* notify queue for messages sent */ + + /* various flavors of local and remote Get/Put values */ + + struct xpc_gp_sn2 *local_GP; /* local Get/Put values */ + struct xpc_gp_sn2 remote_GP; /* remote Get/Put values */ + struct xpc_gp_sn2 w_local_GP; /* working local Get/Put values */ + struct xpc_gp_sn2 w_remote_GP; /* working remote Get/Put values */ + s64 next_msg_to_pull; /* Put value of next msg to pull */ + + struct mutex msg_to_pull_mutex; /* next msg to pull serialization */ +}; + +struct xpc_channel_uv { + void *cached_notify_gru_mq_desc; /* remote partition's notify mq's */ + /* gru mq descriptor */ + + struct xpc_send_msg_slot_uv *send_msg_slots; + void *recv_msg_slots; /* each slot will hold a xpc_notify_mq_msg_uv */ + /* structure plus the user's payload */ + + struct xpc_fifo_head_uv msg_slot_free_list; + struct xpc_fifo_head_uv recv_msg_list; /* deliverable payloads */ +}; + +struct xpc_channel { + short partid; /* ID of remote partition connected */ + spinlock_t lock; /* lock for updating this structure */ + unsigned int flags; /* general flags */ + + enum xp_retval reason; /* reason why channel is disconnect'g */ + int reason_line; /* line# disconnect initiated from */ + + u16 number; /* channel # */ + + u16 entry_size; /* sizeof each msg entry */ + u16 local_nentries; /* #of msg entries in local msg queue */ + u16 remote_nentries; /* #of msg entries in remote msg queue */ + + atomic_t references; /* #of external references to queues */ + + atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */ + wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */ + + u8 delayed_chctl_flags; /* chctl flags received, but delayed */ + /* action until channel disconnected */ + + atomic_t n_to_notify; /* #of msg senders to notify */ + + xpc_channel_func func; /* user's channel function */ + void *key; /* pointer to user's key */ + + struct completion wdisconnect_wait; /* wait for channel disconnect */ + + /* kthread management related fields */ + + atomic_t kthreads_assigned; /* #of kthreads assigned to channel */ + u32 kthreads_assigned_limit; /* limit on #of kthreads assigned */ + atomic_t kthreads_idle; /* #of kthreads idle waiting for work */ + u32 kthreads_idle_limit; /* limit on #of kthreads idle */ + atomic_t kthreads_active; /* #of kthreads actively working */ + + wait_queue_head_t idle_wq; /* idle kthread wait queue */ + + union { + struct xpc_channel_sn2 sn2; + struct xpc_channel_uv uv; + } sn; + +} ____cacheline_aligned; + +/* struct xpc_channel flags */ + +#define XPC_C_WASCONNECTED 0x00000001 /* channel was connected */ + +#define XPC_C_ROPENCOMPLETE 0x00000002 /* remote open channel complete */ +#define XPC_C_OPENCOMPLETE 0x00000004 /* local open channel complete */ +#define XPC_C_ROPENREPLY 0x00000008 /* remote open channel reply */ +#define XPC_C_OPENREPLY 0x00000010 /* local open channel reply */ +#define XPC_C_ROPENREQUEST 0x00000020 /* remote open channel request */ +#define XPC_C_OPENREQUEST 0x00000040 /* local open channel request */ + +#define XPC_C_SETUP 0x00000080 /* channel's msgqueues are alloc'd */ +#define XPC_C_CONNECTEDCALLOUT 0x00000100 /* connected callout initiated */ +#define XPC_C_CONNECTEDCALLOUT_MADE \ + 0x00000200 /* connected callout completed */ +#define XPC_C_CONNECTED 0x00000400 /* local channel is connected */ +#define XPC_C_CONNECTING 0x00000800 /* channel is being connected */ + +#define XPC_C_RCLOSEREPLY 0x00001000 /* remote close channel reply */ +#define XPC_C_CLOSEREPLY 0x00002000 /* local close channel reply */ +#define XPC_C_RCLOSEREQUEST 0x00004000 /* remote close channel request */ +#define XPC_C_CLOSEREQUEST 0x00008000 /* local close channel request */ + +#define XPC_C_DISCONNECTED 0x00010000 /* channel is disconnected */ +#define XPC_C_DISCONNECTING 0x00020000 /* channel is being disconnected */ +#define XPC_C_DISCONNECTINGCALLOUT \ + 0x00040000 /* disconnecting callout initiated */ +#define XPC_C_DISCONNECTINGCALLOUT_MADE \ + 0x00080000 /* disconnecting callout completed */ +#define XPC_C_WDISCONNECT 0x00100000 /* waiting for channel disconnect */ + +/* + * The channel control flags (chctl) union consists of a 64-bit variable which + * is divided up into eight bytes, ordered from right to left. Byte zero + * pertains to channel 0, byte one to channel 1, and so on. Each channel's byte + * can have one or more of the chctl flags set in it. + */ + +union xpc_channel_ctl_flags { + u64 all_flags; + u8 flags[XPC_MAX_NCHANNELS]; +}; + +/* chctl flags */ +#define XPC_CHCTL_CLOSEREQUEST 0x01 +#define XPC_CHCTL_CLOSEREPLY 0x02 +#define XPC_CHCTL_OPENREQUEST 0x04 +#define XPC_CHCTL_OPENREPLY 0x08 +#define XPC_CHCTL_OPENCOMPLETE 0x10 +#define XPC_CHCTL_MSGREQUEST 0x20 + +#define XPC_OPENCLOSE_CHCTL_FLAGS \ + (XPC_CHCTL_CLOSEREQUEST | XPC_CHCTL_CLOSEREPLY | \ + XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY | \ + XPC_CHCTL_OPENCOMPLETE) +#define XPC_MSG_CHCTL_FLAGS XPC_CHCTL_MSGREQUEST + +static inline int +xpc_any_openclose_chctl_flags_set(union xpc_channel_ctl_flags *chctl) +{ + int ch_number; + + for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) { + if (chctl->flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS) + return 1; + } + return 0; +} + +static inline int +xpc_any_msg_chctl_flags_set(union xpc_channel_ctl_flags *chctl) +{ + int ch_number; + + for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) { + if (chctl->flags[ch_number] & XPC_MSG_CHCTL_FLAGS) + return 1; + } + return 0; +} + +/* + * Manage channels on a partition basis. There is one of these structures + * for each partition (a partition will never utilize the structure that + * represents itself). + */ + +struct xpc_partition_sn2 { + unsigned long remote_amos_page_pa; /* paddr of partition's amos page */ + int activate_IRQ_nasid; /* active partition's act/deact nasid */ + int activate_IRQ_phys_cpuid; /* active part's act/deact phys cpuid */ + + unsigned long remote_vars_pa; /* phys addr of partition's vars */ + unsigned long remote_vars_part_pa; /* paddr of partition's vars part */ + u8 remote_vars_version; /* version# of partition's vars */ + + void *local_GPs_base; /* base address of kmalloc'd space */ + struct xpc_gp_sn2 *local_GPs; /* local Get/Put values */ + void *remote_GPs_base; /* base address of kmalloc'd space */ + struct xpc_gp_sn2 *remote_GPs; /* copy of remote partition's local */ + /* Get/Put values */ + unsigned long remote_GPs_pa; /* phys addr of remote partition's local */ + /* Get/Put values */ + + void *local_openclose_args_base; /* base address of kmalloc'd space */ + struct xpc_openclose_args *local_openclose_args; /* local's args */ + unsigned long remote_openclose_args_pa; /* phys addr of remote's args */ + + int notify_IRQ_nasid; /* nasid of where to send notify IRQs */ + int notify_IRQ_phys_cpuid; /* CPUID of where to send notify IRQs */ + char notify_IRQ_owner[8]; /* notify IRQ's owner's name */ + + struct amo *remote_chctl_amo_va; /* addr of remote chctl flags' amo */ + struct amo *local_chctl_amo_va; /* address of chctl flags' amo */ + + struct timer_list dropped_notify_IRQ_timer; /* dropped IRQ timer */ +}; + +struct xpc_partition_uv { + unsigned long heartbeat_gpa; /* phys addr of partition's heartbeat */ + struct xpc_heartbeat_uv cached_heartbeat; /* cached copy of */ + /* partition's heartbeat */ + unsigned long activate_gru_mq_desc_gpa; /* phys addr of parititon's */ + /* activate mq's gru mq */ + /* descriptor */ + void *cached_activate_gru_mq_desc; /* cached copy of partition's */ + /* activate mq's gru mq descriptor */ + struct mutex cached_activate_gru_mq_desc_mutex; + spinlock_t flags_lock; /* protect updating of flags */ + unsigned int flags; /* general flags */ + u8 remote_act_state; /* remote partition's act_state */ + u8 act_state_req; /* act_state request from remote partition */ + enum xp_retval reason; /* reason for deactivate act_state request */ +}; + +/* struct xpc_partition_uv flags */ + +#define XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV 0x00000001 +#define XPC_P_ENGAGED_UV 0x00000002 + +/* struct xpc_partition_uv act_state change requests */ + +#define XPC_P_ASR_ACTIVATE_UV 0x01 +#define XPC_P_ASR_REACTIVATE_UV 0x02 +#define XPC_P_ASR_DEACTIVATE_UV 0x03 + +struct xpc_partition { + + /* XPC HB infrastructure */ + + u8 remote_rp_version; /* version# of partition's rsvd pg */ + unsigned long remote_rp_ts_jiffies; /* timestamp when rsvd pg setup */ + unsigned long remote_rp_pa; /* phys addr of partition's rsvd pg */ + u64 last_heartbeat; /* HB at last read */ + u32 activate_IRQ_rcvd; /* IRQs since activation */ + spinlock_t act_lock; /* protect updating of act_state */ + u8 act_state; /* from XPC HB viewpoint */ + enum xp_retval reason; /* reason partition is deactivating */ + int reason_line; /* line# deactivation initiated from */ + + unsigned long disengage_timeout; /* timeout in jiffies */ + struct timer_list disengage_timer; + + /* XPC infrastructure referencing and teardown control */ + + u8 setup_state; /* infrastructure setup state */ + wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */ + atomic_t references; /* #of references to infrastructure */ + + u8 nchannels; /* #of defined channels supported */ + atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */ + atomic_t nchannels_engaged; /* #of channels engaged with remote part */ + struct xpc_channel *channels; /* array of channel structures */ + + /* fields used for managing channel avialability and activity */ + + union xpc_channel_ctl_flags chctl; /* chctl flags yet to be processed */ + spinlock_t chctl_lock; /* chctl flags lock */ + + void *remote_openclose_args_base; /* base address of kmalloc'd space */ + struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */ + /* args */ + + /* channel manager related fields */ + + atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */ + wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */ + + union { + struct xpc_partition_sn2 sn2; + struct xpc_partition_uv uv; + } sn; + +} ____cacheline_aligned; + +struct xpc_arch_operations { + int (*setup_partitions) (void); + void (*teardown_partitions) (void); + void (*process_activate_IRQ_rcvd) (void); + enum xp_retval (*get_partition_rsvd_page_pa) + (void *, u64 *, unsigned long *, size_t *); + int (*setup_rsvd_page) (struct xpc_rsvd_page *); + + void (*allow_hb) (short); + void (*disallow_hb) (short); + void (*disallow_all_hbs) (void); + void (*increment_heartbeat) (void); + void (*offline_heartbeat) (void); + void (*online_heartbeat) (void); + void (*heartbeat_init) (void); + void (*heartbeat_exit) (void); + enum xp_retval (*get_remote_heartbeat) (struct xpc_partition *); + + void (*request_partition_activation) (struct xpc_rsvd_page *, + unsigned long, int); + void (*request_partition_reactivation) (struct xpc_partition *); + void (*request_partition_deactivation) (struct xpc_partition *); + void (*cancel_partition_deactivation_request) (struct xpc_partition *); + enum xp_retval (*setup_ch_structures) (struct xpc_partition *); + void (*teardown_ch_structures) (struct xpc_partition *); + + enum xp_retval (*make_first_contact) (struct xpc_partition *); + + u64 (*get_chctl_all_flags) (struct xpc_partition *); + void (*send_chctl_closerequest) (struct xpc_channel *, unsigned long *); + void (*send_chctl_closereply) (struct xpc_channel *, unsigned long *); + void (*send_chctl_openrequest) (struct xpc_channel *, unsigned long *); + void (*send_chctl_openreply) (struct xpc_channel *, unsigned long *); + void (*send_chctl_opencomplete) (struct xpc_channel *, unsigned long *); + void (*process_msg_chctl_flags) (struct xpc_partition *, int); + + enum xp_retval (*save_remote_msgqueue_pa) (struct xpc_channel *, + unsigned long); + + enum xp_retval (*setup_msg_structures) (struct xpc_channel *); + void (*teardown_msg_structures) (struct xpc_channel *); + + void (*indicate_partition_engaged) (struct xpc_partition *); + void (*indicate_partition_disengaged) (struct xpc_partition *); + void (*assume_partition_disengaged) (short); + int (*partition_engaged) (short); + int (*any_partition_engaged) (void); + + int (*n_of_deliverable_payloads) (struct xpc_channel *); + enum xp_retval (*send_payload) (struct xpc_channel *, u32, void *, + u16, u8, xpc_notify_func, void *); + void *(*get_deliverable_payload) (struct xpc_channel *); + void (*received_payload) (struct xpc_channel *, void *); + void (*notify_senders_of_disconnect) (struct xpc_channel *); +}; + +/* struct xpc_partition act_state values (for XPC HB) */ + +#define XPC_P_AS_INACTIVE 0x00 /* partition is not active */ +#define XPC_P_AS_ACTIVATION_REQ 0x01 /* created thread to activate */ +#define XPC_P_AS_ACTIVATING 0x02 /* activation thread started */ +#define XPC_P_AS_ACTIVE 0x03 /* xpc_partition_up() was called */ +#define XPC_P_AS_DEACTIVATING 0x04 /* partition deactivation initiated */ + +#define XPC_DEACTIVATE_PARTITION(_p, _reason) \ + xpc_deactivate_partition(__LINE__, (_p), (_reason)) + +/* struct xpc_partition setup_state values */ + +#define XPC_P_SS_UNSET 0x00 /* infrastructure was never setup */ +#define XPC_P_SS_SETUP 0x01 /* infrastructure is setup */ +#define XPC_P_SS_WTEARDOWN 0x02 /* waiting to teardown infrastructure */ +#define XPC_P_SS_TORNDOWN 0x03 /* infrastructure is torndown */ + +/* + * struct xpc_partition_sn2's dropped notify IRQ timer is set to wait the + * following interval #of seconds before checking for dropped notify IRQs. + * These can occur whenever an IRQ's associated amo write doesn't complete + * until after the IRQ was received. + */ +#define XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL (0.25 * HZ) + +/* number of seconds to wait for other partitions to disengage */ +#define XPC_DISENGAGE_DEFAULT_TIMELIMIT 90 + +/* interval in seconds to print 'waiting deactivation' messages */ +#define XPC_DEACTIVATE_PRINTMSG_INTERVAL 10 + +#define XPC_PARTID(_p) ((short)((_p) - &xpc_partitions[0])) + +/* found in xp_main.c */ +extern struct xpc_registration xpc_registrations[]; + +/* found in xpc_main.c */ +extern struct device *xpc_part; +extern struct device *xpc_chan; +extern struct xpc_arch_operations xpc_arch_ops; +extern int xpc_disengage_timelimit; +extern int xpc_disengage_timedout; +extern int xpc_activate_IRQ_rcvd; +extern spinlock_t xpc_activate_IRQ_rcvd_lock; +extern wait_queue_head_t xpc_activate_IRQ_wq; +extern void *xpc_kzalloc_cacheline_aligned(size_t, gfp_t, void **); +extern void xpc_activate_partition(struct xpc_partition *); +extern void xpc_activate_kthreads(struct xpc_channel *, int); +extern void xpc_create_kthreads(struct xpc_channel *, int, int); +extern void xpc_disconnect_wait(int); + +/* found in xpc_sn2.c */ +extern int xpc_init_sn2(void); +extern void xpc_exit_sn2(void); + +/* found in xpc_uv.c */ +extern int xpc_init_uv(void); +extern void xpc_exit_uv(void); + +/* found in xpc_partition.c */ +extern int xpc_exiting; +extern int xpc_nasid_mask_nlongs; +extern struct xpc_rsvd_page *xpc_rsvd_page; +extern unsigned long *xpc_mach_nasids; +extern struct xpc_partition *xpc_partitions; +extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **); +extern int xpc_setup_rsvd_page(void); +extern void xpc_teardown_rsvd_page(void); +extern int xpc_identify_activate_IRQ_sender(void); +extern int xpc_partition_disengaged(struct xpc_partition *); +extern enum xp_retval xpc_mark_partition_active(struct xpc_partition *); +extern void xpc_mark_partition_inactive(struct xpc_partition *); +extern void xpc_discovery(void); +extern enum xp_retval xpc_get_remote_rp(int, unsigned long *, + struct xpc_rsvd_page *, + unsigned long *); +extern void xpc_deactivate_partition(const int, struct xpc_partition *, + enum xp_retval); +extern enum xp_retval xpc_initiate_partid_to_nasids(short, void *); + +/* found in xpc_channel.c */ +extern void xpc_initiate_connect(int); +extern void xpc_initiate_disconnect(int); +extern enum xp_retval xpc_allocate_msg_wait(struct xpc_channel *); +extern enum xp_retval xpc_initiate_send(short, int, u32, void *, u16); +extern enum xp_retval xpc_initiate_send_notify(short, int, u32, void *, u16, + xpc_notify_func, void *); +extern void xpc_initiate_received(short, int, void *); +extern void xpc_process_sent_chctl_flags(struct xpc_partition *); +extern void xpc_connected_callout(struct xpc_channel *); +extern void xpc_deliver_payload(struct xpc_channel *); +extern void xpc_disconnect_channel(const int, struct xpc_channel *, + enum xp_retval, unsigned long *); +extern void xpc_disconnect_callout(struct xpc_channel *, enum xp_retval); +extern void xpc_partition_going_down(struct xpc_partition *, enum xp_retval); + +static inline void +xpc_wakeup_channel_mgr(struct xpc_partition *part) +{ + if (atomic_inc_return(&part->channel_mgr_requests) == 1) + wake_up(&part->channel_mgr_wq); +} + +/* + * These next two inlines are used to keep us from tearing down a channel's + * msg queues while a thread may be referencing them. + */ +static inline void +xpc_msgqueue_ref(struct xpc_channel *ch) +{ + atomic_inc(&ch->references); +} + +static inline void +xpc_msgqueue_deref(struct xpc_channel *ch) +{ + s32 refs = atomic_dec_return(&ch->references); + + DBUG_ON(refs < 0); + if (refs == 0) + xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]); +} + +#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \ + xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs) + +/* + * These two inlines are used to keep us from tearing down a partition's + * setup infrastructure while a thread may be referencing it. + */ +static inline void +xpc_part_deref(struct xpc_partition *part) +{ + s32 refs = atomic_dec_return(&part->references); + + DBUG_ON(refs < 0); + if (refs == 0 && part->setup_state == XPC_P_SS_WTEARDOWN) + wake_up(&part->teardown_wq); +} + +static inline int +xpc_part_ref(struct xpc_partition *part) +{ + int setup; + + atomic_inc(&part->references); + setup = (part->setup_state == XPC_P_SS_SETUP); + if (!setup) + xpc_part_deref(part); + + return setup; +} + +/* + * The following macro is to be used for the setting of the reason and + * reason_line fields in both the struct xpc_channel and struct xpc_partition + * structures. + */ +#define XPC_SET_REASON(_p, _reason, _line) \ + { \ + (_p)->reason = _reason; \ + (_p)->reason_line = _line; \ + } + +#endif /* _DRIVERS_MISC_SGIXP_XPC_H */ diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c new file mode 100644 index 000000000..05a890ce2 --- /dev/null +++ b/drivers/misc/sgi-xp/xpc_channel.c @@ -0,0 +1,1011 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition Communication (XPC) channel support. + * + * This is the part of XPC that manages the channels and + * sends/receives messages across them to/from other partitions. + * + */ + +#include <linux/device.h> +#include "xpc.h" + +/* + * Process a connect message from a remote partition. + * + * Note: xpc_process_connect() is expecting to be called with the + * spin_lock_irqsave held and will leave it locked upon return. + */ +static void +xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags) +{ + enum xp_retval ret; + + DBUG_ON(!spin_is_locked(&ch->lock)); + + if (!(ch->flags & XPC_C_OPENREQUEST) || + !(ch->flags & XPC_C_ROPENREQUEST)) { + /* nothing more to do for now */ + return; + } + DBUG_ON(!(ch->flags & XPC_C_CONNECTING)); + + if (!(ch->flags & XPC_C_SETUP)) { + spin_unlock_irqrestore(&ch->lock, *irq_flags); + ret = xpc_arch_ops.setup_msg_structures(ch); + spin_lock_irqsave(&ch->lock, *irq_flags); + + if (ret != xpSuccess) + XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags); + else + ch->flags |= XPC_C_SETUP; + + if (ch->flags & XPC_C_DISCONNECTING) + return; + } + + if (!(ch->flags & XPC_C_OPENREPLY)) { + ch->flags |= XPC_C_OPENREPLY; + xpc_arch_ops.send_chctl_openreply(ch, irq_flags); + } + + if (!(ch->flags & XPC_C_ROPENREPLY)) + return; + + if (!(ch->flags & XPC_C_OPENCOMPLETE)) { + ch->flags |= (XPC_C_OPENCOMPLETE | XPC_C_CONNECTED); + xpc_arch_ops.send_chctl_opencomplete(ch, irq_flags); + } + + if (!(ch->flags & XPC_C_ROPENCOMPLETE)) + return; + + dev_info(xpc_chan, "channel %d to partition %d connected\n", + ch->number, ch->partid); + + ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */ +} + +/* + * spin_lock_irqsave() is expected to be held on entry. + */ +static void +xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_partition *part = &xpc_partitions[ch->partid]; + u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED); + + DBUG_ON(!spin_is_locked(&ch->lock)); + + if (!(ch->flags & XPC_C_DISCONNECTING)) + return; + + DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST)); + + /* make sure all activity has settled down first */ + + if (atomic_read(&ch->kthreads_assigned) > 0 || + atomic_read(&ch->references) > 0) { + return; + } + DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) && + !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE)); + + if (part->act_state == XPC_P_AS_DEACTIVATING) { + /* can't proceed until the other side disengages from us */ + if (xpc_arch_ops.partition_engaged(ch->partid)) + return; + + } else { + + /* as long as the other side is up do the full protocol */ + + if (!(ch->flags & XPC_C_RCLOSEREQUEST)) + return; + + if (!(ch->flags & XPC_C_CLOSEREPLY)) { + ch->flags |= XPC_C_CLOSEREPLY; + xpc_arch_ops.send_chctl_closereply(ch, irq_flags); + } + + if (!(ch->flags & XPC_C_RCLOSEREPLY)) + return; + } + + /* wake those waiting for notify completion */ + if (atomic_read(&ch->n_to_notify) > 0) { + /* we do callout while holding ch->lock, callout can't block */ + xpc_arch_ops.notify_senders_of_disconnect(ch); + } + + /* both sides are disconnected now */ + + if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) { + spin_unlock_irqrestore(&ch->lock, *irq_flags); + xpc_disconnect_callout(ch, xpDisconnected); + spin_lock_irqsave(&ch->lock, *irq_flags); + } + + DBUG_ON(atomic_read(&ch->n_to_notify) != 0); + + /* it's now safe to free the channel's message queues */ + xpc_arch_ops.teardown_msg_structures(ch); + + ch->func = NULL; + ch->key = NULL; + ch->entry_size = 0; + ch->local_nentries = 0; + ch->remote_nentries = 0; + ch->kthreads_assigned_limit = 0; + ch->kthreads_idle_limit = 0; + + /* + * Mark the channel disconnected and clear all other flags, including + * XPC_C_SETUP (because of call to + * xpc_arch_ops.teardown_msg_structures()) but not including + * XPC_C_WDISCONNECT (if it was set). + */ + ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT)); + + atomic_dec(&part->nchannels_active); + + if (channel_was_connected) { + dev_info(xpc_chan, "channel %d to partition %d disconnected, " + "reason=%d\n", ch->number, ch->partid, ch->reason); + } + + if (ch->flags & XPC_C_WDISCONNECT) { + /* we won't lose the CPU since we're holding ch->lock */ + complete(&ch->wdisconnect_wait); + } else if (ch->delayed_chctl_flags) { + if (part->act_state != XPC_P_AS_DEACTIVATING) { + /* time to take action on any delayed chctl flags */ + spin_lock(&part->chctl_lock); + part->chctl.flags[ch->number] |= + ch->delayed_chctl_flags; + spin_unlock(&part->chctl_lock); + } + ch->delayed_chctl_flags = 0; + } +} + +/* + * Process a change in the channel's remote connection state. + */ +static void +xpc_process_openclose_chctl_flags(struct xpc_partition *part, int ch_number, + u8 chctl_flags) +{ + unsigned long irq_flags; + struct xpc_openclose_args *args = + &part->remote_openclose_args[ch_number]; + struct xpc_channel *ch = &part->channels[ch_number]; + enum xp_retval reason; + enum xp_retval ret; + int create_kthread = 0; + + spin_lock_irqsave(&ch->lock, irq_flags); + +again: + + if ((ch->flags & XPC_C_DISCONNECTED) && + (ch->flags & XPC_C_WDISCONNECT)) { + /* + * Delay processing chctl flags until thread waiting disconnect + * has had a chance to see that the channel is disconnected. + */ + ch->delayed_chctl_flags |= chctl_flags; + goto out; + } + + if (chctl_flags & XPC_CHCTL_CLOSEREQUEST) { + + dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREQUEST (reason=%d) received " + "from partid=%d, channel=%d\n", args->reason, + ch->partid, ch->number); + + /* + * If RCLOSEREQUEST is set, we're probably waiting for + * RCLOSEREPLY. We should find it and a ROPENREQUEST packed + * with this RCLOSEREQUEST in the chctl_flags. + */ + + if (ch->flags & XPC_C_RCLOSEREQUEST) { + DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING)); + DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST)); + DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY)); + DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY); + + DBUG_ON(!(chctl_flags & XPC_CHCTL_CLOSEREPLY)); + chctl_flags &= ~XPC_CHCTL_CLOSEREPLY; + ch->flags |= XPC_C_RCLOSEREPLY; + + /* both sides have finished disconnecting */ + xpc_process_disconnect(ch, &irq_flags); + DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED)); + goto again; + } + + if (ch->flags & XPC_C_DISCONNECTED) { + if (!(chctl_flags & XPC_CHCTL_OPENREQUEST)) { + if (part->chctl.flags[ch_number] & + XPC_CHCTL_OPENREQUEST) { + + DBUG_ON(ch->delayed_chctl_flags != 0); + spin_lock(&part->chctl_lock); + part->chctl.flags[ch_number] |= + XPC_CHCTL_CLOSEREQUEST; + spin_unlock(&part->chctl_lock); + } + goto out; + } + + XPC_SET_REASON(ch, 0, 0); + ch->flags &= ~XPC_C_DISCONNECTED; + + atomic_inc(&part->nchannels_active); + ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST); + } + + chctl_flags &= ~(XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY | + XPC_CHCTL_OPENCOMPLETE); + + /* + * The meaningful CLOSEREQUEST connection state fields are: + * reason = reason connection is to be closed + */ + + ch->flags |= XPC_C_RCLOSEREQUEST; + + if (!(ch->flags & XPC_C_DISCONNECTING)) { + reason = args->reason; + if (reason <= xpSuccess || reason > xpUnknownReason) + reason = xpUnknownReason; + else if (reason == xpUnregistering) + reason = xpOtherUnregistering; + + XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags); + + DBUG_ON(chctl_flags & XPC_CHCTL_CLOSEREPLY); + goto out; + } + + xpc_process_disconnect(ch, &irq_flags); + } + + if (chctl_flags & XPC_CHCTL_CLOSEREPLY) { + + dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREPLY received from partid=" + "%d, channel=%d\n", ch->partid, ch->number); + + if (ch->flags & XPC_C_DISCONNECTED) { + DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING); + goto out; + } + + DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST)); + + if (!(ch->flags & XPC_C_RCLOSEREQUEST)) { + if (part->chctl.flags[ch_number] & + XPC_CHCTL_CLOSEREQUEST) { + + DBUG_ON(ch->delayed_chctl_flags != 0); + spin_lock(&part->chctl_lock); + part->chctl.flags[ch_number] |= + XPC_CHCTL_CLOSEREPLY; + spin_unlock(&part->chctl_lock); + } + goto out; + } + + ch->flags |= XPC_C_RCLOSEREPLY; + + if (ch->flags & XPC_C_CLOSEREPLY) { + /* both sides have finished disconnecting */ + xpc_process_disconnect(ch, &irq_flags); + } + } + + if (chctl_flags & XPC_CHCTL_OPENREQUEST) { + + dev_dbg(xpc_chan, "XPC_CHCTL_OPENREQUEST (entry_size=%d, " + "local_nentries=%d) received from partid=%d, " + "channel=%d\n", args->entry_size, args->local_nentries, + ch->partid, ch->number); + + if (part->act_state == XPC_P_AS_DEACTIVATING || + (ch->flags & XPC_C_ROPENREQUEST)) { + goto out; + } + + if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) { + ch->delayed_chctl_flags |= XPC_CHCTL_OPENREQUEST; + goto out; + } + DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED | + XPC_C_OPENREQUEST))); + DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY | + XPC_C_OPENREPLY | XPC_C_CONNECTED)); + + /* + * The meaningful OPENREQUEST connection state fields are: + * entry_size = size of channel's messages in bytes + * local_nentries = remote partition's local_nentries + */ + if (args->entry_size == 0 || args->local_nentries == 0) { + /* assume OPENREQUEST was delayed by mistake */ + goto out; + } + + ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING); + ch->remote_nentries = args->local_nentries; + + if (ch->flags & XPC_C_OPENREQUEST) { + if (args->entry_size != ch->entry_size) { + XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes, + &irq_flags); + goto out; + } + } else { + ch->entry_size = args->entry_size; + + XPC_SET_REASON(ch, 0, 0); + ch->flags &= ~XPC_C_DISCONNECTED; + + atomic_inc(&part->nchannels_active); + } + + xpc_process_connect(ch, &irq_flags); + } + + if (chctl_flags & XPC_CHCTL_OPENREPLY) { + + dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY (local_msgqueue_pa=" + "0x%lx, local_nentries=%d, remote_nentries=%d) " + "received from partid=%d, channel=%d\n", + args->local_msgqueue_pa, args->local_nentries, + args->remote_nentries, ch->partid, ch->number); + + if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) + goto out; + + if (!(ch->flags & XPC_C_OPENREQUEST)) { + XPC_DISCONNECT_CHANNEL(ch, xpOpenCloseError, + &irq_flags); + goto out; + } + + DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST)); + DBUG_ON(ch->flags & XPC_C_CONNECTED); + + /* + * The meaningful OPENREPLY connection state fields are: + * local_msgqueue_pa = physical address of remote + * partition's local_msgqueue + * local_nentries = remote partition's local_nentries + * remote_nentries = remote partition's remote_nentries + */ + DBUG_ON(args->local_msgqueue_pa == 0); + DBUG_ON(args->local_nentries == 0); + DBUG_ON(args->remote_nentries == 0); + + ret = xpc_arch_ops.save_remote_msgqueue_pa(ch, + args->local_msgqueue_pa); + if (ret != xpSuccess) { + XPC_DISCONNECT_CHANNEL(ch, ret, &irq_flags); + goto out; + } + ch->flags |= XPC_C_ROPENREPLY; + + if (args->local_nentries < ch->remote_nentries) { + dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new " + "remote_nentries=%d, old remote_nentries=%d, " + "partid=%d, channel=%d\n", + args->local_nentries, ch->remote_nentries, + ch->partid, ch->number); + + ch->remote_nentries = args->local_nentries; + } + if (args->remote_nentries < ch->local_nentries) { + dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new " + "local_nentries=%d, old local_nentries=%d, " + "partid=%d, channel=%d\n", + args->remote_nentries, ch->local_nentries, + ch->partid, ch->number); + + ch->local_nentries = args->remote_nentries; + } + + xpc_process_connect(ch, &irq_flags); + } + + if (chctl_flags & XPC_CHCTL_OPENCOMPLETE) { + + dev_dbg(xpc_chan, "XPC_CHCTL_OPENCOMPLETE received from " + "partid=%d, channel=%d\n", ch->partid, ch->number); + + if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) + goto out; + + if (!(ch->flags & XPC_C_OPENREQUEST) || + !(ch->flags & XPC_C_OPENREPLY)) { + XPC_DISCONNECT_CHANNEL(ch, xpOpenCloseError, + &irq_flags); + goto out; + } + + DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST)); + DBUG_ON(!(ch->flags & XPC_C_ROPENREPLY)); + DBUG_ON(!(ch->flags & XPC_C_CONNECTED)); + + ch->flags |= XPC_C_ROPENCOMPLETE; + + xpc_process_connect(ch, &irq_flags); + create_kthread = 1; + } + +out: + spin_unlock_irqrestore(&ch->lock, irq_flags); + + if (create_kthread) + xpc_create_kthreads(ch, 1, 0); +} + +/* + * Attempt to establish a channel connection to a remote partition. + */ +static enum xp_retval +xpc_connect_channel(struct xpc_channel *ch) +{ + unsigned long irq_flags; + struct xpc_registration *registration = &xpc_registrations[ch->number]; + + if (mutex_trylock(®istration->mutex) == 0) + return xpRetry; + + if (!XPC_CHANNEL_REGISTERED(ch->number)) { + mutex_unlock(®istration->mutex); + return xpUnregistered; + } + + spin_lock_irqsave(&ch->lock, irq_flags); + + DBUG_ON(ch->flags & XPC_C_CONNECTED); + DBUG_ON(ch->flags & XPC_C_OPENREQUEST); + + if (ch->flags & XPC_C_DISCONNECTING) { + spin_unlock_irqrestore(&ch->lock, irq_flags); + mutex_unlock(®istration->mutex); + return ch->reason; + } + + /* add info from the channel connect registration to the channel */ + + ch->kthreads_assigned_limit = registration->assigned_limit; + ch->kthreads_idle_limit = registration->idle_limit; + DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0); + DBUG_ON(atomic_read(&ch->kthreads_idle) != 0); + DBUG_ON(atomic_read(&ch->kthreads_active) != 0); + + ch->func = registration->func; + DBUG_ON(registration->func == NULL); + ch->key = registration->key; + + ch->local_nentries = registration->nentries; + + if (ch->flags & XPC_C_ROPENREQUEST) { + if (registration->entry_size != ch->entry_size) { + /* the local and remote sides aren't the same */ + + /* + * Because XPC_DISCONNECT_CHANNEL() can block we're + * forced to up the registration sema before we unlock + * the channel lock. But that's okay here because we're + * done with the part that required the registration + * sema. XPC_DISCONNECT_CHANNEL() requires that the + * channel lock be locked and will unlock and relock + * the channel lock as needed. + */ + mutex_unlock(®istration->mutex); + XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes, + &irq_flags); + spin_unlock_irqrestore(&ch->lock, irq_flags); + return xpUnequalMsgSizes; + } + } else { + ch->entry_size = registration->entry_size; + + XPC_SET_REASON(ch, 0, 0); + ch->flags &= ~XPC_C_DISCONNECTED; + + atomic_inc(&xpc_partitions[ch->partid].nchannels_active); + } + + mutex_unlock(®istration->mutex); + + /* initiate the connection */ + + ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING); + xpc_arch_ops.send_chctl_openrequest(ch, &irq_flags); + + xpc_process_connect(ch, &irq_flags); + + spin_unlock_irqrestore(&ch->lock, irq_flags); + + return xpSuccess; +} + +void +xpc_process_sent_chctl_flags(struct xpc_partition *part) +{ + unsigned long irq_flags; + union xpc_channel_ctl_flags chctl; + struct xpc_channel *ch; + int ch_number; + u32 ch_flags; + + chctl.all_flags = xpc_arch_ops.get_chctl_all_flags(part); + + /* + * Initiate channel connections for registered channels. + * + * For each connected channel that has pending messages activate idle + * kthreads and/or create new kthreads as needed. + */ + + for (ch_number = 0; ch_number < part->nchannels; ch_number++) { + ch = &part->channels[ch_number]; + + /* + * Process any open or close related chctl flags, and then deal + * with connecting or disconnecting the channel as required. + */ + + if (chctl.flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS) { + xpc_process_openclose_chctl_flags(part, ch_number, + chctl.flags[ch_number]); + } + + ch_flags = ch->flags; /* need an atomic snapshot of flags */ + + if (ch_flags & XPC_C_DISCONNECTING) { + spin_lock_irqsave(&ch->lock, irq_flags); + xpc_process_disconnect(ch, &irq_flags); + spin_unlock_irqrestore(&ch->lock, irq_flags); + continue; + } + + if (part->act_state == XPC_P_AS_DEACTIVATING) + continue; + + if (!(ch_flags & XPC_C_CONNECTED)) { + if (!(ch_flags & XPC_C_OPENREQUEST)) { + DBUG_ON(ch_flags & XPC_C_SETUP); + (void)xpc_connect_channel(ch); + } + continue; + } + + /* + * Process any message related chctl flags, this may involve + * the activation of kthreads to deliver any pending messages + * sent from the other partition. + */ + + if (chctl.flags[ch_number] & XPC_MSG_CHCTL_FLAGS) + xpc_arch_ops.process_msg_chctl_flags(part, ch_number); + } +} + +/* + * XPC's heartbeat code calls this function to inform XPC that a partition is + * going down. XPC responds by tearing down the XPartition Communication + * infrastructure used for the just downed partition. + * + * XPC's heartbeat code will never call this function and xpc_partition_up() + * at the same time. Nor will it ever make multiple calls to either function + * at the same time. + */ +void +xpc_partition_going_down(struct xpc_partition *part, enum xp_retval reason) +{ + unsigned long irq_flags; + int ch_number; + struct xpc_channel *ch; + + dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n", + XPC_PARTID(part), reason); + + if (!xpc_part_ref(part)) { + /* infrastructure for this partition isn't currently set up */ + return; + } + + /* disconnect channels associated with the partition going down */ + + for (ch_number = 0; ch_number < part->nchannels; ch_number++) { + ch = &part->channels[ch_number]; + + xpc_msgqueue_ref(ch); + spin_lock_irqsave(&ch->lock, irq_flags); + + XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags); + + spin_unlock_irqrestore(&ch->lock, irq_flags); + xpc_msgqueue_deref(ch); + } + + xpc_wakeup_channel_mgr(part); + + xpc_part_deref(part); +} + +/* + * Called by XP at the time of channel connection registration to cause + * XPC to establish connections to all currently active partitions. + */ +void +xpc_initiate_connect(int ch_number) +{ + short partid; + struct xpc_partition *part; + + DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS); + + for (partid = 0; partid < xp_max_npartitions; partid++) { + part = &xpc_partitions[partid]; + + if (xpc_part_ref(part)) { + /* + * Initiate the establishment of a connection on the + * newly registered channel to the remote partition. + */ + xpc_wakeup_channel_mgr(part); + xpc_part_deref(part); + } + } +} + +void +xpc_connected_callout(struct xpc_channel *ch) +{ + /* let the registerer know that a connection has been established */ + + if (ch->func != NULL) { + dev_dbg(xpc_chan, "ch->func() called, reason=xpConnected, " + "partid=%d, channel=%d\n", ch->partid, ch->number); + + ch->func(xpConnected, ch->partid, ch->number, + (void *)(u64)ch->local_nentries, ch->key); + + dev_dbg(xpc_chan, "ch->func() returned, reason=xpConnected, " + "partid=%d, channel=%d\n", ch->partid, ch->number); + } +} + +/* + * Called by XP at the time of channel connection unregistration to cause + * XPC to teardown all current connections for the specified channel. + * + * Before returning xpc_initiate_disconnect() will wait until all connections + * on the specified channel have been closed/torndown. So the caller can be + * assured that they will not be receiving any more callouts from XPC to the + * function they registered via xpc_connect(). + * + * Arguments: + * + * ch_number - channel # to unregister. + */ +void +xpc_initiate_disconnect(int ch_number) +{ + unsigned long irq_flags; + short partid; + struct xpc_partition *part; + struct xpc_channel *ch; + + DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS); + + /* initiate the channel disconnect for every active partition */ + for (partid = 0; partid < xp_max_npartitions; partid++) { + part = &xpc_partitions[partid]; + + if (xpc_part_ref(part)) { + ch = &part->channels[ch_number]; + xpc_msgqueue_ref(ch); + + spin_lock_irqsave(&ch->lock, irq_flags); + + if (!(ch->flags & XPC_C_DISCONNECTED)) { + ch->flags |= XPC_C_WDISCONNECT; + + XPC_DISCONNECT_CHANNEL(ch, xpUnregistering, + &irq_flags); + } + + spin_unlock_irqrestore(&ch->lock, irq_flags); + + xpc_msgqueue_deref(ch); + xpc_part_deref(part); + } + } + + xpc_disconnect_wait(ch_number); +} + +/* + * To disconnect a channel, and reflect it back to all who may be waiting. + * + * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by + * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by + * xpc_disconnect_wait(). + * + * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN. + */ +void +xpc_disconnect_channel(const int line, struct xpc_channel *ch, + enum xp_retval reason, unsigned long *irq_flags) +{ + u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED); + + DBUG_ON(!spin_is_locked(&ch->lock)); + + if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED)) + return; + + DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED))); + + dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n", + reason, line, ch->partid, ch->number); + + XPC_SET_REASON(ch, reason, line); + + ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING); + /* some of these may not have been set */ + ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY | + XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY | + XPC_C_CONNECTING | XPC_C_CONNECTED); + + xpc_arch_ops.send_chctl_closerequest(ch, irq_flags); + + if (channel_was_connected) + ch->flags |= XPC_C_WASCONNECTED; + + spin_unlock_irqrestore(&ch->lock, *irq_flags); + + /* wake all idle kthreads so they can exit */ + if (atomic_read(&ch->kthreads_idle) > 0) { + wake_up_all(&ch->idle_wq); + + } else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) && + !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) { + /* start a kthread that will do the xpDisconnecting callout */ + xpc_create_kthreads(ch, 1, 1); + } + + /* wake those waiting to allocate an entry from the local msg queue */ + if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) + wake_up(&ch->msg_allocate_wq); + + spin_lock_irqsave(&ch->lock, *irq_flags); +} + +void +xpc_disconnect_callout(struct xpc_channel *ch, enum xp_retval reason) +{ + /* + * Let the channel's registerer know that the channel is being + * disconnected. We don't want to do this if the registerer was never + * informed of a connection being made. + */ + + if (ch->func != NULL) { + dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, " + "channel=%d\n", reason, ch->partid, ch->number); + + ch->func(reason, ch->partid, ch->number, NULL, ch->key); + + dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, " + "channel=%d\n", reason, ch->partid, ch->number); + } +} + +/* + * Wait for a message entry to become available for the specified channel, + * but don't wait any longer than 1 jiffy. + */ +enum xp_retval +xpc_allocate_msg_wait(struct xpc_channel *ch) +{ + enum xp_retval ret; + DEFINE_WAIT(wait); + + if (ch->flags & XPC_C_DISCONNECTING) { + DBUG_ON(ch->reason == xpInterrupted); + return ch->reason; + } + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); + ret = schedule_timeout(1); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + + if (ch->flags & XPC_C_DISCONNECTING) { + ret = ch->reason; + DBUG_ON(ch->reason == xpInterrupted); + } else if (ret == 0) { + ret = xpTimeout; + } else { + ret = xpInterrupted; + } + + return ret; +} + +/* + * Send a message that contains the user's payload on the specified channel + * connected to the specified partition. + * + * NOTE that this routine can sleep waiting for a message entry to become + * available. To not sleep, pass in the XPC_NOWAIT flag. + * + * Once sent, this routine will not wait for the message to be received, nor + * will notification be given when it does happen. + * + * Arguments: + * + * partid - ID of partition to which the channel is connected. + * ch_number - channel # to send message on. + * flags - see xp.h for valid flags. + * payload - pointer to the payload which is to be sent. + * payload_size - size of the payload in bytes. + */ +enum xp_retval +xpc_initiate_send(short partid, int ch_number, u32 flags, void *payload, + u16 payload_size) +{ + struct xpc_partition *part = &xpc_partitions[partid]; + enum xp_retval ret = xpUnknownReason; + + dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload, + partid, ch_number); + + DBUG_ON(partid < 0 || partid >= xp_max_npartitions); + DBUG_ON(ch_number < 0 || ch_number >= part->nchannels); + DBUG_ON(payload == NULL); + + if (xpc_part_ref(part)) { + ret = xpc_arch_ops.send_payload(&part->channels[ch_number], + flags, payload, payload_size, 0, NULL, NULL); + xpc_part_deref(part); + } + + return ret; +} + +/* + * Send a message that contains the user's payload on the specified channel + * connected to the specified partition. + * + * NOTE that this routine can sleep waiting for a message entry to become + * available. To not sleep, pass in the XPC_NOWAIT flag. + * + * This routine will not wait for the message to be sent or received. + * + * Once the remote end of the channel has received the message, the function + * passed as an argument to xpc_initiate_send_notify() will be called. This + * allows the sender to free up or re-use any buffers referenced by the + * message, but does NOT mean the message has been processed at the remote + * end by a receiver. + * + * If this routine returns an error, the caller's function will NOT be called. + * + * Arguments: + * + * partid - ID of partition to which the channel is connected. + * ch_number - channel # to send message on. + * flags - see xp.h for valid flags. + * payload - pointer to the payload which is to be sent. + * payload_size - size of the payload in bytes. + * func - function to call with asynchronous notification of message + * receipt. THIS FUNCTION MUST BE NON-BLOCKING. + * key - user-defined key to be passed to the function when it's called. + */ +enum xp_retval +xpc_initiate_send_notify(short partid, int ch_number, u32 flags, void *payload, + u16 payload_size, xpc_notify_func func, void *key) +{ + struct xpc_partition *part = &xpc_partitions[partid]; + enum xp_retval ret = xpUnknownReason; + + dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload, + partid, ch_number); + + DBUG_ON(partid < 0 || partid >= xp_max_npartitions); + DBUG_ON(ch_number < 0 || ch_number >= part->nchannels); + DBUG_ON(payload == NULL); + DBUG_ON(func == NULL); + + if (xpc_part_ref(part)) { + ret = xpc_arch_ops.send_payload(&part->channels[ch_number], + flags, payload, payload_size, XPC_N_CALL, func, key); + xpc_part_deref(part); + } + return ret; +} + +/* + * Deliver a message's payload to its intended recipient. + */ +void +xpc_deliver_payload(struct xpc_channel *ch) +{ + void *payload; + + payload = xpc_arch_ops.get_deliverable_payload(ch); + if (payload != NULL) { + + /* + * This ref is taken to protect the payload itself from being + * freed before the user is finished with it, which the user + * indicates by calling xpc_initiate_received(). + */ + xpc_msgqueue_ref(ch); + + atomic_inc(&ch->kthreads_active); + + if (ch->func != NULL) { + dev_dbg(xpc_chan, "ch->func() called, payload=0x%p " + "partid=%d channel=%d\n", payload, ch->partid, + ch->number); + + /* deliver the message to its intended recipient */ + ch->func(xpMsgReceived, ch->partid, ch->number, payload, + ch->key); + + dev_dbg(xpc_chan, "ch->func() returned, payload=0x%p " + "partid=%d channel=%d\n", payload, ch->partid, + ch->number); + } + + atomic_dec(&ch->kthreads_active); + } +} + +/* + * Acknowledge receipt of a delivered message's payload. + * + * This function, although called by users, does not call xpc_part_ref() to + * ensure that the partition infrastructure is in place. It relies on the + * fact that we called xpc_msgqueue_ref() in xpc_deliver_payload(). + * + * Arguments: + * + * partid - ID of partition to which the channel is connected. + * ch_number - channel # message received on. + * payload - pointer to the payload area allocated via + * xpc_initiate_send() or xpc_initiate_send_notify(). + */ +void +xpc_initiate_received(short partid, int ch_number, void *payload) +{ + struct xpc_partition *part = &xpc_partitions[partid]; + struct xpc_channel *ch; + + DBUG_ON(partid < 0 || partid >= xp_max_npartitions); + DBUG_ON(ch_number < 0 || ch_number >= part->nchannels); + + ch = &part->channels[ch_number]; + xpc_arch_ops.received_payload(ch, payload); + + /* the call to xpc_msgqueue_ref() was done by xpc_deliver_payload() */ + xpc_msgqueue_deref(ch); +} diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c new file mode 100644 index 000000000..83fc748a9 --- /dev/null +++ b/drivers/misc/sgi-xp/xpc_main.c @@ -0,0 +1,1373 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition Communication (XPC) support - standard version. + * + * XPC provides a message passing capability that crosses partition + * boundaries. This module is made up of two parts: + * + * partition This part detects the presence/absence of other + * partitions. It provides a heartbeat and monitors + * the heartbeats of other partitions. + * + * channel This part manages the channels and sends/receives + * messages across them to/from other partitions. + * + * There are a couple of additional functions residing in XP, which + * provide an interface to XPC for its users. + * + * + * Caveats: + * + * . Currently on sn2, we have no way to determine which nasid an IRQ + * came from. Thus, xpc_send_IRQ_sn2() does a remote amo write + * followed by an IPI. The amo indicates where data is to be pulled + * from, so after the IPI arrives, the remote partition checks the amo + * word. The IPI can actually arrive before the amo however, so other + * code must periodically check for this case. Also, remote amo + * operations do not reliably time out. Thus we do a remote PIO read + * solely to know whether the remote partition is down and whether we + * should stop sending IPIs to it. This remote PIO read operation is + * set up in a special nofault region so SAL knows to ignore (and + * cleanup) any errors due to the remote amo write, PIO read, and/or + * PIO write operations. + * + * If/when new hardware solves this IPI problem, we should abandon + * the current approach. + * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/kdebug.h> +#include <linux/kthread.h> +#include "xpc.h" + +#ifdef CONFIG_X86_64 +#include <asm/traps.h> +#endif + +/* define two XPC debug device structures to be used with dev_dbg() et al */ + +struct device_driver xpc_dbg_name = { + .name = "xpc" +}; + +struct device xpc_part_dbg_subname = { + .init_name = "", /* set to "part" at xpc_init() time */ + .driver = &xpc_dbg_name +}; + +struct device xpc_chan_dbg_subname = { + .init_name = "", /* set to "chan" at xpc_init() time */ + .driver = &xpc_dbg_name +}; + +struct device *xpc_part = &xpc_part_dbg_subname; +struct device *xpc_chan = &xpc_chan_dbg_subname; + +static int xpc_kdebug_ignore; + +/* systune related variables for /proc/sys directories */ + +static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL; +static int xpc_hb_min_interval = 1; +static int xpc_hb_max_interval = 10; + +static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL; +static int xpc_hb_check_min_interval = 10; +static int xpc_hb_check_max_interval = 120; + +int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT; +static int xpc_disengage_min_timelimit; /* = 0 */ +static int xpc_disengage_max_timelimit = 120; + +static struct ctl_table xpc_sys_xpc_hb_dir[] = { + { + .procname = "hb_interval", + .data = &xpc_hb_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xpc_hb_min_interval, + .extra2 = &xpc_hb_max_interval}, + { + .procname = "hb_check_interval", + .data = &xpc_hb_check_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xpc_hb_check_min_interval, + .extra2 = &xpc_hb_check_max_interval}, + {} +}; +static struct ctl_table xpc_sys_xpc_dir[] = { + { + .procname = "hb", + .mode = 0555, + .child = xpc_sys_xpc_hb_dir}, + { + .procname = "disengage_timelimit", + .data = &xpc_disengage_timelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xpc_disengage_min_timelimit, + .extra2 = &xpc_disengage_max_timelimit}, + {} +}; +static struct ctl_table xpc_sys_dir[] = { + { + .procname = "xpc", + .mode = 0555, + .child = xpc_sys_xpc_dir}, + {} +}; +static struct ctl_table_header *xpc_sysctl; + +/* non-zero if any remote partition disengage was timed out */ +int xpc_disengage_timedout; + +/* #of activate IRQs received and not yet processed */ +int xpc_activate_IRQ_rcvd; +DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock); + +/* IRQ handler notifies this wait queue on receipt of an IRQ */ +DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq); + +static unsigned long xpc_hb_check_timeout; +static struct timer_list xpc_hb_timer; + +/* notification that the xpc_hb_checker thread has exited */ +static DECLARE_COMPLETION(xpc_hb_checker_exited); + +/* notification that the xpc_discovery thread has exited */ +static DECLARE_COMPLETION(xpc_discovery_exited); + +static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *); + +static int xpc_system_reboot(struct notifier_block *, unsigned long, void *); +static struct notifier_block xpc_reboot_notifier = { + .notifier_call = xpc_system_reboot, +}; + +static int xpc_system_die(struct notifier_block *, unsigned long, void *); +static struct notifier_block xpc_die_notifier = { + .notifier_call = xpc_system_die, +}; + +struct xpc_arch_operations xpc_arch_ops; + +/* + * Timer function to enforce the timelimit on the partition disengage. + */ +static void +xpc_timeout_partition_disengage(struct timer_list *t) +{ + struct xpc_partition *part = from_timer(part, t, disengage_timer); + + DBUG_ON(time_is_after_jiffies(part->disengage_timeout)); + + (void)xpc_partition_disengaged(part); + + DBUG_ON(part->disengage_timeout != 0); + DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part))); +} + +/* + * Timer to produce the heartbeat. The timer structures function is + * already set when this is initially called. A tunable is used to + * specify when the next timeout should occur. + */ +static void +xpc_hb_beater(struct timer_list *unused) +{ + xpc_arch_ops.increment_heartbeat(); + + if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) + wake_up_interruptible(&xpc_activate_IRQ_wq); + + xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ); + add_timer(&xpc_hb_timer); +} + +static void +xpc_start_hb_beater(void) +{ + xpc_arch_ops.heartbeat_init(); + timer_setup(&xpc_hb_timer, xpc_hb_beater, 0); + xpc_hb_beater(0); +} + +static void +xpc_stop_hb_beater(void) +{ + del_timer_sync(&xpc_hb_timer); + xpc_arch_ops.heartbeat_exit(); +} + +/* + * At periodic intervals, scan through all active partitions and ensure + * their heartbeat is still active. If not, the partition is deactivated. + */ +static void +xpc_check_remote_hb(void) +{ + struct xpc_partition *part; + short partid; + enum xp_retval ret; + + for (partid = 0; partid < xp_max_npartitions; partid++) { + + if (xpc_exiting) + break; + + if (partid == xp_partition_id) + continue; + + part = &xpc_partitions[partid]; + + if (part->act_state == XPC_P_AS_INACTIVE || + part->act_state == XPC_P_AS_DEACTIVATING) { + continue; + } + + ret = xpc_arch_ops.get_remote_heartbeat(part); + if (ret != xpSuccess) + XPC_DEACTIVATE_PARTITION(part, ret); + } +} + +/* + * This thread is responsible for nearly all of the partition + * activation/deactivation. + */ +static int +xpc_hb_checker(void *ignore) +{ + int force_IRQ = 0; + + /* this thread was marked active by xpc_hb_init() */ + + set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU)); + + /* set our heartbeating to other partitions into motion */ + xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); + xpc_start_hb_beater(); + + while (!xpc_exiting) { + + dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have " + "been received\n", + (int)(xpc_hb_check_timeout - jiffies), + xpc_activate_IRQ_rcvd); + + /* checking of remote heartbeats is skewed by IRQ handling */ + if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) { + xpc_hb_check_timeout = jiffies + + (xpc_hb_check_interval * HZ); + + dev_dbg(xpc_part, "checking remote heartbeats\n"); + xpc_check_remote_hb(); + + /* + * On sn2 we need to periodically recheck to ensure no + * IRQ/amo pairs have been missed. + */ + if (is_shub()) + force_IRQ = 1; + } + + /* check for outstanding IRQs */ + if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) { + force_IRQ = 0; + dev_dbg(xpc_part, "processing activate IRQs " + "received\n"); + xpc_arch_ops.process_activate_IRQ_rcvd(); + } + + /* wait for IRQ or timeout */ + (void)wait_event_interruptible(xpc_activate_IRQ_wq, + (time_is_before_eq_jiffies( + xpc_hb_check_timeout) || + xpc_activate_IRQ_rcvd > 0 || + xpc_exiting)); + } + + xpc_stop_hb_beater(); + + dev_dbg(xpc_part, "heartbeat checker is exiting\n"); + + /* mark this thread as having exited */ + complete(&xpc_hb_checker_exited); + return 0; +} + +/* + * This thread will attempt to discover other partitions to activate + * based on info provided by SAL. This new thread is short lived and + * will exit once discovery is complete. + */ +static int +xpc_initiate_discovery(void *ignore) +{ + xpc_discovery(); + + dev_dbg(xpc_part, "discovery thread is exiting\n"); + + /* mark this thread as having exited */ + complete(&xpc_discovery_exited); + return 0; +} + +/* + * The first kthread assigned to a newly activated partition is the one + * created by XPC HB with which it calls xpc_activating(). XPC hangs on to + * that kthread until the partition is brought down, at which time that kthread + * returns back to XPC HB. (The return of that kthread will signify to XPC HB + * that XPC has dismantled all communication infrastructure for the associated + * partition.) This kthread becomes the channel manager for that partition. + * + * Each active partition has a channel manager, who, besides connecting and + * disconnecting channels, will ensure that each of the partition's connected + * channels has the required number of assigned kthreads to get the work done. + */ +static void +xpc_channel_mgr(struct xpc_partition *part) +{ + while (part->act_state != XPC_P_AS_DEACTIVATING || + atomic_read(&part->nchannels_active) > 0 || + !xpc_partition_disengaged(part)) { + + xpc_process_sent_chctl_flags(part); + + /* + * Wait until we've been requested to activate kthreads or + * all of the channel's message queues have been torn down or + * a signal is pending. + * + * The channel_mgr_requests is set to 1 after being awakened, + * This is done to prevent the channel mgr from making one pass + * through the loop for each request, since he will + * be servicing all the requests in one pass. The reason it's + * set to 1 instead of 0 is so that other kthreads will know + * that the channel mgr is running and won't bother trying to + * wake him up. + */ + atomic_dec(&part->channel_mgr_requests); + (void)wait_event_interruptible(part->channel_mgr_wq, + (atomic_read(&part->channel_mgr_requests) > 0 || + part->chctl.all_flags != 0 || + (part->act_state == XPC_P_AS_DEACTIVATING && + atomic_read(&part->nchannels_active) == 0 && + xpc_partition_disengaged(part)))); + atomic_set(&part->channel_mgr_requests, 1); + } +} + +/* + * Guarantee that the kzalloc'd memory is cacheline aligned. + */ +void * +xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kzalloc will give us cachline aligned memory by default */ + *base = kzalloc(size, flags); + if (*base == NULL) + return NULL; + + if ((u64)*base == L1_CACHE_ALIGN((u64)*base)) + return *base; + + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kzalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) + return NULL; + + return (void *)L1_CACHE_ALIGN((u64)*base); +} + +/* + * Setup the channel structures necessary to support XPartition Communication + * between the specified remote partition and the local one. + */ +static enum xp_retval +xpc_setup_ch_structures(struct xpc_partition *part) +{ + enum xp_retval ret; + int ch_number; + struct xpc_channel *ch; + short partid = XPC_PARTID(part); + + /* + * Allocate all of the channel structures as a contiguous chunk of + * memory. + */ + DBUG_ON(part->channels != NULL); + part->channels = kcalloc(XPC_MAX_NCHANNELS, + sizeof(struct xpc_channel), + GFP_KERNEL); + if (part->channels == NULL) { + dev_err(xpc_chan, "can't get memory for channels\n"); + return xpNoMemory; + } + + /* allocate the remote open and close args */ + + part->remote_openclose_args = + xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, + GFP_KERNEL, &part-> + remote_openclose_args_base); + if (part->remote_openclose_args == NULL) { + dev_err(xpc_chan, "can't get memory for remote connect args\n"); + ret = xpNoMemory; + goto out_1; + } + + part->chctl.all_flags = 0; + spin_lock_init(&part->chctl_lock); + + atomic_set(&part->channel_mgr_requests, 1); + init_waitqueue_head(&part->channel_mgr_wq); + + part->nchannels = XPC_MAX_NCHANNELS; + + atomic_set(&part->nchannels_active, 0); + atomic_set(&part->nchannels_engaged, 0); + + for (ch_number = 0; ch_number < part->nchannels; ch_number++) { + ch = &part->channels[ch_number]; + + ch->partid = partid; + ch->number = ch_number; + ch->flags = XPC_C_DISCONNECTED; + + atomic_set(&ch->kthreads_assigned, 0); + atomic_set(&ch->kthreads_idle, 0); + atomic_set(&ch->kthreads_active, 0); + + atomic_set(&ch->references, 0); + atomic_set(&ch->n_to_notify, 0); + + spin_lock_init(&ch->lock); + init_completion(&ch->wdisconnect_wait); + + atomic_set(&ch->n_on_msg_allocate_wq, 0); + init_waitqueue_head(&ch->msg_allocate_wq); + init_waitqueue_head(&ch->idle_wq); + } + + ret = xpc_arch_ops.setup_ch_structures(part); + if (ret != xpSuccess) + goto out_2; + + /* + * With the setting of the partition setup_state to XPC_P_SS_SETUP, + * we're declaring that this partition is ready to go. + */ + part->setup_state = XPC_P_SS_SETUP; + + return xpSuccess; + + /* setup of ch structures failed */ +out_2: + kfree(part->remote_openclose_args_base); + part->remote_openclose_args = NULL; +out_1: + kfree(part->channels); + part->channels = NULL; + return ret; +} + +/* + * Teardown the channel structures necessary to support XPartition Communication + * between the specified remote partition and the local one. + */ +static void +xpc_teardown_ch_structures(struct xpc_partition *part) +{ + DBUG_ON(atomic_read(&part->nchannels_engaged) != 0); + DBUG_ON(atomic_read(&part->nchannels_active) != 0); + + /* + * Make this partition inaccessible to local processes by marking it + * as no longer setup. Then wait before proceeding with the teardown + * until all existing references cease. + */ + DBUG_ON(part->setup_state != XPC_P_SS_SETUP); + part->setup_state = XPC_P_SS_WTEARDOWN; + + wait_event(part->teardown_wq, (atomic_read(&part->references) == 0)); + + /* now we can begin tearing down the infrastructure */ + + xpc_arch_ops.teardown_ch_structures(part); + + kfree(part->remote_openclose_args_base); + part->remote_openclose_args = NULL; + kfree(part->channels); + part->channels = NULL; + + part->setup_state = XPC_P_SS_TORNDOWN; +} + +/* + * When XPC HB determines that a partition has come up, it will create a new + * kthread and that kthread will call this function to attempt to set up the + * basic infrastructure used for Cross Partition Communication with the newly + * upped partition. + * + * The kthread that was created by XPC HB and which setup the XPC + * infrastructure will remain assigned to the partition becoming the channel + * manager for that partition until the partition is deactivating, at which + * time the kthread will teardown the XPC infrastructure and then exit. + */ +static int +xpc_activating(void *__partid) +{ + short partid = (u64)__partid; + struct xpc_partition *part = &xpc_partitions[partid]; + unsigned long irq_flags; + + DBUG_ON(partid < 0 || partid >= xp_max_npartitions); + + spin_lock_irqsave(&part->act_lock, irq_flags); + + if (part->act_state == XPC_P_AS_DEACTIVATING) { + part->act_state = XPC_P_AS_INACTIVE; + spin_unlock_irqrestore(&part->act_lock, irq_flags); + part->remote_rp_pa = 0; + return 0; + } + + /* indicate the thread is activating */ + DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ); + part->act_state = XPC_P_AS_ACTIVATING; + + XPC_SET_REASON(part, 0, 0); + spin_unlock_irqrestore(&part->act_lock, irq_flags); + + dev_dbg(xpc_part, "activating partition %d\n", partid); + + xpc_arch_ops.allow_hb(partid); + + if (xpc_setup_ch_structures(part) == xpSuccess) { + (void)xpc_part_ref(part); /* this will always succeed */ + + if (xpc_arch_ops.make_first_contact(part) == xpSuccess) { + xpc_mark_partition_active(part); + xpc_channel_mgr(part); + /* won't return until partition is deactivating */ + } + + xpc_part_deref(part); + xpc_teardown_ch_structures(part); + } + + xpc_arch_ops.disallow_hb(partid); + xpc_mark_partition_inactive(part); + + if (part->reason == xpReactivating) { + /* interrupting ourselves results in activating partition */ + xpc_arch_ops.request_partition_reactivation(part); + } + + return 0; +} + +void +xpc_activate_partition(struct xpc_partition *part) +{ + short partid = XPC_PARTID(part); + unsigned long irq_flags; + struct task_struct *kthread; + + spin_lock_irqsave(&part->act_lock, irq_flags); + + DBUG_ON(part->act_state != XPC_P_AS_INACTIVE); + + part->act_state = XPC_P_AS_ACTIVATION_REQ; + XPC_SET_REASON(part, xpCloneKThread, __LINE__); + + spin_unlock_irqrestore(&part->act_lock, irq_flags); + + kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d", + partid); + if (IS_ERR(kthread)) { + spin_lock_irqsave(&part->act_lock, irq_flags); + part->act_state = XPC_P_AS_INACTIVE; + XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__); + spin_unlock_irqrestore(&part->act_lock, irq_flags); + } +} + +void +xpc_activate_kthreads(struct xpc_channel *ch, int needed) +{ + int idle = atomic_read(&ch->kthreads_idle); + int assigned = atomic_read(&ch->kthreads_assigned); + int wakeup; + + DBUG_ON(needed <= 0); + + if (idle > 0) { + wakeup = (needed > idle) ? idle : needed; + needed -= wakeup; + + dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, " + "channel=%d\n", wakeup, ch->partid, ch->number); + + /* only wakeup the requested number of kthreads */ + wake_up_nr(&ch->idle_wq, wakeup); + } + + if (needed <= 0) + return; + + if (needed + assigned > ch->kthreads_assigned_limit) { + needed = ch->kthreads_assigned_limit - assigned; + if (needed <= 0) + return; + } + + dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n", + needed, ch->partid, ch->number); + + xpc_create_kthreads(ch, needed, 0); +} + +/* + * This function is where XPC's kthreads wait for messages to deliver. + */ +static void +xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch) +{ + int (*n_of_deliverable_payloads) (struct xpc_channel *) = + xpc_arch_ops.n_of_deliverable_payloads; + + do { + /* deliver messages to their intended recipients */ + + while (n_of_deliverable_payloads(ch) > 0 && + !(ch->flags & XPC_C_DISCONNECTING)) { + xpc_deliver_payload(ch); + } + + if (atomic_inc_return(&ch->kthreads_idle) > + ch->kthreads_idle_limit) { + /* too many idle kthreads on this channel */ + atomic_dec(&ch->kthreads_idle); + break; + } + + dev_dbg(xpc_chan, "idle kthread calling " + "wait_event_interruptible_exclusive()\n"); + + (void)wait_event_interruptible_exclusive(ch->idle_wq, + (n_of_deliverable_payloads(ch) > 0 || + (ch->flags & XPC_C_DISCONNECTING))); + + atomic_dec(&ch->kthreads_idle); + + } while (!(ch->flags & XPC_C_DISCONNECTING)); +} + +static int +xpc_kthread_start(void *args) +{ + short partid = XPC_UNPACK_ARG1(args); + u16 ch_number = XPC_UNPACK_ARG2(args); + struct xpc_partition *part = &xpc_partitions[partid]; + struct xpc_channel *ch; + int n_needed; + unsigned long irq_flags; + int (*n_of_deliverable_payloads) (struct xpc_channel *) = + xpc_arch_ops.n_of_deliverable_payloads; + + dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n", + partid, ch_number); + + ch = &part->channels[ch_number]; + + if (!(ch->flags & XPC_C_DISCONNECTING)) { + + /* let registerer know that connection has been established */ + + spin_lock_irqsave(&ch->lock, irq_flags); + if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) { + ch->flags |= XPC_C_CONNECTEDCALLOUT; + spin_unlock_irqrestore(&ch->lock, irq_flags); + + xpc_connected_callout(ch); + + spin_lock_irqsave(&ch->lock, irq_flags); + ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE; + spin_unlock_irqrestore(&ch->lock, irq_flags); + + /* + * It is possible that while the callout was being + * made that the remote partition sent some messages. + * If that is the case, we may need to activate + * additional kthreads to help deliver them. We only + * need one less than total #of messages to deliver. + */ + n_needed = n_of_deliverable_payloads(ch) - 1; + if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING)) + xpc_activate_kthreads(ch, n_needed); + + } else { + spin_unlock_irqrestore(&ch->lock, irq_flags); + } + + xpc_kthread_waitmsgs(part, ch); + } + + /* let registerer know that connection is disconnecting */ + + spin_lock_irqsave(&ch->lock, irq_flags); + if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) && + !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) { + ch->flags |= XPC_C_DISCONNECTINGCALLOUT; + spin_unlock_irqrestore(&ch->lock, irq_flags); + + xpc_disconnect_callout(ch, xpDisconnecting); + + spin_lock_irqsave(&ch->lock, irq_flags); + ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE; + } + spin_unlock_irqrestore(&ch->lock, irq_flags); + + if (atomic_dec_return(&ch->kthreads_assigned) == 0 && + atomic_dec_return(&part->nchannels_engaged) == 0) { + xpc_arch_ops.indicate_partition_disengaged(part); + } + + xpc_msgqueue_deref(ch); + + dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n", + partid, ch_number); + + xpc_part_deref(part); + return 0; +} + +/* + * For each partition that XPC has established communications with, there is + * a minimum of one kernel thread assigned to perform any operation that + * may potentially sleep or block (basically the callouts to the asynchronous + * functions registered via xpc_connect()). + * + * Additional kthreads are created and destroyed by XPC as the workload + * demands. + * + * A kthread is assigned to one of the active channels that exists for a given + * partition. + */ +void +xpc_create_kthreads(struct xpc_channel *ch, int needed, + int ignore_disconnecting) +{ + unsigned long irq_flags; + u64 args = XPC_PACK_ARGS(ch->partid, ch->number); + struct xpc_partition *part = &xpc_partitions[ch->partid]; + struct task_struct *kthread; + void (*indicate_partition_disengaged) (struct xpc_partition *) = + xpc_arch_ops.indicate_partition_disengaged; + + while (needed-- > 0) { + + /* + * The following is done on behalf of the newly created + * kthread. That kthread is responsible for doing the + * counterpart to the following before it exits. + */ + if (ignore_disconnecting) { + if (!atomic_inc_not_zero(&ch->kthreads_assigned)) { + /* kthreads assigned had gone to zero */ + BUG_ON(!(ch->flags & + XPC_C_DISCONNECTINGCALLOUT_MADE)); + break; + } + + } else if (ch->flags & XPC_C_DISCONNECTING) { + break; + + } else if (atomic_inc_return(&ch->kthreads_assigned) == 1 && + atomic_inc_return(&part->nchannels_engaged) == 1) { + xpc_arch_ops.indicate_partition_engaged(part); + } + (void)xpc_part_ref(part); + xpc_msgqueue_ref(ch); + + kthread = kthread_run(xpc_kthread_start, (void *)args, + "xpc%02dc%d", ch->partid, ch->number); + if (IS_ERR(kthread)) { + /* the fork failed */ + + /* + * NOTE: if (ignore_disconnecting && + * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true, + * then we'll deadlock if all other kthreads assigned + * to this channel are blocked in the channel's + * registerer, because the only thing that will unblock + * them is the xpDisconnecting callout that this + * failed kthread_run() would have made. + */ + + if (atomic_dec_return(&ch->kthreads_assigned) == 0 && + atomic_dec_return(&part->nchannels_engaged) == 0) { + indicate_partition_disengaged(part); + } + xpc_msgqueue_deref(ch); + xpc_part_deref(part); + + if (atomic_read(&ch->kthreads_assigned) < + ch->kthreads_idle_limit) { + /* + * Flag this as an error only if we have an + * insufficient #of kthreads for the channel + * to function. + */ + spin_lock_irqsave(&ch->lock, irq_flags); + XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources, + &irq_flags); + spin_unlock_irqrestore(&ch->lock, irq_flags); + } + break; + } + } +} + +void +xpc_disconnect_wait(int ch_number) +{ + unsigned long irq_flags; + short partid; + struct xpc_partition *part; + struct xpc_channel *ch; + int wakeup_channel_mgr; + + /* now wait for all callouts to the caller's function to cease */ + for (partid = 0; partid < xp_max_npartitions; partid++) { + part = &xpc_partitions[partid]; + + if (!xpc_part_ref(part)) + continue; + + ch = &part->channels[ch_number]; + + if (!(ch->flags & XPC_C_WDISCONNECT)) { + xpc_part_deref(part); + continue; + } + + wait_for_completion(&ch->wdisconnect_wait); + + spin_lock_irqsave(&ch->lock, irq_flags); + DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED)); + wakeup_channel_mgr = 0; + + if (ch->delayed_chctl_flags) { + if (part->act_state != XPC_P_AS_DEACTIVATING) { + spin_lock(&part->chctl_lock); + part->chctl.flags[ch->number] |= + ch->delayed_chctl_flags; + spin_unlock(&part->chctl_lock); + wakeup_channel_mgr = 1; + } + ch->delayed_chctl_flags = 0; + } + + ch->flags &= ~XPC_C_WDISCONNECT; + spin_unlock_irqrestore(&ch->lock, irq_flags); + + if (wakeup_channel_mgr) + xpc_wakeup_channel_mgr(part); + + xpc_part_deref(part); + } +} + +static int +xpc_setup_partitions(void) +{ + short partid; + struct xpc_partition *part; + + xpc_partitions = kcalloc(xp_max_npartitions, + sizeof(struct xpc_partition), + GFP_KERNEL); + if (xpc_partitions == NULL) { + dev_err(xpc_part, "can't get memory for partition structure\n"); + return -ENOMEM; + } + + /* + * The first few fields of each entry of xpc_partitions[] need to + * be initialized now so that calls to xpc_connect() and + * xpc_disconnect() can be made prior to the activation of any remote + * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE + * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING + * PARTITION HAS BEEN ACTIVATED. + */ + for (partid = 0; partid < xp_max_npartitions; partid++) { + part = &xpc_partitions[partid]; + + DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part)); + + part->activate_IRQ_rcvd = 0; + spin_lock_init(&part->act_lock); + part->act_state = XPC_P_AS_INACTIVE; + XPC_SET_REASON(part, 0, 0); + + timer_setup(&part->disengage_timer, + xpc_timeout_partition_disengage, 0); + + part->setup_state = XPC_P_SS_UNSET; + init_waitqueue_head(&part->teardown_wq); + atomic_set(&part->references, 0); + } + + return xpc_arch_ops.setup_partitions(); +} + +static void +xpc_teardown_partitions(void) +{ + xpc_arch_ops.teardown_partitions(); + kfree(xpc_partitions); +} + +static void +xpc_do_exit(enum xp_retval reason) +{ + short partid; + int active_part_count, printed_waiting_msg = 0; + struct xpc_partition *part; + unsigned long printmsg_time, disengage_timeout = 0; + + /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */ + DBUG_ON(xpc_exiting == 1); + + /* + * Let the heartbeat checker thread and the discovery thread + * (if one is running) know that they should exit. Also wake up + * the heartbeat checker thread in case it's sleeping. + */ + xpc_exiting = 1; + wake_up_interruptible(&xpc_activate_IRQ_wq); + + /* wait for the discovery thread to exit */ + wait_for_completion(&xpc_discovery_exited); + + /* wait for the heartbeat checker thread to exit */ + wait_for_completion(&xpc_hb_checker_exited); + + /* sleep for a 1/3 of a second or so */ + (void)msleep_interruptible(300); + + /* wait for all partitions to become inactive */ + + printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ); + xpc_disengage_timedout = 0; + + do { + active_part_count = 0; + + for (partid = 0; partid < xp_max_npartitions; partid++) { + part = &xpc_partitions[partid]; + + if (xpc_partition_disengaged(part) && + part->act_state == XPC_P_AS_INACTIVE) { + continue; + } + + active_part_count++; + + XPC_DEACTIVATE_PARTITION(part, reason); + + if (part->disengage_timeout > disengage_timeout) + disengage_timeout = part->disengage_timeout; + } + + if (xpc_arch_ops.any_partition_engaged()) { + if (time_is_before_jiffies(printmsg_time)) { + dev_info(xpc_part, "waiting for remote " + "partitions to deactivate, timeout in " + "%ld seconds\n", (disengage_timeout - + jiffies) / HZ); + printmsg_time = jiffies + + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ); + printed_waiting_msg = 1; + } + + } else if (active_part_count > 0) { + if (printed_waiting_msg) { + dev_info(xpc_part, "waiting for local partition" + " to deactivate\n"); + printed_waiting_msg = 0; + } + + } else { + if (!xpc_disengage_timedout) { + dev_info(xpc_part, "all partitions have " + "deactivated\n"); + } + break; + } + + /* sleep for a 1/3 of a second or so */ + (void)msleep_interruptible(300); + + } while (1); + + DBUG_ON(xpc_arch_ops.any_partition_engaged()); + + xpc_teardown_rsvd_page(); + + if (reason == xpUnloading) { + (void)unregister_die_notifier(&xpc_die_notifier); + (void)unregister_reboot_notifier(&xpc_reboot_notifier); + } + + /* clear the interface to XPC's functions */ + xpc_clear_interface(); + + if (xpc_sysctl) + unregister_sysctl_table(xpc_sysctl); + + xpc_teardown_partitions(); + + if (is_shub()) + xpc_exit_sn2(); + else if (is_uv()) + xpc_exit_uv(); +} + +/* + * This function is called when the system is being rebooted. + */ +static int +xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused) +{ + enum xp_retval reason; + + switch (event) { + case SYS_RESTART: + reason = xpSystemReboot; + break; + case SYS_HALT: + reason = xpSystemHalt; + break; + case SYS_POWER_OFF: + reason = xpSystemPoweroff; + break; + default: + reason = xpSystemGoingDown; + } + + xpc_do_exit(reason); + return NOTIFY_DONE; +} + +/* Used to only allow one cpu to complete disconnect */ +static unsigned int xpc_die_disconnecting; + +/* + * Notify other partitions to deactivate from us by first disengaging from all + * references to our memory. + */ +static void +xpc_die_deactivate(void) +{ + struct xpc_partition *part; + short partid; + int any_engaged; + long keep_waiting; + long wait_to_print; + + if (cmpxchg(&xpc_die_disconnecting, 0, 1)) + return; + + /* keep xpc_hb_checker thread from doing anything (just in case) */ + xpc_exiting = 1; + + xpc_arch_ops.disallow_all_hbs(); /*indicate we're deactivated */ + + for (partid = 0; partid < xp_max_npartitions; partid++) { + part = &xpc_partitions[partid]; + + if (xpc_arch_ops.partition_engaged(partid) || + part->act_state != XPC_P_AS_INACTIVE) { + xpc_arch_ops.request_partition_deactivation(part); + xpc_arch_ops.indicate_partition_disengaged(part); + } + } + + /* + * Though we requested that all other partitions deactivate from us, + * we only wait until they've all disengaged or we've reached the + * defined timelimit. + * + * Given that one iteration through the following while-loop takes + * approximately 200 microseconds, calculate the #of loops to take + * before bailing and the #of loops before printing a waiting message. + */ + keep_waiting = xpc_disengage_timelimit * 1000 * 5; + wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5; + + while (1) { + any_engaged = xpc_arch_ops.any_partition_engaged(); + if (!any_engaged) { + dev_info(xpc_part, "all partitions have deactivated\n"); + break; + } + + if (!keep_waiting--) { + for (partid = 0; partid < xp_max_npartitions; + partid++) { + if (xpc_arch_ops.partition_engaged(partid)) { + dev_info(xpc_part, "deactivate from " + "remote partition %d timed " + "out\n", partid); + } + } + break; + } + + if (!wait_to_print--) { + dev_info(xpc_part, "waiting for remote partitions to " + "deactivate, timeout in %ld seconds\n", + keep_waiting / (1000 * 5)); + wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * + 1000 * 5; + } + + udelay(200); + } +} + +/* + * This function is called when the system is being restarted or halted due + * to some sort of system failure. If this is the case we need to notify the + * other partitions to disengage from all references to our memory. + * This function can also be called when our heartbeater could be offlined + * for a time. In this case we need to notify other partitions to not worry + * about the lack of a heartbeat. + */ +static int +xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args) +{ +#ifdef CONFIG_IA64 /* !!! temporary kludge */ + switch (event) { + case DIE_MACHINE_RESTART: + case DIE_MACHINE_HALT: + xpc_die_deactivate(); + break; + + case DIE_KDEBUG_ENTER: + /* Should lack of heartbeat be ignored by other partitions? */ + if (!xpc_kdebug_ignore) + break; + + /* fall through */ + case DIE_MCA_MONARCH_ENTER: + case DIE_INIT_MONARCH_ENTER: + xpc_arch_ops.offline_heartbeat(); + break; + + case DIE_KDEBUG_LEAVE: + /* Is lack of heartbeat being ignored by other partitions? */ + if (!xpc_kdebug_ignore) + break; + + /* fall through */ + case DIE_MCA_MONARCH_LEAVE: + case DIE_INIT_MONARCH_LEAVE: + xpc_arch_ops.online_heartbeat(); + break; + } +#else + struct die_args *die_args = _die_args; + + switch (event) { + case DIE_TRAP: + if (die_args->trapnr == X86_TRAP_DF) + xpc_die_deactivate(); + + if (((die_args->trapnr == X86_TRAP_MF) || + (die_args->trapnr == X86_TRAP_XF)) && + !user_mode(die_args->regs)) + xpc_die_deactivate(); + + break; + case DIE_INT3: + case DIE_DEBUG: + break; + case DIE_OOPS: + case DIE_GPF: + default: + xpc_die_deactivate(); + } +#endif + + return NOTIFY_DONE; +} + +int __init +xpc_init(void) +{ + int ret; + struct task_struct *kthread; + + dev_set_name(xpc_part, "part"); + dev_set_name(xpc_chan, "chan"); + + if (is_shub()) { + /* + * The ia64-sn2 architecture supports at most 64 partitions. + * And the inability to unregister remote amos restricts us + * further to only support exactly 64 partitions on this + * architecture, no less. + */ + if (xp_max_npartitions != 64) { + dev_err(xpc_part, "max #of partitions not set to 64\n"); + ret = -EINVAL; + } else { + ret = xpc_init_sn2(); + } + + } else if (is_uv()) { + ret = xpc_init_uv(); + + } else { + ret = -ENODEV; + } + + if (ret != 0) + return ret; + + ret = xpc_setup_partitions(); + if (ret != 0) { + dev_err(xpc_part, "can't get memory for partition structure\n"); + goto out_1; + } + + xpc_sysctl = register_sysctl_table(xpc_sys_dir); + + /* + * Fill the partition reserved page with the information needed by + * other partitions to discover we are alive and establish initial + * communications. + */ + ret = xpc_setup_rsvd_page(); + if (ret != 0) { + dev_err(xpc_part, "can't setup our reserved page\n"); + goto out_2; + } + + /* add ourselves to the reboot_notifier_list */ + ret = register_reboot_notifier(&xpc_reboot_notifier); + if (ret != 0) + dev_warn(xpc_part, "can't register reboot notifier\n"); + + /* add ourselves to the die_notifier list */ + ret = register_die_notifier(&xpc_die_notifier); + if (ret != 0) + dev_warn(xpc_part, "can't register die notifier\n"); + + /* + * The real work-horse behind xpc. This processes incoming + * interrupts and monitors remote heartbeats. + */ + kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME); + if (IS_ERR(kthread)) { + dev_err(xpc_part, "failed while forking hb check thread\n"); + ret = -EBUSY; + goto out_3; + } + + /* + * Startup a thread that will attempt to discover other partitions to + * activate based on info provided by SAL. This new thread is short + * lived and will exit once discovery is complete. + */ + kthread = kthread_run(xpc_initiate_discovery, NULL, + XPC_DISCOVERY_THREAD_NAME); + if (IS_ERR(kthread)) { + dev_err(xpc_part, "failed while forking discovery thread\n"); + + /* mark this new thread as a non-starter */ + complete(&xpc_discovery_exited); + + xpc_do_exit(xpUnloading); + return -EBUSY; + } + + /* set the interface to point at XPC's functions */ + xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect, + xpc_initiate_send, xpc_initiate_send_notify, + xpc_initiate_received, xpc_initiate_partid_to_nasids); + + return 0; + + /* initialization was not successful */ +out_3: + xpc_teardown_rsvd_page(); + + (void)unregister_die_notifier(&xpc_die_notifier); + (void)unregister_reboot_notifier(&xpc_reboot_notifier); +out_2: + if (xpc_sysctl) + unregister_sysctl_table(xpc_sysctl); + + xpc_teardown_partitions(); +out_1: + if (is_shub()) + xpc_exit_sn2(); + else if (is_uv()) + xpc_exit_uv(); + return ret; +} + +module_init(xpc_init); + +void __exit +xpc_exit(void) +{ + xpc_do_exit(xpUnloading); +} + +module_exit(xpc_exit); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION("Cross Partition Communication (XPC) support"); +MODULE_LICENSE("GPL"); + +module_param(xpc_hb_interval, int, 0); +MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between " + "heartbeat increments."); + +module_param(xpc_hb_check_interval, int, 0); +MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between " + "heartbeat checks."); + +module_param(xpc_disengage_timelimit, int, 0); +MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait " + "for disengage to complete."); + +module_param(xpc_kdebug_ignore, int, 0); +MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by " + "other partitions when dropping into kdebug."); diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c new file mode 100644 index 000000000..519826ba1 --- /dev/null +++ b/drivers/misc/sgi-xp/xpc_partition.c @@ -0,0 +1,540 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition Communication (XPC) partition support. + * + * This is the part of XPC that detects the presence/absence of + * other partitions. It provides a heartbeat and monitors the + * heartbeats of other partitions. + * + */ + +#include <linux/device.h> +#include <linux/hardirq.h> +#include <linux/slab.h> +#include "xpc.h" +#include <asm/uv/uv_hub.h> + +/* XPC is exiting flag */ +int xpc_exiting; + +/* this partition's reserved page pointers */ +struct xpc_rsvd_page *xpc_rsvd_page; +static unsigned long *xpc_part_nasids; +unsigned long *xpc_mach_nasids; + +static int xpc_nasid_mask_nbytes; /* #of bytes in nasid mask */ +int xpc_nasid_mask_nlongs; /* #of longs in nasid mask */ + +struct xpc_partition *xpc_partitions; + +/* + * Guarantee that the kmalloc'd memory is cacheline aligned. + */ +void * +xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kmalloc will give us cachline aligned memory by default */ + *base = kmalloc(size, flags); + if (*base == NULL) + return NULL; + + if ((u64)*base == L1_CACHE_ALIGN((u64)*base)) + return *base; + + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kmalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) + return NULL; + + return (void *)L1_CACHE_ALIGN((u64)*base); +} + +/* + * Given a nasid, get the physical address of the partition's reserved page + * for that nasid. This function returns 0 on any error. + */ +static unsigned long +xpc_get_rsvd_page_pa(int nasid) +{ + enum xp_retval ret; + u64 cookie = 0; + unsigned long rp_pa = nasid; /* seed with nasid */ + size_t len = 0; + size_t buf_len = 0; + void *buf = NULL; + void *buf_base = NULL; + enum xp_retval (*get_partition_rsvd_page_pa) + (void *, u64 *, unsigned long *, size_t *) = + xpc_arch_ops.get_partition_rsvd_page_pa; + + while (1) { + + /* !!! rp_pa will need to be _gpa on UV. + * ??? So do we save it into the architecture specific parts + * ??? of the xpc_partition structure? Do we rename this + * ??? function or have two versions? Rename rp_pa for UV to + * ??? rp_gpa? + */ + ret = get_partition_rsvd_page_pa(buf, &cookie, &rp_pa, &len); + + dev_dbg(xpc_part, "SAL returned with ret=%d, cookie=0x%016lx, " + "address=0x%016lx, len=0x%016lx\n", ret, + (unsigned long)cookie, rp_pa, len); + + if (ret != xpNeedMoreInfo) + break; + + /* !!! L1_CACHE_ALIGN() is only a sn2-bte_copy requirement */ + if (is_shub()) + len = L1_CACHE_ALIGN(len); + + if (len > buf_len) { + if (buf_base != NULL) + kfree(buf_base); + buf_len = L1_CACHE_ALIGN(len); + buf = xpc_kmalloc_cacheline_aligned(buf_len, GFP_KERNEL, + &buf_base); + if (buf_base == NULL) { + dev_err(xpc_part, "unable to kmalloc " + "len=0x%016lx\n", buf_len); + ret = xpNoMemory; + break; + } + } + + ret = xp_remote_memcpy(xp_pa(buf), rp_pa, len); + if (ret != xpSuccess) { + dev_dbg(xpc_part, "xp_remote_memcpy failed %d\n", ret); + break; + } + } + + kfree(buf_base); + + if (ret != xpSuccess) + rp_pa = 0; + + dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa); + return rp_pa; +} + +/* + * Fill the partition reserved page with the information needed by + * other partitions to discover we are alive and establish initial + * communications. + */ +int +xpc_setup_rsvd_page(void) +{ + int ret; + struct xpc_rsvd_page *rp; + unsigned long rp_pa; + unsigned long new_ts_jiffies; + + /* get the local reserved page's address */ + + preempt_disable(); + rp_pa = xpc_get_rsvd_page_pa(xp_cpu_to_nasid(smp_processor_id())); + preempt_enable(); + if (rp_pa == 0) { + dev_err(xpc_part, "SAL failed to locate the reserved page\n"); + return -ESRCH; + } + rp = (struct xpc_rsvd_page *)__va(xp_socket_pa(rp_pa)); + + if (rp->SAL_version < 3) { + /* SAL_versions < 3 had a SAL_partid defined as a u8 */ + rp->SAL_partid &= 0xff; + } + BUG_ON(rp->SAL_partid != xp_partition_id); + + if (rp->SAL_partid < 0 || rp->SAL_partid >= xp_max_npartitions) { + dev_err(xpc_part, "the reserved page's partid of %d is outside " + "supported range (< 0 || >= %d)\n", rp->SAL_partid, + xp_max_npartitions); + return -EINVAL; + } + + rp->version = XPC_RP_VERSION; + rp->max_npartitions = xp_max_npartitions; + + /* establish the actual sizes of the nasid masks */ + if (rp->SAL_version == 1) { + /* SAL_version 1 didn't set the nasids_size field */ + rp->SAL_nasids_size = 128; + } + xpc_nasid_mask_nbytes = rp->SAL_nasids_size; + xpc_nasid_mask_nlongs = BITS_TO_LONGS(rp->SAL_nasids_size * + BITS_PER_BYTE); + + /* setup the pointers to the various items in the reserved page */ + xpc_part_nasids = XPC_RP_PART_NASIDS(rp); + xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp); + + ret = xpc_arch_ops.setup_rsvd_page(rp); + if (ret != 0) + return ret; + + /* + * Set timestamp of when reserved page was setup by XPC. + * This signifies to the remote partition that our reserved + * page is initialized. + */ + new_ts_jiffies = jiffies; + if (new_ts_jiffies == 0 || new_ts_jiffies == rp->ts_jiffies) + new_ts_jiffies++; + rp->ts_jiffies = new_ts_jiffies; + + xpc_rsvd_page = rp; + return 0; +} + +void +xpc_teardown_rsvd_page(void) +{ + /* a zero timestamp indicates our rsvd page is not initialized */ + xpc_rsvd_page->ts_jiffies = 0; +} + +/* + * Get a copy of a portion of the remote partition's rsvd page. + * + * remote_rp points to a buffer that is cacheline aligned for BTE copies and + * is large enough to contain a copy of their reserved page header and + * part_nasids mask. + */ +enum xp_retval +xpc_get_remote_rp(int nasid, unsigned long *discovered_nasids, + struct xpc_rsvd_page *remote_rp, unsigned long *remote_rp_pa) +{ + int l; + enum xp_retval ret; + + /* get the reserved page's physical address */ + + *remote_rp_pa = xpc_get_rsvd_page_pa(nasid); + if (*remote_rp_pa == 0) + return xpNoRsvdPageAddr; + + /* pull over the reserved page header and part_nasids mask */ + ret = xp_remote_memcpy(xp_pa(remote_rp), *remote_rp_pa, + XPC_RP_HEADER_SIZE + xpc_nasid_mask_nbytes); + if (ret != xpSuccess) + return ret; + + if (discovered_nasids != NULL) { + unsigned long *remote_part_nasids = + XPC_RP_PART_NASIDS(remote_rp); + + for (l = 0; l < xpc_nasid_mask_nlongs; l++) + discovered_nasids[l] |= remote_part_nasids[l]; + } + + /* zero timestamp indicates the reserved page has not been setup */ + if (remote_rp->ts_jiffies == 0) + return xpRsvdPageNotSet; + + if (XPC_VERSION_MAJOR(remote_rp->version) != + XPC_VERSION_MAJOR(XPC_RP_VERSION)) { + return xpBadVersion; + } + + /* check that both remote and local partids are valid for each side */ + if (remote_rp->SAL_partid < 0 || + remote_rp->SAL_partid >= xp_max_npartitions || + remote_rp->max_npartitions <= xp_partition_id) { + return xpInvalidPartid; + } + + if (remote_rp->SAL_partid == xp_partition_id) + return xpLocalPartid; + + return xpSuccess; +} + +/* + * See if the other side has responded to a partition deactivate request + * from us. Though we requested the remote partition to deactivate with regard + * to us, we really only need to wait for the other side to disengage from us. + */ +int +xpc_partition_disengaged(struct xpc_partition *part) +{ + short partid = XPC_PARTID(part); + int disengaged; + + disengaged = !xpc_arch_ops.partition_engaged(partid); + if (part->disengage_timeout) { + if (!disengaged) { + if (time_is_after_jiffies(part->disengage_timeout)) { + /* timelimit hasn't been reached yet */ + return 0; + } + + /* + * Other side hasn't responded to our deactivate + * request in a timely fashion, so assume it's dead. + */ + + dev_info(xpc_part, "deactivate request to remote " + "partition %d timed out\n", partid); + xpc_disengage_timedout = 1; + xpc_arch_ops.assume_partition_disengaged(partid); + disengaged = 1; + } + part->disengage_timeout = 0; + + /* cancel the timer function, provided it's not us */ + if (!in_interrupt()) + del_singleshot_timer_sync(&part->disengage_timer); + + DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING && + part->act_state != XPC_P_AS_INACTIVE); + if (part->act_state != XPC_P_AS_INACTIVE) + xpc_wakeup_channel_mgr(part); + + xpc_arch_ops.cancel_partition_deactivation_request(part); + } + return disengaged; +} + +/* + * Mark specified partition as active. + */ +enum xp_retval +xpc_mark_partition_active(struct xpc_partition *part) +{ + unsigned long irq_flags; + enum xp_retval ret; + + dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part)); + + spin_lock_irqsave(&part->act_lock, irq_flags); + if (part->act_state == XPC_P_AS_ACTIVATING) { + part->act_state = XPC_P_AS_ACTIVE; + ret = xpSuccess; + } else { + DBUG_ON(part->reason == xpSuccess); + ret = part->reason; + } + spin_unlock_irqrestore(&part->act_lock, irq_flags); + + return ret; +} + +/* + * Start the process of deactivating the specified partition. + */ +void +xpc_deactivate_partition(const int line, struct xpc_partition *part, + enum xp_retval reason) +{ + unsigned long irq_flags; + + spin_lock_irqsave(&part->act_lock, irq_flags); + + if (part->act_state == XPC_P_AS_INACTIVE) { + XPC_SET_REASON(part, reason, line); + spin_unlock_irqrestore(&part->act_lock, irq_flags); + if (reason == xpReactivating) { + /* we interrupt ourselves to reactivate partition */ + xpc_arch_ops.request_partition_reactivation(part); + } + return; + } + if (part->act_state == XPC_P_AS_DEACTIVATING) { + if ((part->reason == xpUnloading && reason != xpUnloading) || + reason == xpReactivating) { + XPC_SET_REASON(part, reason, line); + } + spin_unlock_irqrestore(&part->act_lock, irq_flags); + return; + } + + part->act_state = XPC_P_AS_DEACTIVATING; + XPC_SET_REASON(part, reason, line); + + spin_unlock_irqrestore(&part->act_lock, irq_flags); + + /* ask remote partition to deactivate with regard to us */ + xpc_arch_ops.request_partition_deactivation(part); + + /* set a timelimit on the disengage phase of the deactivation request */ + part->disengage_timeout = jiffies + (xpc_disengage_timelimit * HZ); + part->disengage_timer.expires = part->disengage_timeout; + add_timer(&part->disengage_timer); + + dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n", + XPC_PARTID(part), reason); + + xpc_partition_going_down(part, reason); +} + +/* + * Mark specified partition as inactive. + */ +void +xpc_mark_partition_inactive(struct xpc_partition *part) +{ + unsigned long irq_flags; + + dev_dbg(xpc_part, "setting partition %d to INACTIVE\n", + XPC_PARTID(part)); + + spin_lock_irqsave(&part->act_lock, irq_flags); + part->act_state = XPC_P_AS_INACTIVE; + spin_unlock_irqrestore(&part->act_lock, irq_flags); + part->remote_rp_pa = 0; +} + +/* + * SAL has provided a partition and machine mask. The partition mask + * contains a bit for each even nasid in our partition. The machine + * mask contains a bit for each even nasid in the entire machine. + * + * Using those two bit arrays, we can determine which nasids are + * known in the machine. Each should also have a reserved page + * initialized if they are available for partitioning. + */ +void +xpc_discovery(void) +{ + void *remote_rp_base; + struct xpc_rsvd_page *remote_rp; + unsigned long remote_rp_pa; + int region; + int region_size; + int max_regions; + int nasid; + unsigned long *discovered_nasids; + enum xp_retval ret; + + remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE + + xpc_nasid_mask_nbytes, + GFP_KERNEL, &remote_rp_base); + if (remote_rp == NULL) + return; + + discovered_nasids = kcalloc(xpc_nasid_mask_nlongs, sizeof(long), + GFP_KERNEL); + if (discovered_nasids == NULL) { + kfree(remote_rp_base); + return; + } + + /* + * The term 'region' in this context refers to the minimum number of + * nodes that can comprise an access protection grouping. The access + * protection is in regards to memory, IOI and IPI. + */ + region_size = xp_region_size; + + if (is_uv()) + max_regions = 256; + else { + max_regions = 64; + + switch (region_size) { + case 128: + max_regions *= 2; + /* fall through */ + case 64: + max_regions *= 2; + /* fall through */ + case 32: + max_regions *= 2; + region_size = 16; + DBUG_ON(!is_shub2()); + } + } + + for (region = 0; region < max_regions; region++) { + + if (xpc_exiting) + break; + + dev_dbg(xpc_part, "searching region %d\n", region); + + for (nasid = (region * region_size * 2); + nasid < ((region + 1) * region_size * 2); nasid += 2) { + + if (xpc_exiting) + break; + + dev_dbg(xpc_part, "checking nasid %d\n", nasid); + + if (test_bit(nasid / 2, xpc_part_nasids)) { + dev_dbg(xpc_part, "PROM indicates Nasid %d is " + "part of the local partition; skipping " + "region\n", nasid); + break; + } + + if (!(test_bit(nasid / 2, xpc_mach_nasids))) { + dev_dbg(xpc_part, "PROM indicates Nasid %d was " + "not on Numa-Link network at reset\n", + nasid); + continue; + } + + if (test_bit(nasid / 2, discovered_nasids)) { + dev_dbg(xpc_part, "Nasid %d is part of a " + "partition which was previously " + "discovered\n", nasid); + continue; + } + + /* pull over the rsvd page header & part_nasids mask */ + + ret = xpc_get_remote_rp(nasid, discovered_nasids, + remote_rp, &remote_rp_pa); + if (ret != xpSuccess) { + dev_dbg(xpc_part, "unable to get reserved page " + "from nasid %d, reason=%d\n", nasid, + ret); + + if (ret == xpLocalPartid) + break; + + continue; + } + + xpc_arch_ops.request_partition_activation(remote_rp, + remote_rp_pa, nasid); + } + } + + kfree(discovered_nasids); + kfree(remote_rp_base); +} + +/* + * Given a partid, get the nasids owned by that partition from the + * remote partition's reserved page. + */ +enum xp_retval +xpc_initiate_partid_to_nasids(short partid, void *nasid_mask) +{ + struct xpc_partition *part; + unsigned long part_nasid_pa; + + part = &xpc_partitions[partid]; + if (part->remote_rp_pa == 0) + return xpPartitionDown; + + memset(nasid_mask, 0, xpc_nasid_mask_nbytes); + + part_nasid_pa = (unsigned long)XPC_RP_PART_NASIDS(part->remote_rp_pa); + + return xp_remote_memcpy(xp_pa(nasid_mask), part_nasid_pa, + xpc_nasid_mask_nbytes); +} diff --git a/drivers/misc/sgi-xp/xpc_sn2.c b/drivers/misc/sgi-xp/xpc_sn2.c new file mode 100644 index 000000000..5a12d2a54 --- /dev/null +++ b/drivers/misc/sgi-xp/xpc_sn2.c @@ -0,0 +1,2459 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition Communication (XPC) sn2-based functions. + * + * Architecture specific implementation of common functions. + * + */ + +#include <linux/delay.h> +#include <linux/slab.h> +#include <asm/uncached.h> +#include <asm/sn/mspec.h> +#include <asm/sn/sn_sal.h> +#include "xpc.h" + +/* + * Define the number of u64s required to represent all the C-brick nasids + * as a bitmap. The cross-partition kernel modules deal only with + * C-brick nasids, thus the need for bitmaps which don't account for + * odd-numbered (non C-brick) nasids. + */ +#define XPC_MAX_PHYSNODES_SN2 (MAX_NUMALINK_NODES / 2) +#define XP_NASID_MASK_BYTES_SN2 ((XPC_MAX_PHYSNODES_SN2 + 7) / 8) +#define XP_NASID_MASK_WORDS_SN2 ((XPC_MAX_PHYSNODES_SN2 + 63) / 64) + +/* + * Memory for XPC's amo variables is allocated by the MSPEC driver. These + * pages are located in the lowest granule. The lowest granule uses 4k pages + * for cached references and an alternate TLB handler to never provide a + * cacheable mapping for the entire region. This will prevent speculative + * reading of cached copies of our lines from being issued which will cause + * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64 + * amo variables (based on XP_MAX_NPARTITIONS_SN2) to identify the senders of + * NOTIFY IRQs, 128 amo variables (based on XP_NASID_MASK_WORDS_SN2) to identify + * the senders of ACTIVATE IRQs, 1 amo variable to identify which remote + * partitions (i.e., XPCs) consider themselves currently engaged with the + * local XPC and 1 amo variable to request partition deactivation. + */ +#define XPC_NOTIFY_IRQ_AMOS_SN2 0 +#define XPC_ACTIVATE_IRQ_AMOS_SN2 (XPC_NOTIFY_IRQ_AMOS_SN2 + \ + XP_MAX_NPARTITIONS_SN2) +#define XPC_ENGAGED_PARTITIONS_AMO_SN2 (XPC_ACTIVATE_IRQ_AMOS_SN2 + \ + XP_NASID_MASK_WORDS_SN2) +#define XPC_DEACTIVATE_REQUEST_AMO_SN2 (XPC_ENGAGED_PARTITIONS_AMO_SN2 + 1) + +/* + * Buffer used to store a local copy of portions of a remote partition's + * reserved page (either its header and part_nasids mask, or its vars). + */ +static void *xpc_remote_copy_buffer_base_sn2; +static char *xpc_remote_copy_buffer_sn2; + +static struct xpc_vars_sn2 *xpc_vars_sn2; +static struct xpc_vars_part_sn2 *xpc_vars_part_sn2; + +static int +xpc_setup_partitions_sn2(void) +{ + /* nothing needs to be done */ + return 0; +} + +static void +xpc_teardown_partitions_sn2(void) +{ + /* nothing needs to be done */ +} + +/* SH_IPI_ACCESS shub register value on startup */ +static u64 xpc_sh1_IPI_access_sn2; +static u64 xpc_sh2_IPI_access0_sn2; +static u64 xpc_sh2_IPI_access1_sn2; +static u64 xpc_sh2_IPI_access2_sn2; +static u64 xpc_sh2_IPI_access3_sn2; + +/* + * Change protections to allow IPI operations. + */ +static void +xpc_allow_IPI_ops_sn2(void) +{ + int node; + int nasid; + + /* !!! The following should get moved into SAL. */ + if (is_shub2()) { + xpc_sh2_IPI_access0_sn2 = + (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0)); + xpc_sh2_IPI_access1_sn2 = + (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1)); + xpc_sh2_IPI_access2_sn2 = + (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2)); + xpc_sh2_IPI_access3_sn2 = + (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3)); + + for_each_online_node(node) { + nasid = cnodeid_to_nasid(node); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0), + -1UL); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1), + -1UL); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2), + -1UL); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3), + -1UL); + } + } else { + xpc_sh1_IPI_access_sn2 = + (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS)); + + for_each_online_node(node) { + nasid = cnodeid_to_nasid(node); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS), + -1UL); + } + } +} + +/* + * Restrict protections to disallow IPI operations. + */ +static void +xpc_disallow_IPI_ops_sn2(void) +{ + int node; + int nasid; + + /* !!! The following should get moved into SAL. */ + if (is_shub2()) { + for_each_online_node(node) { + nasid = cnodeid_to_nasid(node); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0), + xpc_sh2_IPI_access0_sn2); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1), + xpc_sh2_IPI_access1_sn2); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2), + xpc_sh2_IPI_access2_sn2); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3), + xpc_sh2_IPI_access3_sn2); + } + } else { + for_each_online_node(node) { + nasid = cnodeid_to_nasid(node); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS), + xpc_sh1_IPI_access_sn2); + } + } +} + +/* + * The following set of functions are used for the sending and receiving of + * IRQs (also known as IPIs). There are two flavors of IRQs, one that is + * associated with partition activity (SGI_XPC_ACTIVATE) and the other that + * is associated with channel activity (SGI_XPC_NOTIFY). + */ + +static u64 +xpc_receive_IRQ_amo_sn2(struct amo *amo) +{ + return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR); +} + +static enum xp_retval +xpc_send_IRQ_sn2(struct amo *amo, u64 flag, int nasid, int phys_cpuid, + int vector) +{ + int ret = 0; + unsigned long irq_flags; + + local_irq_save(irq_flags); + + FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag); + sn_send_IPI_phys(nasid, phys_cpuid, vector, 0); + + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IRQs and amos to it until the heartbeat times out. + */ + ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable), + xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); + + return (ret == 0) ? xpSuccess : xpPioReadError; +} + +static struct amo * +xpc_init_IRQ_amo_sn2(int index) +{ + struct amo *amo = xpc_vars_sn2->amos_page + index; + + (void)xpc_receive_IRQ_amo_sn2(amo); /* clear amo variable */ + return amo; +} + +/* + * Functions associated with SGI_XPC_ACTIVATE IRQ. + */ + +/* + * Notify the heartbeat check thread that an activate IRQ has been received. + */ +static irqreturn_t +xpc_handle_activate_IRQ_sn2(int irq, void *dev_id) +{ + unsigned long irq_flags; + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + xpc_activate_IRQ_rcvd++; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + wake_up_interruptible(&xpc_activate_IRQ_wq); + return IRQ_HANDLED; +} + +/* + * Flag the appropriate amo variable and send an IRQ to the specified node. + */ +static void +xpc_send_activate_IRQ_sn2(unsigned long amos_page_pa, int from_nasid, + int to_nasid, int to_phys_cpuid) +{ + struct amo *amos = (struct amo *)__va(amos_page_pa + + (XPC_ACTIVATE_IRQ_AMOS_SN2 * + sizeof(struct amo))); + + (void)xpc_send_IRQ_sn2(&amos[BIT_WORD(from_nasid / 2)], + BIT_MASK(from_nasid / 2), to_nasid, + to_phys_cpuid, SGI_XPC_ACTIVATE); +} + +static void +xpc_send_local_activate_IRQ_sn2(int from_nasid) +{ + unsigned long irq_flags; + struct amo *amos = (struct amo *)__va(xpc_vars_sn2->amos_page_pa + + (XPC_ACTIVATE_IRQ_AMOS_SN2 * + sizeof(struct amo))); + + /* fake the sending and receipt of an activate IRQ from remote nasid */ + FETCHOP_STORE_OP(TO_AMO((u64)&amos[BIT_WORD(from_nasid / 2)].variable), + FETCHOP_OR, BIT_MASK(from_nasid / 2)); + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + xpc_activate_IRQ_rcvd++; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + wake_up_interruptible(&xpc_activate_IRQ_wq); +} + +/* + * Functions associated with SGI_XPC_NOTIFY IRQ. + */ + +/* + * Check to see if any chctl flags were sent from the specified partition. + */ +static void +xpc_check_for_sent_chctl_flags_sn2(struct xpc_partition *part) +{ + union xpc_channel_ctl_flags chctl; + unsigned long irq_flags; + + chctl.all_flags = xpc_receive_IRQ_amo_sn2(part->sn.sn2. + local_chctl_amo_va); + if (chctl.all_flags == 0) + return; + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.all_flags |= chctl.all_flags; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + dev_dbg(xpc_chan, "received notify IRQ from partid=%d, chctl.all_flags=" + "0x%llx\n", XPC_PARTID(part), chctl.all_flags); + + xpc_wakeup_channel_mgr(part); +} + +/* + * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified + * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more + * than one partition, we use an amo structure per partition to indicate + * whether a partition has sent an IRQ or not. If it has, then wake up the + * associated kthread to handle it. + * + * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IRQs sent by XPC + * running on other partitions. + * + * Noteworthy Arguments: + * + * irq - Interrupt ReQuest number. NOT USED. + * + * dev_id - partid of IRQ's potential sender. + */ +static irqreturn_t +xpc_handle_notify_IRQ_sn2(int irq, void *dev_id) +{ + short partid = (short)(u64)dev_id; + struct xpc_partition *part = &xpc_partitions[partid]; + + DBUG_ON(partid < 0 || partid >= XP_MAX_NPARTITIONS_SN2); + + if (xpc_part_ref(part)) { + xpc_check_for_sent_chctl_flags_sn2(part); + + xpc_part_deref(part); + } + return IRQ_HANDLED; +} + +/* + * Check to see if xpc_handle_notify_IRQ_sn2() dropped any IRQs on the floor + * because the write to their associated amo variable completed after the IRQ + * was received. + */ +static void +xpc_check_for_dropped_notify_IRQ_sn2(struct timer_list *t) +{ + struct xpc_partition *part = + from_timer(part, t, sn.sn2.dropped_notify_IRQ_timer); + + if (xpc_part_ref(part)) { + xpc_check_for_sent_chctl_flags_sn2(part); + + t->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL; + add_timer(t); + xpc_part_deref(part); + } +} + +/* + * Send a notify IRQ to the remote partition that is associated with the + * specified channel. + */ +static void +xpc_send_notify_IRQ_sn2(struct xpc_channel *ch, u8 chctl_flag, + char *chctl_flag_string, unsigned long *irq_flags) +{ + struct xpc_partition *part = &xpc_partitions[ch->partid]; + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + union xpc_channel_ctl_flags chctl = { 0 }; + enum xp_retval ret; + + if (likely(part->act_state != XPC_P_AS_DEACTIVATING)) { + chctl.flags[ch->number] = chctl_flag; + ret = xpc_send_IRQ_sn2(part_sn2->remote_chctl_amo_va, + chctl.all_flags, + part_sn2->notify_IRQ_nasid, + part_sn2->notify_IRQ_phys_cpuid, + SGI_XPC_NOTIFY); + dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n", + chctl_flag_string, ch->partid, ch->number, ret); + if (unlikely(ret != xpSuccess)) { + if (irq_flags != NULL) + spin_unlock_irqrestore(&ch->lock, *irq_flags); + XPC_DEACTIVATE_PARTITION(part, ret); + if (irq_flags != NULL) + spin_lock_irqsave(&ch->lock, *irq_flags); + } + } +} + +#define XPC_SEND_NOTIFY_IRQ_SN2(_ch, _ipi_f, _irq_f) \ + xpc_send_notify_IRQ_sn2(_ch, _ipi_f, #_ipi_f, _irq_f) + +/* + * Make it look like the remote partition, which is associated with the + * specified channel, sent us a notify IRQ. This faked IRQ will be handled + * by xpc_check_for_dropped_notify_IRQ_sn2(). + */ +static void +xpc_send_local_notify_IRQ_sn2(struct xpc_channel *ch, u8 chctl_flag, + char *chctl_flag_string) +{ + struct xpc_partition *part = &xpc_partitions[ch->partid]; + union xpc_channel_ctl_flags chctl = { 0 }; + + chctl.flags[ch->number] = chctl_flag; + FETCHOP_STORE_OP(TO_AMO((u64)&part->sn.sn2.local_chctl_amo_va-> + variable), FETCHOP_OR, chctl.all_flags); + dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n", + chctl_flag_string, ch->partid, ch->number); +} + +#define XPC_SEND_LOCAL_NOTIFY_IRQ_SN2(_ch, _ipi_f) \ + xpc_send_local_notify_IRQ_sn2(_ch, _ipi_f, #_ipi_f) + +static void +xpc_send_chctl_closerequest_sn2(struct xpc_channel *ch, + unsigned long *irq_flags) +{ + struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args; + + args->reason = ch->reason; + XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_CLOSEREQUEST, irq_flags); +} + +static void +xpc_send_chctl_closereply_sn2(struct xpc_channel *ch, unsigned long *irq_flags) +{ + XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_CLOSEREPLY, irq_flags); +} + +static void +xpc_send_chctl_openrequest_sn2(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args; + + args->entry_size = ch->entry_size; + args->local_nentries = ch->local_nentries; + XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENREQUEST, irq_flags); +} + +static void +xpc_send_chctl_openreply_sn2(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args; + + args->remote_nentries = ch->remote_nentries; + args->local_nentries = ch->local_nentries; + args->local_msgqueue_pa = xp_pa(ch->sn.sn2.local_msgqueue); + XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENREPLY, irq_flags); +} + +static void +xpc_send_chctl_opencomplete_sn2(struct xpc_channel *ch, + unsigned long *irq_flags) +{ + XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENCOMPLETE, irq_flags); +} + +static void +xpc_send_chctl_msgrequest_sn2(struct xpc_channel *ch) +{ + XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_MSGREQUEST, NULL); +} + +static void +xpc_send_chctl_local_msgrequest_sn2(struct xpc_channel *ch) +{ + XPC_SEND_LOCAL_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_MSGREQUEST); +} + +static enum xp_retval +xpc_save_remote_msgqueue_pa_sn2(struct xpc_channel *ch, + unsigned long msgqueue_pa) +{ + ch->sn.sn2.remote_msgqueue_pa = msgqueue_pa; + return xpSuccess; +} + +/* + * This next set of functions are used to keep track of when a partition is + * potentially engaged in accessing memory belonging to another partition. + */ + +static void +xpc_indicate_partition_engaged_sn2(struct xpc_partition *part) +{ + unsigned long irq_flags; + struct amo *amo = (struct amo *)__va(part->sn.sn2.remote_amos_page_pa + + (XPC_ENGAGED_PARTITIONS_AMO_SN2 * + sizeof(struct amo))); + + local_irq_save(irq_flags); + + /* set bit corresponding to our partid in remote partition's amo */ + FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, + BIT(sn_partition_id)); + + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IRQs and amos to it until the heartbeat times out. + */ + (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), + xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static void +xpc_indicate_partition_disengaged_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + unsigned long irq_flags; + struct amo *amo = (struct amo *)__va(part_sn2->remote_amos_page_pa + + (XPC_ENGAGED_PARTITIONS_AMO_SN2 * + sizeof(struct amo))); + + local_irq_save(irq_flags); + + /* clear bit corresponding to our partid in remote partition's amo */ + FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND, + ~BIT(sn_partition_id)); + + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IRQs and amos to it until the heartbeat times out. + */ + (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), + xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); + + /* + * Send activate IRQ to get other side to see that we've cleared our + * bit in their engaged partitions amo. + */ + xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa, + cnodeid_to_nasid(0), + part_sn2->activate_IRQ_nasid, + part_sn2->activate_IRQ_phys_cpuid); +} + +static void +xpc_assume_partition_disengaged_sn2(short partid) +{ + struct amo *amo = xpc_vars_sn2->amos_page + + XPC_ENGAGED_PARTITIONS_AMO_SN2; + + /* clear bit(s) based on partid mask in our partition's amo */ + FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND, + ~BIT(partid)); +} + +static int +xpc_partition_engaged_sn2(short partid) +{ + struct amo *amo = xpc_vars_sn2->amos_page + + XPC_ENGAGED_PARTITIONS_AMO_SN2; + + /* our partition's amo variable ANDed with partid mask */ + return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) & + BIT(partid)) != 0; +} + +static int +xpc_any_partition_engaged_sn2(void) +{ + struct amo *amo = xpc_vars_sn2->amos_page + + XPC_ENGAGED_PARTITIONS_AMO_SN2; + + /* our partition's amo variable */ + return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) != 0; +} + +/* original protection values for each node */ +static u64 xpc_prot_vec_sn2[MAX_NUMNODES]; + +/* + * Change protections to allow amo operations on non-Shub 1.1 systems. + */ +static enum xp_retval +xpc_allow_amo_ops_sn2(struct amo *amos_page) +{ + enum xp_retval ret = xpSuccess; + + /* + * On SHUB 1.1, we cannot call sn_change_memprotect() since the BIST + * collides with memory operations. On those systems we call + * xpc_allow_amo_ops_shub_wars_1_1_sn2() instead. + */ + if (!enable_shub_wars_1_1()) + ret = xp_expand_memprotect(ia64_tpa((u64)amos_page), PAGE_SIZE); + + return ret; +} + +/* + * Change protections to allow amo operations on Shub 1.1 systems. + */ +static void +xpc_allow_amo_ops_shub_wars_1_1_sn2(void) +{ + int node; + int nasid; + + if (!enable_shub_wars_1_1()) + return; + + for_each_online_node(node) { + nasid = cnodeid_to_nasid(node); + /* save current protection values */ + xpc_prot_vec_sn2[node] = + (u64)HUB_L((u64 *)GLOBAL_MMR_ADDR(nasid, + SH1_MD_DQLP_MMR_DIR_PRIVEC0)); + /* open up everything */ + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, + SH1_MD_DQLP_MMR_DIR_PRIVEC0), + -1UL); + HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, + SH1_MD_DQRP_MMR_DIR_PRIVEC0), + -1UL); + } +} + +static enum xp_retval +xpc_get_partition_rsvd_page_pa_sn2(void *buf, u64 *cookie, unsigned long *rp_pa, + size_t *len) +{ + s64 status; + enum xp_retval ret; + + status = sn_partition_reserved_page_pa((u64)buf, cookie, + (u64 *)rp_pa, (u64 *)len); + if (status == SALRET_OK) + ret = xpSuccess; + else if (status == SALRET_MORE_PASSES) + ret = xpNeedMoreInfo; + else + ret = xpSalError; + + return ret; +} + + +static int +xpc_setup_rsvd_page_sn2(struct xpc_rsvd_page *rp) +{ + struct amo *amos_page; + int i; + int ret; + + xpc_vars_sn2 = XPC_RP_VARS(rp); + + rp->sn.sn2.vars_pa = xp_pa(xpc_vars_sn2); + + /* vars_part array follows immediately after vars */ + xpc_vars_part_sn2 = (struct xpc_vars_part_sn2 *)((u8 *)XPC_RP_VARS(rp) + + XPC_RP_VARS_SIZE); + + /* + * Before clearing xpc_vars_sn2, see if a page of amos had been + * previously allocated. If not we'll need to allocate one and set + * permissions so that cross-partition amos are allowed. + * + * The allocated amo page needs MCA reporting to remain disabled after + * XPC has unloaded. To make this work, we keep a copy of the pointer + * to this page (i.e., amos_page) in the struct xpc_vars_sn2 structure, + * which is pointed to by the reserved page, and re-use that saved copy + * on subsequent loads of XPC. This amo page is never freed, and its + * memory protections are never restricted. + */ + amos_page = xpc_vars_sn2->amos_page; + if (amos_page == NULL) { + amos_page = (struct amo *)TO_AMO(uncached_alloc_page(0, 1)); + if (amos_page == NULL) { + dev_err(xpc_part, "can't allocate page of amos\n"); + return -ENOMEM; + } + + /* + * Open up amo-R/W to cpu. This is done on Shub 1.1 systems + * when xpc_allow_amo_ops_shub_wars_1_1_sn2() is called. + */ + ret = xpc_allow_amo_ops_sn2(amos_page); + if (ret != xpSuccess) { + dev_err(xpc_part, "can't allow amo operations\n"); + uncached_free_page(__IA64_UNCACHED_OFFSET | + TO_PHYS((u64)amos_page), 1); + return -EPERM; + } + } + + /* clear xpc_vars_sn2 */ + memset(xpc_vars_sn2, 0, sizeof(struct xpc_vars_sn2)); + + xpc_vars_sn2->version = XPC_V_VERSION; + xpc_vars_sn2->activate_IRQ_nasid = cpuid_to_nasid(0); + xpc_vars_sn2->activate_IRQ_phys_cpuid = cpu_physical_id(0); + xpc_vars_sn2->vars_part_pa = xp_pa(xpc_vars_part_sn2); + xpc_vars_sn2->amos_page_pa = ia64_tpa((u64)amos_page); + xpc_vars_sn2->amos_page = amos_page; /* save for next load of XPC */ + + /* clear xpc_vars_part_sn2 */ + memset((u64 *)xpc_vars_part_sn2, 0, sizeof(struct xpc_vars_part_sn2) * + XP_MAX_NPARTITIONS_SN2); + + /* initialize the activate IRQ related amo variables */ + for (i = 0; i < xpc_nasid_mask_nlongs; i++) + (void)xpc_init_IRQ_amo_sn2(XPC_ACTIVATE_IRQ_AMOS_SN2 + i); + + /* initialize the engaged remote partitions related amo variables */ + (void)xpc_init_IRQ_amo_sn2(XPC_ENGAGED_PARTITIONS_AMO_SN2); + (void)xpc_init_IRQ_amo_sn2(XPC_DEACTIVATE_REQUEST_AMO_SN2); + + return 0; +} + +static int +xpc_hb_allowed_sn2(short partid, void *heartbeating_to_mask) +{ + return test_bit(partid, heartbeating_to_mask); +} + +static void +xpc_allow_hb_sn2(short partid) +{ + DBUG_ON(xpc_vars_sn2 == NULL); + set_bit(partid, xpc_vars_sn2->heartbeating_to_mask); +} + +static void +xpc_disallow_hb_sn2(short partid) +{ + DBUG_ON(xpc_vars_sn2 == NULL); + clear_bit(partid, xpc_vars_sn2->heartbeating_to_mask); +} + +static void +xpc_disallow_all_hbs_sn2(void) +{ + DBUG_ON(xpc_vars_sn2 == NULL); + bitmap_zero(xpc_vars_sn2->heartbeating_to_mask, xp_max_npartitions); +} + +static void +xpc_increment_heartbeat_sn2(void) +{ + xpc_vars_sn2->heartbeat++; +} + +static void +xpc_offline_heartbeat_sn2(void) +{ + xpc_increment_heartbeat_sn2(); + xpc_vars_sn2->heartbeat_offline = 1; +} + +static void +xpc_online_heartbeat_sn2(void) +{ + xpc_increment_heartbeat_sn2(); + xpc_vars_sn2->heartbeat_offline = 0; +} + +static void +xpc_heartbeat_init_sn2(void) +{ + DBUG_ON(xpc_vars_sn2 == NULL); + + bitmap_zero(xpc_vars_sn2->heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2); + xpc_online_heartbeat_sn2(); +} + +static void +xpc_heartbeat_exit_sn2(void) +{ + xpc_offline_heartbeat_sn2(); +} + +static enum xp_retval +xpc_get_remote_heartbeat_sn2(struct xpc_partition *part) +{ + struct xpc_vars_sn2 *remote_vars; + enum xp_retval ret; + + remote_vars = (struct xpc_vars_sn2 *)xpc_remote_copy_buffer_sn2; + + /* pull the remote vars structure that contains the heartbeat */ + ret = xp_remote_memcpy(xp_pa(remote_vars), + part->sn.sn2.remote_vars_pa, + XPC_RP_VARS_SIZE); + if (ret != xpSuccess) + return ret; + + dev_dbg(xpc_part, "partid=%d, heartbeat=%lld, last_heartbeat=%lld, " + "heartbeat_offline=%lld, HB_mask[0]=0x%lx\n", XPC_PARTID(part), + remote_vars->heartbeat, part->last_heartbeat, + remote_vars->heartbeat_offline, + remote_vars->heartbeating_to_mask[0]); + + if ((remote_vars->heartbeat == part->last_heartbeat && + !remote_vars->heartbeat_offline) || + !xpc_hb_allowed_sn2(sn_partition_id, + remote_vars->heartbeating_to_mask)) { + ret = xpNoHeartbeat; + } else { + part->last_heartbeat = remote_vars->heartbeat; + } + + return ret; +} + +/* + * Get a copy of the remote partition's XPC variables from the reserved page. + * + * remote_vars points to a buffer that is cacheline aligned for BTE copies and + * assumed to be of size XPC_RP_VARS_SIZE. + */ +static enum xp_retval +xpc_get_remote_vars_sn2(unsigned long remote_vars_pa, + struct xpc_vars_sn2 *remote_vars) +{ + enum xp_retval ret; + + if (remote_vars_pa == 0) + return xpVarsNotSet; + + /* pull over the cross partition variables */ + ret = xp_remote_memcpy(xp_pa(remote_vars), remote_vars_pa, + XPC_RP_VARS_SIZE); + if (ret != xpSuccess) + return ret; + + if (XPC_VERSION_MAJOR(remote_vars->version) != + XPC_VERSION_MAJOR(XPC_V_VERSION)) { + return xpBadVersion; + } + + return xpSuccess; +} + +static void +xpc_request_partition_activation_sn2(struct xpc_rsvd_page *remote_rp, + unsigned long remote_rp_pa, int nasid) +{ + xpc_send_local_activate_IRQ_sn2(nasid); +} + +static void +xpc_request_partition_reactivation_sn2(struct xpc_partition *part) +{ + xpc_send_local_activate_IRQ_sn2(part->sn.sn2.activate_IRQ_nasid); +} + +static void +xpc_request_partition_deactivation_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + unsigned long irq_flags; + struct amo *amo = (struct amo *)__va(part_sn2->remote_amos_page_pa + + (XPC_DEACTIVATE_REQUEST_AMO_SN2 * + sizeof(struct amo))); + + local_irq_save(irq_flags); + + /* set bit corresponding to our partid in remote partition's amo */ + FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, + BIT(sn_partition_id)); + + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IRQs and amos to it until the heartbeat times out. + */ + (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), + xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); + + /* + * Send activate IRQ to get other side to see that we've set our + * bit in their deactivate request amo. + */ + xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa, + cnodeid_to_nasid(0), + part_sn2->activate_IRQ_nasid, + part_sn2->activate_IRQ_phys_cpuid); +} + +static void +xpc_cancel_partition_deactivation_request_sn2(struct xpc_partition *part) +{ + unsigned long irq_flags; + struct amo *amo = (struct amo *)__va(part->sn.sn2.remote_amos_page_pa + + (XPC_DEACTIVATE_REQUEST_AMO_SN2 * + sizeof(struct amo))); + + local_irq_save(irq_flags); + + /* clear bit corresponding to our partid in remote partition's amo */ + FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND, + ~BIT(sn_partition_id)); + + /* + * We must always use the nofault function regardless of whether we + * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we + * didn't, we'd never know that the other partition is down and would + * keep sending IRQs and amos to it until the heartbeat times out. + */ + (void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo-> + variable), + xp_nofault_PIOR_target)); + + local_irq_restore(irq_flags); +} + +static int +xpc_partition_deactivation_requested_sn2(short partid) +{ + struct amo *amo = xpc_vars_sn2->amos_page + + XPC_DEACTIVATE_REQUEST_AMO_SN2; + + /* our partition's amo variable ANDed with partid mask */ + return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) & + BIT(partid)) != 0; +} + +/* + * Update the remote partition's info. + */ +static void +xpc_update_partition_info_sn2(struct xpc_partition *part, u8 remote_rp_version, + unsigned long *remote_rp_ts_jiffies, + unsigned long remote_rp_pa, + unsigned long remote_vars_pa, + struct xpc_vars_sn2 *remote_vars) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + + part->remote_rp_version = remote_rp_version; + dev_dbg(xpc_part, " remote_rp_version = 0x%016x\n", + part->remote_rp_version); + + part->remote_rp_ts_jiffies = *remote_rp_ts_jiffies; + dev_dbg(xpc_part, " remote_rp_ts_jiffies = 0x%016lx\n", + part->remote_rp_ts_jiffies); + + part->remote_rp_pa = remote_rp_pa; + dev_dbg(xpc_part, " remote_rp_pa = 0x%016lx\n", part->remote_rp_pa); + + part_sn2->remote_vars_pa = remote_vars_pa; + dev_dbg(xpc_part, " remote_vars_pa = 0x%016lx\n", + part_sn2->remote_vars_pa); + + part->last_heartbeat = remote_vars->heartbeat - 1; + dev_dbg(xpc_part, " last_heartbeat = 0x%016llx\n", + part->last_heartbeat); + + part_sn2->remote_vars_part_pa = remote_vars->vars_part_pa; + dev_dbg(xpc_part, " remote_vars_part_pa = 0x%016lx\n", + part_sn2->remote_vars_part_pa); + + part_sn2->activate_IRQ_nasid = remote_vars->activate_IRQ_nasid; + dev_dbg(xpc_part, " activate_IRQ_nasid = 0x%x\n", + part_sn2->activate_IRQ_nasid); + + part_sn2->activate_IRQ_phys_cpuid = + remote_vars->activate_IRQ_phys_cpuid; + dev_dbg(xpc_part, " activate_IRQ_phys_cpuid = 0x%x\n", + part_sn2->activate_IRQ_phys_cpuid); + + part_sn2->remote_amos_page_pa = remote_vars->amos_page_pa; + dev_dbg(xpc_part, " remote_amos_page_pa = 0x%lx\n", + part_sn2->remote_amos_page_pa); + + part_sn2->remote_vars_version = remote_vars->version; + dev_dbg(xpc_part, " remote_vars_version = 0x%x\n", + part_sn2->remote_vars_version); +} + +/* + * Prior code has determined the nasid which generated a activate IRQ. + * Inspect that nasid to determine if its partition needs to be activated + * or deactivated. + * + * A partition is considered "awaiting activation" if our partition + * flags indicate it is not active and it has a heartbeat. A + * partition is considered "awaiting deactivation" if our partition + * flags indicate it is active but it has no heartbeat or it is not + * sending its heartbeat to us. + * + * To determine the heartbeat, the remote nasid must have a properly + * initialized reserved page. + */ +static void +xpc_identify_activate_IRQ_req_sn2(int nasid) +{ + struct xpc_rsvd_page *remote_rp; + struct xpc_vars_sn2 *remote_vars; + unsigned long remote_rp_pa; + unsigned long remote_vars_pa; + int remote_rp_version; + int reactivate = 0; + unsigned long remote_rp_ts_jiffies = 0; + short partid; + struct xpc_partition *part; + struct xpc_partition_sn2 *part_sn2; + enum xp_retval ret; + + /* pull over the reserved page structure */ + + remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer_sn2; + + ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa); + if (ret != xpSuccess) { + dev_warn(xpc_part, "unable to get reserved page from nasid %d, " + "which sent interrupt, reason=%d\n", nasid, ret); + return; + } + + remote_vars_pa = remote_rp->sn.sn2.vars_pa; + remote_rp_version = remote_rp->version; + remote_rp_ts_jiffies = remote_rp->ts_jiffies; + + partid = remote_rp->SAL_partid; + part = &xpc_partitions[partid]; + part_sn2 = &part->sn.sn2; + + /* pull over the cross partition variables */ + + remote_vars = (struct xpc_vars_sn2 *)xpc_remote_copy_buffer_sn2; + + ret = xpc_get_remote_vars_sn2(remote_vars_pa, remote_vars); + if (ret != xpSuccess) { + dev_warn(xpc_part, "unable to get XPC variables from nasid %d, " + "which sent interrupt, reason=%d\n", nasid, ret); + + XPC_DEACTIVATE_PARTITION(part, ret); + return; + } + + part->activate_IRQ_rcvd++; + + dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = " + "%lld:0x%lx\n", (int)nasid, (int)partid, + part->activate_IRQ_rcvd, + remote_vars->heartbeat, remote_vars->heartbeating_to_mask[0]); + + if (xpc_partition_disengaged(part) && + part->act_state == XPC_P_AS_INACTIVE) { + + xpc_update_partition_info_sn2(part, remote_rp_version, + &remote_rp_ts_jiffies, + remote_rp_pa, remote_vars_pa, + remote_vars); + + if (xpc_partition_deactivation_requested_sn2(partid)) { + /* + * Other side is waiting on us to deactivate even though + * we already have. + */ + return; + } + + xpc_activate_partition(part); + return; + } + + DBUG_ON(part->remote_rp_version == 0); + DBUG_ON(part_sn2->remote_vars_version == 0); + + if (remote_rp_ts_jiffies != part->remote_rp_ts_jiffies) { + + /* the other side rebooted */ + + DBUG_ON(xpc_partition_engaged_sn2(partid)); + DBUG_ON(xpc_partition_deactivation_requested_sn2(partid)); + + xpc_update_partition_info_sn2(part, remote_rp_version, + &remote_rp_ts_jiffies, + remote_rp_pa, remote_vars_pa, + remote_vars); + reactivate = 1; + } + + if (part->disengage_timeout > 0 && !xpc_partition_disengaged(part)) { + /* still waiting on other side to disengage from us */ + return; + } + + if (reactivate) + XPC_DEACTIVATE_PARTITION(part, xpReactivating); + else if (xpc_partition_deactivation_requested_sn2(partid)) + XPC_DEACTIVATE_PARTITION(part, xpOtherGoingDown); +} + +/* + * Loop through the activation amo variables and process any bits + * which are set. Each bit indicates a nasid sending a partition + * activation or deactivation request. + * + * Return #of IRQs detected. + */ +int +xpc_identify_activate_IRQ_sender_sn2(void) +{ + int l; + int b; + unsigned long nasid_mask_long; + u64 nasid; /* remote nasid */ + int n_IRQs_detected = 0; + struct amo *act_amos; + + act_amos = xpc_vars_sn2->amos_page + XPC_ACTIVATE_IRQ_AMOS_SN2; + + /* scan through activate amo variables looking for non-zero entries */ + for (l = 0; l < xpc_nasid_mask_nlongs; l++) { + + if (xpc_exiting) + break; + + nasid_mask_long = xpc_receive_IRQ_amo_sn2(&act_amos[l]); + + b = find_first_bit(&nasid_mask_long, BITS_PER_LONG); + if (b >= BITS_PER_LONG) { + /* no IRQs from nasids in this amo variable */ + continue; + } + + dev_dbg(xpc_part, "amo[%d] gave back 0x%lx\n", l, + nasid_mask_long); + + /* + * If this nasid has been added to the machine since + * our partition was reset, this will retain the + * remote nasid in our reserved pages machine mask. + * This is used in the event of module reload. + */ + xpc_mach_nasids[l] |= nasid_mask_long; + + /* locate the nasid(s) which sent interrupts */ + + do { + n_IRQs_detected++; + nasid = (l * BITS_PER_LONG + b) * 2; + dev_dbg(xpc_part, "interrupt from nasid %lld\n", nasid); + xpc_identify_activate_IRQ_req_sn2(nasid); + + b = find_next_bit(&nasid_mask_long, BITS_PER_LONG, + b + 1); + } while (b < BITS_PER_LONG); + } + return n_IRQs_detected; +} + +static void +xpc_process_activate_IRQ_rcvd_sn2(void) +{ + unsigned long irq_flags; + int n_IRQs_expected; + int n_IRQs_detected; + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + n_IRQs_expected = xpc_activate_IRQ_rcvd; + xpc_activate_IRQ_rcvd = 0; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + n_IRQs_detected = xpc_identify_activate_IRQ_sender_sn2(); + if (n_IRQs_detected < n_IRQs_expected) { + /* retry once to help avoid missing amo */ + (void)xpc_identify_activate_IRQ_sender_sn2(); + } +} + +/* + * Setup the channel structures that are sn2 specific. + */ +static enum xp_retval +xpc_setup_ch_structures_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + struct xpc_channel_sn2 *ch_sn2; + enum xp_retval retval; + int ret; + int cpuid; + int ch_number; + struct timer_list *timer; + short partid = XPC_PARTID(part); + + /* allocate all the required GET/PUT values */ + + part_sn2->local_GPs = + xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, + &part_sn2->local_GPs_base); + if (part_sn2->local_GPs == NULL) { + dev_err(xpc_chan, "can't get memory for local get/put " + "values\n"); + return xpNoMemory; + } + + part_sn2->remote_GPs = + xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, + &part_sn2->remote_GPs_base); + if (part_sn2->remote_GPs == NULL) { + dev_err(xpc_chan, "can't get memory for remote get/put " + "values\n"); + retval = xpNoMemory; + goto out_1; + } + + part_sn2->remote_GPs_pa = 0; + + /* allocate all the required open and close args */ + + part_sn2->local_openclose_args = + xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, + GFP_KERNEL, &part_sn2-> + local_openclose_args_base); + if (part_sn2->local_openclose_args == NULL) { + dev_err(xpc_chan, "can't get memory for local connect args\n"); + retval = xpNoMemory; + goto out_2; + } + + part_sn2->remote_openclose_args_pa = 0; + + part_sn2->local_chctl_amo_va = xpc_init_IRQ_amo_sn2(partid); + + part_sn2->notify_IRQ_nasid = 0; + part_sn2->notify_IRQ_phys_cpuid = 0; + part_sn2->remote_chctl_amo_va = NULL; + + sprintf(part_sn2->notify_IRQ_owner, "xpc%02d", partid); + ret = request_irq(SGI_XPC_NOTIFY, xpc_handle_notify_IRQ_sn2, + IRQF_SHARED, part_sn2->notify_IRQ_owner, + (void *)(u64)partid); + if (ret != 0) { + dev_err(xpc_chan, "can't register NOTIFY IRQ handler, " + "errno=%d\n", -ret); + retval = xpLackOfResources; + goto out_3; + } + + /* Setup a timer to check for dropped notify IRQs */ + timer = &part_sn2->dropped_notify_IRQ_timer; + timer_setup(timer, xpc_check_for_dropped_notify_IRQ_sn2, 0); + timer->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL; + add_timer(timer); + + for (ch_number = 0; ch_number < part->nchannels; ch_number++) { + ch_sn2 = &part->channels[ch_number].sn.sn2; + + ch_sn2->local_GP = &part_sn2->local_GPs[ch_number]; + ch_sn2->local_openclose_args = + &part_sn2->local_openclose_args[ch_number]; + + mutex_init(&ch_sn2->msg_to_pull_mutex); + } + + /* + * Setup the per partition specific variables required by the + * remote partition to establish channel connections with us. + * + * The setting of the magic # indicates that these per partition + * specific variables are ready to be used. + */ + xpc_vars_part_sn2[partid].GPs_pa = xp_pa(part_sn2->local_GPs); + xpc_vars_part_sn2[partid].openclose_args_pa = + xp_pa(part_sn2->local_openclose_args); + xpc_vars_part_sn2[partid].chctl_amo_pa = + xp_pa(part_sn2->local_chctl_amo_va); + cpuid = raw_smp_processor_id(); /* any CPU in this partition will do */ + xpc_vars_part_sn2[partid].notify_IRQ_nasid = cpuid_to_nasid(cpuid); + xpc_vars_part_sn2[partid].notify_IRQ_phys_cpuid = + cpu_physical_id(cpuid); + xpc_vars_part_sn2[partid].nchannels = part->nchannels; + xpc_vars_part_sn2[partid].magic = XPC_VP_MAGIC1_SN2; + + return xpSuccess; + + /* setup of ch structures failed */ +out_3: + kfree(part_sn2->local_openclose_args_base); + part_sn2->local_openclose_args = NULL; +out_2: + kfree(part_sn2->remote_GPs_base); + part_sn2->remote_GPs = NULL; +out_1: + kfree(part_sn2->local_GPs_base); + part_sn2->local_GPs = NULL; + return retval; +} + +/* + * Teardown the channel structures that are sn2 specific. + */ +static void +xpc_teardown_ch_structures_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + short partid = XPC_PARTID(part); + + /* + * Indicate that the variables specific to the remote partition are no + * longer available for its use. + */ + xpc_vars_part_sn2[partid].magic = 0; + + /* in case we've still got outstanding timers registered... */ + del_timer_sync(&part_sn2->dropped_notify_IRQ_timer); + free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid); + + kfree(part_sn2->local_openclose_args_base); + part_sn2->local_openclose_args = NULL; + kfree(part_sn2->remote_GPs_base); + part_sn2->remote_GPs = NULL; + kfree(part_sn2->local_GPs_base); + part_sn2->local_GPs = NULL; + part_sn2->local_chctl_amo_va = NULL; +} + +/* + * Create a wrapper that hides the underlying mechanism for pulling a cacheline + * (or multiple cachelines) from a remote partition. + * + * src_pa must be a cacheline aligned physical address on the remote partition. + * dst must be a cacheline aligned virtual address on this partition. + * cnt must be cacheline sized + */ +/* ??? Replace this function by call to xp_remote_memcpy() or bte_copy()? */ +static enum xp_retval +xpc_pull_remote_cachelines_sn2(struct xpc_partition *part, void *dst, + const unsigned long src_pa, size_t cnt) +{ + enum xp_retval ret; + + DBUG_ON(src_pa != L1_CACHE_ALIGN(src_pa)); + DBUG_ON((unsigned long)dst != L1_CACHE_ALIGN((unsigned long)dst)); + DBUG_ON(cnt != L1_CACHE_ALIGN(cnt)); + + if (part->act_state == XPC_P_AS_DEACTIVATING) + return part->reason; + + ret = xp_remote_memcpy(xp_pa(dst), src_pa, cnt); + if (ret != xpSuccess) { + dev_dbg(xpc_chan, "xp_remote_memcpy() from partition %d failed," + " ret=%d\n", XPC_PARTID(part), ret); + } + return ret; +} + +/* + * Pull the remote per partition specific variables from the specified + * partition. + */ +static enum xp_retval +xpc_pull_remote_vars_part_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + u8 buffer[L1_CACHE_BYTES * 2]; + struct xpc_vars_part_sn2 *pulled_entry_cacheline = + (struct xpc_vars_part_sn2 *)L1_CACHE_ALIGN((u64)buffer); + struct xpc_vars_part_sn2 *pulled_entry; + unsigned long remote_entry_cacheline_pa; + unsigned long remote_entry_pa; + short partid = XPC_PARTID(part); + enum xp_retval ret; + + /* pull the cacheline that contains the variables we're interested in */ + + DBUG_ON(part_sn2->remote_vars_part_pa != + L1_CACHE_ALIGN(part_sn2->remote_vars_part_pa)); + DBUG_ON(sizeof(struct xpc_vars_part_sn2) != L1_CACHE_BYTES / 2); + + remote_entry_pa = part_sn2->remote_vars_part_pa + + sn_partition_id * sizeof(struct xpc_vars_part_sn2); + + remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1)); + + pulled_entry = (struct xpc_vars_part_sn2 *)((u64)pulled_entry_cacheline + + (remote_entry_pa & + (L1_CACHE_BYTES - 1))); + + ret = xpc_pull_remote_cachelines_sn2(part, pulled_entry_cacheline, + remote_entry_cacheline_pa, + L1_CACHE_BYTES); + if (ret != xpSuccess) { + dev_dbg(xpc_chan, "failed to pull XPC vars_part from " + "partition %d, ret=%d\n", partid, ret); + return ret; + } + + /* see if they've been set up yet */ + + if (pulled_entry->magic != XPC_VP_MAGIC1_SN2 && + pulled_entry->magic != XPC_VP_MAGIC2_SN2) { + + if (pulled_entry->magic != 0) { + dev_dbg(xpc_chan, "partition %d's XPC vars_part for " + "partition %d has bad magic value (=0x%llx)\n", + partid, sn_partition_id, pulled_entry->magic); + return xpBadMagic; + } + + /* they've not been initialized yet */ + return xpRetry; + } + + if (xpc_vars_part_sn2[partid].magic == XPC_VP_MAGIC1_SN2) { + + /* validate the variables */ + + if (pulled_entry->GPs_pa == 0 || + pulled_entry->openclose_args_pa == 0 || + pulled_entry->chctl_amo_pa == 0) { + + dev_err(xpc_chan, "partition %d's XPC vars_part for " + "partition %d are not valid\n", partid, + sn_partition_id); + return xpInvalidAddress; + } + + /* the variables we imported look to be valid */ + + part_sn2->remote_GPs_pa = pulled_entry->GPs_pa; + part_sn2->remote_openclose_args_pa = + pulled_entry->openclose_args_pa; + part_sn2->remote_chctl_amo_va = + (struct amo *)__va(pulled_entry->chctl_amo_pa); + part_sn2->notify_IRQ_nasid = pulled_entry->notify_IRQ_nasid; + part_sn2->notify_IRQ_phys_cpuid = + pulled_entry->notify_IRQ_phys_cpuid; + + if (part->nchannels > pulled_entry->nchannels) + part->nchannels = pulled_entry->nchannels; + + /* let the other side know that we've pulled their variables */ + + xpc_vars_part_sn2[partid].magic = XPC_VP_MAGIC2_SN2; + } + + if (pulled_entry->magic == XPC_VP_MAGIC1_SN2) + return xpRetry; + + return xpSuccess; +} + +/* + * Establish first contact with the remote partititon. This involves pulling + * the XPC per partition variables from the remote partition and waiting for + * the remote partition to pull ours. + */ +static enum xp_retval +xpc_make_first_contact_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + enum xp_retval ret; + + /* + * Register the remote partition's amos with SAL so it can handle + * and cleanup errors within that address range should the remote + * partition go down. We don't unregister this range because it is + * difficult to tell when outstanding writes to the remote partition + * are finished and thus when it is safe to unregister. This should + * not result in wasted space in the SAL xp_addr_region table because + * we should get the same page for remote_amos_page_pa after module + * reloads and system reboots. + */ + if (sn_register_xp_addr_region(part_sn2->remote_amos_page_pa, + PAGE_SIZE, 1) < 0) { + dev_warn(xpc_part, "xpc_activating(%d) failed to register " + "xp_addr region\n", XPC_PARTID(part)); + + ret = xpPhysAddrRegFailed; + XPC_DEACTIVATE_PARTITION(part, ret); + return ret; + } + + /* + * Send activate IRQ to get other side to activate if they've not + * already begun to do so. + */ + xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa, + cnodeid_to_nasid(0), + part_sn2->activate_IRQ_nasid, + part_sn2->activate_IRQ_phys_cpuid); + + while ((ret = xpc_pull_remote_vars_part_sn2(part)) != xpSuccess) { + if (ret != xpRetry) { + XPC_DEACTIVATE_PARTITION(part, ret); + return ret; + } + + dev_dbg(xpc_part, "waiting to make first contact with " + "partition %d\n", XPC_PARTID(part)); + + /* wait a 1/4 of a second or so */ + (void)msleep_interruptible(250); + + if (part->act_state == XPC_P_AS_DEACTIVATING) + return part->reason; + } + + return xpSuccess; +} + +/* + * Get the chctl flags and pull the openclose args and/or remote GPs as needed. + */ +static u64 +xpc_get_chctl_all_flags_sn2(struct xpc_partition *part) +{ + struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2; + unsigned long irq_flags; + union xpc_channel_ctl_flags chctl; + enum xp_retval ret; + + /* + * See if there are any chctl flags to be handled. + */ + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + chctl = part->chctl; + if (chctl.all_flags != 0) + part->chctl.all_flags = 0; + + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + if (xpc_any_openclose_chctl_flags_set(&chctl)) { + ret = xpc_pull_remote_cachelines_sn2(part, part-> + remote_openclose_args, + part_sn2-> + remote_openclose_args_pa, + XPC_OPENCLOSE_ARGS_SIZE); + if (ret != xpSuccess) { + XPC_DEACTIVATE_PARTITION(part, ret); + + dev_dbg(xpc_chan, "failed to pull openclose args from " + "partition %d, ret=%d\n", XPC_PARTID(part), + ret); + + /* don't bother processing chctl flags anymore */ + chctl.all_flags = 0; + } + } + + if (xpc_any_msg_chctl_flags_set(&chctl)) { + ret = xpc_pull_remote_cachelines_sn2(part, part_sn2->remote_GPs, + part_sn2->remote_GPs_pa, + XPC_GP_SIZE); + if (ret != xpSuccess) { + XPC_DEACTIVATE_PARTITION(part, ret); + + dev_dbg(xpc_chan, "failed to pull GPs from partition " + "%d, ret=%d\n", XPC_PARTID(part), ret); + + /* don't bother processing chctl flags anymore */ + chctl.all_flags = 0; + } + } + + return chctl.all_flags; +} + +/* + * Allocate the local message queue and the notify queue. + */ +static enum xp_retval +xpc_allocate_local_msgqueue_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + unsigned long irq_flags; + int nentries; + size_t nbytes; + + for (nentries = ch->local_nentries; nentries > 0; nentries--) { + + nbytes = nentries * ch->entry_size; + ch_sn2->local_msgqueue = + xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, + &ch_sn2->local_msgqueue_base); + if (ch_sn2->local_msgqueue == NULL) + continue; + + nbytes = nentries * sizeof(struct xpc_notify_sn2); + ch_sn2->notify_queue = kzalloc(nbytes, GFP_KERNEL); + if (ch_sn2->notify_queue == NULL) { + kfree(ch_sn2->local_msgqueue_base); + ch_sn2->local_msgqueue = NULL; + continue; + } + + spin_lock_irqsave(&ch->lock, irq_flags); + if (nentries < ch->local_nentries) { + dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, " + "partid=%d, channel=%d\n", nentries, + ch->local_nentries, ch->partid, ch->number); + + ch->local_nentries = nentries; + } + spin_unlock_irqrestore(&ch->lock, irq_flags); + return xpSuccess; + } + + dev_dbg(xpc_chan, "can't get memory for local message queue and notify " + "queue, partid=%d, channel=%d\n", ch->partid, ch->number); + return xpNoMemory; +} + +/* + * Allocate the cached remote message queue. + */ +static enum xp_retval +xpc_allocate_remote_msgqueue_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + unsigned long irq_flags; + int nentries; + size_t nbytes; + + DBUG_ON(ch->remote_nentries <= 0); + + for (nentries = ch->remote_nentries; nentries > 0; nentries--) { + + nbytes = nentries * ch->entry_size; + ch_sn2->remote_msgqueue = + xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch_sn2-> + remote_msgqueue_base); + if (ch_sn2->remote_msgqueue == NULL) + continue; + + spin_lock_irqsave(&ch->lock, irq_flags); + if (nentries < ch->remote_nentries) { + dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, " + "partid=%d, channel=%d\n", nentries, + ch->remote_nentries, ch->partid, ch->number); + + ch->remote_nentries = nentries; + } + spin_unlock_irqrestore(&ch->lock, irq_flags); + return xpSuccess; + } + + dev_dbg(xpc_chan, "can't get memory for cached remote message queue, " + "partid=%d, channel=%d\n", ch->partid, ch->number); + return xpNoMemory; +} + +/* + * Allocate message queues and other stuff associated with a channel. + * + * Note: Assumes all of the channel sizes are filled in. + */ +static enum xp_retval +xpc_setup_msg_structures_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + enum xp_retval ret; + + DBUG_ON(ch->flags & XPC_C_SETUP); + + ret = xpc_allocate_local_msgqueue_sn2(ch); + if (ret == xpSuccess) { + + ret = xpc_allocate_remote_msgqueue_sn2(ch); + if (ret != xpSuccess) { + kfree(ch_sn2->local_msgqueue_base); + ch_sn2->local_msgqueue = NULL; + kfree(ch_sn2->notify_queue); + ch_sn2->notify_queue = NULL; + } + } + return ret; +} + +/* + * Free up message queues and other stuff that were allocated for the specified + * channel. + */ +static void +xpc_teardown_msg_structures_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + + DBUG_ON(!spin_is_locked(&ch->lock)); + + ch_sn2->remote_msgqueue_pa = 0; + + ch_sn2->local_GP->get = 0; + ch_sn2->local_GP->put = 0; + ch_sn2->remote_GP.get = 0; + ch_sn2->remote_GP.put = 0; + ch_sn2->w_local_GP.get = 0; + ch_sn2->w_local_GP.put = 0; + ch_sn2->w_remote_GP.get = 0; + ch_sn2->w_remote_GP.put = 0; + ch_sn2->next_msg_to_pull = 0; + + if (ch->flags & XPC_C_SETUP) { + dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n", + ch->flags, ch->partid, ch->number); + + kfree(ch_sn2->local_msgqueue_base); + ch_sn2->local_msgqueue = NULL; + kfree(ch_sn2->remote_msgqueue_base); + ch_sn2->remote_msgqueue = NULL; + kfree(ch_sn2->notify_queue); + ch_sn2->notify_queue = NULL; + } +} + +/* + * Notify those who wanted to be notified upon delivery of their message. + */ +static void +xpc_notify_senders_sn2(struct xpc_channel *ch, enum xp_retval reason, s64 put) +{ + struct xpc_notify_sn2 *notify; + u8 notify_type; + s64 get = ch->sn.sn2.w_remote_GP.get - 1; + + while (++get < put && atomic_read(&ch->n_to_notify) > 0) { + + notify = &ch->sn.sn2.notify_queue[get % ch->local_nentries]; + + /* + * See if the notify entry indicates it was associated with + * a message who's sender wants to be notified. It is possible + * that it is, but someone else is doing or has done the + * notification. + */ + notify_type = notify->type; + if (notify_type == 0 || + cmpxchg(¬ify->type, notify_type, 0) != notify_type) { + continue; + } + + DBUG_ON(notify_type != XPC_N_CALL); + + atomic_dec(&ch->n_to_notify); + + if (notify->func != NULL) { + dev_dbg(xpc_chan, "notify->func() called, notify=0x%p " + "msg_number=%lld partid=%d channel=%d\n", + (void *)notify, get, ch->partid, ch->number); + + notify->func(reason, ch->partid, ch->number, + notify->key); + + dev_dbg(xpc_chan, "notify->func() returned, notify=0x%p" + " msg_number=%lld partid=%d channel=%d\n", + (void *)notify, get, ch->partid, ch->number); + } + } +} + +static void +xpc_notify_senders_of_disconnect_sn2(struct xpc_channel *ch) +{ + xpc_notify_senders_sn2(ch, ch->reason, ch->sn.sn2.w_local_GP.put); +} + +/* + * Clear some of the msg flags in the local message queue. + */ +static inline void +xpc_clear_local_msgqueue_flags_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg; + s64 get; + + get = ch_sn2->w_remote_GP.get; + do { + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->local_msgqueue + + (get % ch->local_nentries) * + ch->entry_size); + DBUG_ON(!(msg->flags & XPC_M_SN2_READY)); + msg->flags = 0; + } while (++get < ch_sn2->remote_GP.get); +} + +/* + * Clear some of the msg flags in the remote message queue. + */ +static inline void +xpc_clear_remote_msgqueue_flags_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg; + s64 put, remote_nentries = ch->remote_nentries; + + /* flags are zeroed when the buffer is allocated */ + if (ch_sn2->remote_GP.put < remote_nentries) + return; + + put = max(ch_sn2->w_remote_GP.put, remote_nentries); + do { + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue + + (put % remote_nentries) * + ch->entry_size); + DBUG_ON(!(msg->flags & XPC_M_SN2_READY)); + DBUG_ON(!(msg->flags & XPC_M_SN2_DONE)); + DBUG_ON(msg->number != put - remote_nentries); + msg->flags = 0; + } while (++put < ch_sn2->remote_GP.put); +} + +static int +xpc_n_of_deliverable_payloads_sn2(struct xpc_channel *ch) +{ + return ch->sn.sn2.w_remote_GP.put - ch->sn.sn2.w_local_GP.get; +} + +static void +xpc_process_msg_chctl_flags_sn2(struct xpc_partition *part, int ch_number) +{ + struct xpc_channel *ch = &part->channels[ch_number]; + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + int npayloads_sent; + + ch_sn2->remote_GP = part->sn.sn2.remote_GPs[ch_number]; + + /* See what, if anything, has changed for each connected channel */ + + xpc_msgqueue_ref(ch); + + if (ch_sn2->w_remote_GP.get == ch_sn2->remote_GP.get && + ch_sn2->w_remote_GP.put == ch_sn2->remote_GP.put) { + /* nothing changed since GPs were last pulled */ + xpc_msgqueue_deref(ch); + return; + } + + if (!(ch->flags & XPC_C_CONNECTED)) { + xpc_msgqueue_deref(ch); + return; + } + + /* + * First check to see if messages recently sent by us have been + * received by the other side. (The remote GET value will have + * changed since we last looked at it.) + */ + + if (ch_sn2->w_remote_GP.get != ch_sn2->remote_GP.get) { + + /* + * We need to notify any senders that want to be notified + * that their sent messages have been received by their + * intended recipients. We need to do this before updating + * w_remote_GP.get so that we don't allocate the same message + * queue entries prematurely (see xpc_allocate_msg()). + */ + if (atomic_read(&ch->n_to_notify) > 0) { + /* + * Notify senders that messages sent have been + * received and delivered by the other side. + */ + xpc_notify_senders_sn2(ch, xpMsgDelivered, + ch_sn2->remote_GP.get); + } + + /* + * Clear msg->flags in previously sent messages, so that + * they're ready for xpc_allocate_msg(). + */ + xpc_clear_local_msgqueue_flags_sn2(ch); + + ch_sn2->w_remote_GP.get = ch_sn2->remote_GP.get; + + dev_dbg(xpc_chan, "w_remote_GP.get changed to %lld, partid=%d, " + "channel=%d\n", ch_sn2->w_remote_GP.get, ch->partid, + ch->number); + + /* + * If anyone was waiting for message queue entries to become + * available, wake them up. + */ + if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) + wake_up(&ch->msg_allocate_wq); + } + + /* + * Now check for newly sent messages by the other side. (The remote + * PUT value will have changed since we last looked at it.) + */ + + if (ch_sn2->w_remote_GP.put != ch_sn2->remote_GP.put) { + /* + * Clear msg->flags in previously received messages, so that + * they're ready for xpc_get_deliverable_payload_sn2(). + */ + xpc_clear_remote_msgqueue_flags_sn2(ch); + + smp_wmb(); /* ensure flags have been cleared before bte_copy */ + ch_sn2->w_remote_GP.put = ch_sn2->remote_GP.put; + + dev_dbg(xpc_chan, "w_remote_GP.put changed to %lld, partid=%d, " + "channel=%d\n", ch_sn2->w_remote_GP.put, ch->partid, + ch->number); + + npayloads_sent = xpc_n_of_deliverable_payloads_sn2(ch); + if (npayloads_sent > 0) { + dev_dbg(xpc_chan, "msgs waiting to be copied and " + "delivered=%d, partid=%d, channel=%d\n", + npayloads_sent, ch->partid, ch->number); + + if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) + xpc_activate_kthreads(ch, npayloads_sent); + } + } + + xpc_msgqueue_deref(ch); +} + +static struct xpc_msg_sn2 * +xpc_pull_remote_msg_sn2(struct xpc_channel *ch, s64 get) +{ + struct xpc_partition *part = &xpc_partitions[ch->partid]; + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + unsigned long remote_msg_pa; + struct xpc_msg_sn2 *msg; + u32 msg_index; + u32 nmsgs; + u64 msg_offset; + enum xp_retval ret; + + if (mutex_lock_interruptible(&ch_sn2->msg_to_pull_mutex) != 0) { + /* we were interrupted by a signal */ + return NULL; + } + + while (get >= ch_sn2->next_msg_to_pull) { + + /* pull as many messages as are ready and able to be pulled */ + + msg_index = ch_sn2->next_msg_to_pull % ch->remote_nentries; + + DBUG_ON(ch_sn2->next_msg_to_pull >= ch_sn2->w_remote_GP.put); + nmsgs = ch_sn2->w_remote_GP.put - ch_sn2->next_msg_to_pull; + if (msg_index + nmsgs > ch->remote_nentries) { + /* ignore the ones that wrap the msg queue for now */ + nmsgs = ch->remote_nentries - msg_index; + } + + msg_offset = msg_index * ch->entry_size; + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue + + msg_offset); + remote_msg_pa = ch_sn2->remote_msgqueue_pa + msg_offset; + + ret = xpc_pull_remote_cachelines_sn2(part, msg, remote_msg_pa, + nmsgs * ch->entry_size); + if (ret != xpSuccess) { + + dev_dbg(xpc_chan, "failed to pull %d msgs starting with" + " msg %lld from partition %d, channel=%d, " + "ret=%d\n", nmsgs, ch_sn2->next_msg_to_pull, + ch->partid, ch->number, ret); + + XPC_DEACTIVATE_PARTITION(part, ret); + + mutex_unlock(&ch_sn2->msg_to_pull_mutex); + return NULL; + } + + ch_sn2->next_msg_to_pull += nmsgs; + } + + mutex_unlock(&ch_sn2->msg_to_pull_mutex); + + /* return the message we were looking for */ + msg_offset = (get % ch->remote_nentries) * ch->entry_size; + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue + msg_offset); + + return msg; +} + +/* + * Get the next deliverable message's payload. + */ +static void * +xpc_get_deliverable_payload_sn2(struct xpc_channel *ch) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg; + void *payload = NULL; + s64 get; + + do { + if (ch->flags & XPC_C_DISCONNECTING) + break; + + get = ch_sn2->w_local_GP.get; + smp_rmb(); /* guarantee that .get loads before .put */ + if (get == ch_sn2->w_remote_GP.put) + break; + + /* There are messages waiting to be pulled and delivered. + * We need to try to secure one for ourselves. We'll do this + * by trying to increment w_local_GP.get and hope that no one + * else beats us to it. If they do, we'll we'll simply have + * to try again for the next one. + */ + + if (cmpxchg(&ch_sn2->w_local_GP.get, get, get + 1) == get) { + /* we got the entry referenced by get */ + + dev_dbg(xpc_chan, "w_local_GP.get changed to %lld, " + "partid=%d, channel=%d\n", get + 1, + ch->partid, ch->number); + + /* pull the message from the remote partition */ + + msg = xpc_pull_remote_msg_sn2(ch, get); + + if (msg != NULL) { + DBUG_ON(msg->number != get); + DBUG_ON(msg->flags & XPC_M_SN2_DONE); + DBUG_ON(!(msg->flags & XPC_M_SN2_READY)); + + payload = &msg->payload; + } + break; + } + + } while (1); + + return payload; +} + +/* + * Now we actually send the messages that are ready to be sent by advancing + * the local message queue's Put value and then send a chctl msgrequest to the + * recipient partition. + */ +static void +xpc_send_msgs_sn2(struct xpc_channel *ch, s64 initial_put) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg; + s64 put = initial_put + 1; + int send_msgrequest = 0; + + while (1) { + + while (1) { + if (put == ch_sn2->w_local_GP.put) + break; + + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2-> + local_msgqueue + (put % + ch->local_nentries) * + ch->entry_size); + + if (!(msg->flags & XPC_M_SN2_READY)) + break; + + put++; + } + + if (put == initial_put) { + /* nothing's changed */ + break; + } + + if (cmpxchg_rel(&ch_sn2->local_GP->put, initial_put, put) != + initial_put) { + /* someone else beat us to it */ + DBUG_ON(ch_sn2->local_GP->put < initial_put); + break; + } + + /* we just set the new value of local_GP->put */ + + dev_dbg(xpc_chan, "local_GP->put changed to %lld, partid=%d, " + "channel=%d\n", put, ch->partid, ch->number); + + send_msgrequest = 1; + + /* + * We need to ensure that the message referenced by + * local_GP->put is not XPC_M_SN2_READY or that local_GP->put + * equals w_local_GP.put, so we'll go have a look. + */ + initial_put = put; + } + + if (send_msgrequest) + xpc_send_chctl_msgrequest_sn2(ch); +} + +/* + * Allocate an entry for a message from the message queue associated with the + * specified channel. + */ +static enum xp_retval +xpc_allocate_msg_sn2(struct xpc_channel *ch, u32 flags, + struct xpc_msg_sn2 **address_of_msg) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg; + enum xp_retval ret; + s64 put; + + /* + * Get the next available message entry from the local message queue. + * If none are available, we'll make sure that we grab the latest + * GP values. + */ + ret = xpTimeout; + + while (1) { + + put = ch_sn2->w_local_GP.put; + smp_rmb(); /* guarantee that .put loads before .get */ + if (put - ch_sn2->w_remote_GP.get < ch->local_nentries) { + + /* There are available message entries. We need to try + * to secure one for ourselves. We'll do this by trying + * to increment w_local_GP.put as long as someone else + * doesn't beat us to it. If they do, we'll have to + * try again. + */ + if (cmpxchg(&ch_sn2->w_local_GP.put, put, put + 1) == + put) { + /* we got the entry referenced by put */ + break; + } + continue; /* try again */ + } + + /* + * There aren't any available msg entries at this time. + * + * In waiting for a message entry to become available, + * we set a timeout in case the other side is not sending + * completion interrupts. This lets us fake a notify IRQ + * that will cause the notify IRQ handler to fetch the latest + * GP values as if an interrupt was sent by the other side. + */ + if (ret == xpTimeout) + xpc_send_chctl_local_msgrequest_sn2(ch); + + if (flags & XPC_NOWAIT) + return xpNoWait; + + ret = xpc_allocate_msg_wait(ch); + if (ret != xpInterrupted && ret != xpTimeout) + return ret; + } + + /* get the message's address and initialize it */ + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->local_msgqueue + + (put % ch->local_nentries) * + ch->entry_size); + + DBUG_ON(msg->flags != 0); + msg->number = put; + + dev_dbg(xpc_chan, "w_local_GP.put changed to %lld; msg=0x%p, " + "msg_number=%lld, partid=%d, channel=%d\n", put + 1, + (void *)msg, msg->number, ch->partid, ch->number); + + *address_of_msg = msg; + return xpSuccess; +} + +/* + * Common code that does the actual sending of the message by advancing the + * local message queue's Put value and sends a chctl msgrequest to the + * partition the message is being sent to. + */ +static enum xp_retval +xpc_send_payload_sn2(struct xpc_channel *ch, u32 flags, void *payload, + u16 payload_size, u8 notify_type, xpc_notify_func func, + void *key) +{ + enum xp_retval ret = xpSuccess; + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg = msg; + struct xpc_notify_sn2 *notify = notify; + s64 msg_number; + s64 put; + + DBUG_ON(notify_type == XPC_N_CALL && func == NULL); + + if (XPC_MSG_SIZE(payload_size) > ch->entry_size) + return xpPayloadTooBig; + + xpc_msgqueue_ref(ch); + + if (ch->flags & XPC_C_DISCONNECTING) { + ret = ch->reason; + goto out_1; + } + if (!(ch->flags & XPC_C_CONNECTED)) { + ret = xpNotConnected; + goto out_1; + } + + ret = xpc_allocate_msg_sn2(ch, flags, &msg); + if (ret != xpSuccess) + goto out_1; + + msg_number = msg->number; + + if (notify_type != 0) { + /* + * Tell the remote side to send an ACK interrupt when the + * message has been delivered. + */ + msg->flags |= XPC_M_SN2_INTERRUPT; + + atomic_inc(&ch->n_to_notify); + + notify = &ch_sn2->notify_queue[msg_number % ch->local_nentries]; + notify->func = func; + notify->key = key; + notify->type = notify_type; + + /* ??? Is a mb() needed here? */ + + if (ch->flags & XPC_C_DISCONNECTING) { + /* + * An error occurred between our last error check and + * this one. We will try to clear the type field from + * the notify entry. If we succeed then + * xpc_disconnect_channel() didn't already process + * the notify entry. + */ + if (cmpxchg(¬ify->type, notify_type, 0) == + notify_type) { + atomic_dec(&ch->n_to_notify); + ret = ch->reason; + } + goto out_1; + } + } + + memcpy(&msg->payload, payload, payload_size); + + msg->flags |= XPC_M_SN2_READY; + + /* + * The preceding store of msg->flags must occur before the following + * load of local_GP->put. + */ + smp_mb(); + + /* see if the message is next in line to be sent, if so send it */ + + put = ch_sn2->local_GP->put; + if (put == msg_number) + xpc_send_msgs_sn2(ch, put); + +out_1: + xpc_msgqueue_deref(ch); + return ret; +} + +/* + * Now we actually acknowledge the messages that have been delivered and ack'd + * by advancing the cached remote message queue's Get value and if requested + * send a chctl msgrequest to the message sender's partition. + * + * If a message has XPC_M_SN2_INTERRUPT set, send an interrupt to the partition + * that sent the message. + */ +static void +xpc_acknowledge_msgs_sn2(struct xpc_channel *ch, s64 initial_get, u8 msg_flags) +{ + struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2; + struct xpc_msg_sn2 *msg; + s64 get = initial_get + 1; + int send_msgrequest = 0; + + while (1) { + + while (1) { + if (get == ch_sn2->w_local_GP.get) + break; + + msg = (struct xpc_msg_sn2 *)((u64)ch_sn2-> + remote_msgqueue + (get % + ch->remote_nentries) * + ch->entry_size); + + if (!(msg->flags & XPC_M_SN2_DONE)) + break; + + msg_flags |= msg->flags; + get++; + } + + if (get == initial_get) { + /* nothing's changed */ + break; + } + + if (cmpxchg_rel(&ch_sn2->local_GP->get, initial_get, get) != + initial_get) { + /* someone else beat us to it */ + DBUG_ON(ch_sn2->local_GP->get <= initial_get); + break; + } + + /* we just set the new value of local_GP->get */ + + dev_dbg(xpc_chan, "local_GP->get changed to %lld, partid=%d, " + "channel=%d\n", get, ch->partid, ch->number); + + send_msgrequest = (msg_flags & XPC_M_SN2_INTERRUPT); + + /* + * We need to ensure that the message referenced by + * local_GP->get is not XPC_M_SN2_DONE or that local_GP->get + * equals w_local_GP.get, so we'll go have a look. + */ + initial_get = get; + } + + if (send_msgrequest) + xpc_send_chctl_msgrequest_sn2(ch); +} + +static void +xpc_received_payload_sn2(struct xpc_channel *ch, void *payload) +{ + struct xpc_msg_sn2 *msg; + s64 msg_number; + s64 get; + + msg = container_of(payload, struct xpc_msg_sn2, payload); + msg_number = msg->number; + + dev_dbg(xpc_chan, "msg=0x%p, msg_number=%lld, partid=%d, channel=%d\n", + (void *)msg, msg_number, ch->partid, ch->number); + + DBUG_ON((((u64)msg - (u64)ch->sn.sn2.remote_msgqueue) / ch->entry_size) != + msg_number % ch->remote_nentries); + DBUG_ON(!(msg->flags & XPC_M_SN2_READY)); + DBUG_ON(msg->flags & XPC_M_SN2_DONE); + + msg->flags |= XPC_M_SN2_DONE; + + /* + * The preceding store of msg->flags must occur before the following + * load of local_GP->get. + */ + smp_mb(); + + /* + * See if this message is next in line to be acknowledged as having + * been delivered. + */ + get = ch->sn.sn2.local_GP->get; + if (get == msg_number) + xpc_acknowledge_msgs_sn2(ch, get, msg->flags); +} + +static struct xpc_arch_operations xpc_arch_ops_sn2 = { + .setup_partitions = xpc_setup_partitions_sn2, + .teardown_partitions = xpc_teardown_partitions_sn2, + .process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_sn2, + .get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_sn2, + .setup_rsvd_page = xpc_setup_rsvd_page_sn2, + + .allow_hb = xpc_allow_hb_sn2, + .disallow_hb = xpc_disallow_hb_sn2, + .disallow_all_hbs = xpc_disallow_all_hbs_sn2, + .increment_heartbeat = xpc_increment_heartbeat_sn2, + .offline_heartbeat = xpc_offline_heartbeat_sn2, + .online_heartbeat = xpc_online_heartbeat_sn2, + .heartbeat_init = xpc_heartbeat_init_sn2, + .heartbeat_exit = xpc_heartbeat_exit_sn2, + .get_remote_heartbeat = xpc_get_remote_heartbeat_sn2, + + .request_partition_activation = + xpc_request_partition_activation_sn2, + .request_partition_reactivation = + xpc_request_partition_reactivation_sn2, + .request_partition_deactivation = + xpc_request_partition_deactivation_sn2, + .cancel_partition_deactivation_request = + xpc_cancel_partition_deactivation_request_sn2, + + .setup_ch_structures = xpc_setup_ch_structures_sn2, + .teardown_ch_structures = xpc_teardown_ch_structures_sn2, + + .make_first_contact = xpc_make_first_contact_sn2, + + .get_chctl_all_flags = xpc_get_chctl_all_flags_sn2, + .send_chctl_closerequest = xpc_send_chctl_closerequest_sn2, + .send_chctl_closereply = xpc_send_chctl_closereply_sn2, + .send_chctl_openrequest = xpc_send_chctl_openrequest_sn2, + .send_chctl_openreply = xpc_send_chctl_openreply_sn2, + .send_chctl_opencomplete = xpc_send_chctl_opencomplete_sn2, + .process_msg_chctl_flags = xpc_process_msg_chctl_flags_sn2, + + .save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_sn2, + + .setup_msg_structures = xpc_setup_msg_structures_sn2, + .teardown_msg_structures = xpc_teardown_msg_structures_sn2, + + .indicate_partition_engaged = xpc_indicate_partition_engaged_sn2, + .indicate_partition_disengaged = xpc_indicate_partition_disengaged_sn2, + .partition_engaged = xpc_partition_engaged_sn2, + .any_partition_engaged = xpc_any_partition_engaged_sn2, + .assume_partition_disengaged = xpc_assume_partition_disengaged_sn2, + + .n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_sn2, + .send_payload = xpc_send_payload_sn2, + .get_deliverable_payload = xpc_get_deliverable_payload_sn2, + .received_payload = xpc_received_payload_sn2, + .notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_sn2, +}; + +int +xpc_init_sn2(void) +{ + int ret; + size_t buf_size; + + xpc_arch_ops = xpc_arch_ops_sn2; + + if (offsetof(struct xpc_msg_sn2, payload) > XPC_MSG_HDR_MAX_SIZE) { + dev_err(xpc_part, "header portion of struct xpc_msg_sn2 is " + "larger than %d\n", XPC_MSG_HDR_MAX_SIZE); + return -E2BIG; + } + + buf_size = max(XPC_RP_VARS_SIZE, + XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES_SN2); + xpc_remote_copy_buffer_sn2 = xpc_kmalloc_cacheline_aligned(buf_size, + GFP_KERNEL, + &xpc_remote_copy_buffer_base_sn2); + if (xpc_remote_copy_buffer_sn2 == NULL) { + dev_err(xpc_part, "can't get memory for remote copy buffer\n"); + return -ENOMEM; + } + + /* open up protections for IPI and [potentially] amo operations */ + xpc_allow_IPI_ops_sn2(); + xpc_allow_amo_ops_shub_wars_1_1_sn2(); + + /* + * This is safe to do before the xpc_hb_checker thread has started + * because the handler releases a wait queue. If an interrupt is + * received before the thread is waiting, it will not go to sleep, + * but rather immediately process the interrupt. + */ + ret = request_irq(SGI_XPC_ACTIVATE, xpc_handle_activate_IRQ_sn2, 0, + "xpc hb", NULL); + if (ret != 0) { + dev_err(xpc_part, "can't register ACTIVATE IRQ handler, " + "errno=%d\n", -ret); + xpc_disallow_IPI_ops_sn2(); + kfree(xpc_remote_copy_buffer_base_sn2); + } + return ret; +} + +void +xpc_exit_sn2(void) +{ + free_irq(SGI_XPC_ACTIVATE, NULL); + xpc_disallow_IPI_ops_sn2(); + kfree(xpc_remote_copy_buffer_base_sn2); +} diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c new file mode 100644 index 000000000..340b44d9e --- /dev/null +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -0,0 +1,1813 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + */ + +/* + * Cross Partition Communication (XPC) uv-based functions. + * + * Architecture specific implementation of common functions. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <asm/uv/uv_hub.h> +#if defined CONFIG_X86_64 +#include <asm/uv/bios.h> +#include <asm/uv/uv_irq.h> +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV +#include <asm/sn/intr.h> +#include <asm/sn/sn_sal.h> +#endif +#include "../sgi-gru/gru.h" +#include "../sgi-gru/grukservices.h" +#include "xpc.h" + +#if defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV +struct uv_IO_APIC_route_entry { + __u64 vector : 8, + delivery_mode : 3, + dest_mode : 1, + delivery_status : 1, + polarity : 1, + __reserved_1 : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15, + dest : 32; +}; +#endif + +static struct xpc_heartbeat_uv *xpc_heartbeat_uv; + +#define XPC_ACTIVATE_MSG_SIZE_UV (1 * GRU_CACHE_LINE_BYTES) +#define XPC_ACTIVATE_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \ + XPC_ACTIVATE_MSG_SIZE_UV) +#define XPC_ACTIVATE_IRQ_NAME "xpc_activate" + +#define XPC_NOTIFY_MSG_SIZE_UV (2 * GRU_CACHE_LINE_BYTES) +#define XPC_NOTIFY_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \ + XPC_NOTIFY_MSG_SIZE_UV) +#define XPC_NOTIFY_IRQ_NAME "xpc_notify" + +static int xpc_mq_node = -1; + +static struct xpc_gru_mq_uv *xpc_activate_mq_uv; +static struct xpc_gru_mq_uv *xpc_notify_mq_uv; + +static int +xpc_setup_partitions_uv(void) +{ + short partid; + struct xpc_partition_uv *part_uv; + + for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) { + part_uv = &xpc_partitions[partid].sn.uv; + + mutex_init(&part_uv->cached_activate_gru_mq_desc_mutex); + spin_lock_init(&part_uv->flags_lock); + part_uv->remote_act_state = XPC_P_AS_INACTIVE; + } + return 0; +} + +static void +xpc_teardown_partitions_uv(void) +{ + short partid; + struct xpc_partition_uv *part_uv; + unsigned long irq_flags; + + for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) { + part_uv = &xpc_partitions[partid].sn.uv; + + if (part_uv->cached_activate_gru_mq_desc != NULL) { + mutex_lock(&part_uv->cached_activate_gru_mq_desc_mutex); + spin_lock_irqsave(&part_uv->flags_lock, irq_flags); + part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV; + spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); + kfree(part_uv->cached_activate_gru_mq_desc); + part_uv->cached_activate_gru_mq_desc = NULL; + mutex_unlock(&part_uv-> + cached_activate_gru_mq_desc_mutex); + } + } +} + +static int +xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name) +{ + int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade); + +#if defined CONFIG_X86_64 + mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset, + UV_AFFINITY_CPU); + if (mq->irq < 0) + return mq->irq; + + mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset); + +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + if (strcmp(irq_name, XPC_ACTIVATE_IRQ_NAME) == 0) + mq->irq = SGI_XPC_ACTIVATE; + else if (strcmp(irq_name, XPC_NOTIFY_IRQ_NAME) == 0) + mq->irq = SGI_XPC_NOTIFY; + else + return -EINVAL; + + mq->mmr_value = (unsigned long)cpu_physical_id(cpu) << 32 | mq->irq; + uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mq->mmr_value); +#else + #error not a supported configuration +#endif + + return 0; +} + +static void +xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq) +{ +#if defined CONFIG_X86_64 + uv_teardown_irq(mq->irq); + +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + int mmr_pnode; + unsigned long mmr_value; + + mmr_pnode = uv_blade_to_pnode(mq->mmr_blade); + mmr_value = 1UL << 16; + + uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mmr_value); +#else + #error not a supported configuration +#endif +} + +static int +xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq) +{ + int ret; + +#if defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade); + + ret = sn_mq_watchlist_alloc(mmr_pnode, (void *)uv_gpa(mq->address), + mq->order, &mq->mmr_offset); + if (ret < 0) { + dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n", + ret); + return -EBUSY; + } +#elif defined CONFIG_X86_64 + ret = uv_bios_mq_watchlist_alloc(uv_gpa(mq->address), + mq->order, &mq->mmr_offset); + if (ret < 0) { + dev_err(xpc_part, "uv_bios_mq_watchlist_alloc() failed, " + "ret=%d\n", ret); + return ret; + } +#else + #error not a supported configuration +#endif + + mq->watchlist_num = ret; + return 0; +} + +static void +xpc_gru_mq_watchlist_free_uv(struct xpc_gru_mq_uv *mq) +{ + int ret; + int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade); + +#if defined CONFIG_X86_64 + ret = uv_bios_mq_watchlist_free(mmr_pnode, mq->watchlist_num); + BUG_ON(ret != BIOS_STATUS_SUCCESS); +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + ret = sn_mq_watchlist_free(mmr_pnode, mq->watchlist_num); + BUG_ON(ret != SALRET_OK); +#else + #error not a supported configuration +#endif +} + +static struct xpc_gru_mq_uv * +xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, + irq_handler_t irq_handler) +{ + enum xp_retval xp_ret; + int ret; + int nid; + int nasid; + int pg_order; + struct page *page; + struct xpc_gru_mq_uv *mq; + struct uv_IO_APIC_route_entry *mmr_value; + + mq = kmalloc(sizeof(struct xpc_gru_mq_uv), GFP_KERNEL); + if (mq == NULL) { + dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to kmalloc() " + "a xpc_gru_mq_uv structure\n"); + ret = -ENOMEM; + goto out_0; + } + + mq->gru_mq_desc = kzalloc(sizeof(struct gru_message_queue_desc), + GFP_KERNEL); + if (mq->gru_mq_desc == NULL) { + dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to kmalloc() " + "a gru_message_queue_desc structure\n"); + ret = -ENOMEM; + goto out_1; + } + + pg_order = get_order(mq_size); + mq->order = pg_order + PAGE_SHIFT; + mq_size = 1UL << mq->order; + + mq->mmr_blade = uv_cpu_to_blade_id(cpu); + + nid = cpu_to_node(cpu); + page = __alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, + pg_order); + if (page == NULL) { + dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " + "bytes of memory on nid=%d for GRU mq\n", mq_size, nid); + ret = -ENOMEM; + goto out_2; + } + mq->address = page_address(page); + + /* enable generation of irq when GRU mq operation occurs to this mq */ + ret = xpc_gru_mq_watchlist_alloc_uv(mq); + if (ret != 0) + goto out_3; + + ret = xpc_get_gru_mq_irq_uv(mq, cpu, irq_name); + if (ret != 0) + goto out_4; + + ret = request_irq(mq->irq, irq_handler, 0, irq_name, NULL); + if (ret != 0) { + dev_err(xpc_part, "request_irq(irq=%d) returned error=%d\n", + mq->irq, -ret); + goto out_5; + } + + nasid = UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpu)); + + mmr_value = (struct uv_IO_APIC_route_entry *)&mq->mmr_value; + ret = gru_create_message_queue(mq->gru_mq_desc, mq->address, mq_size, + nasid, mmr_value->vector, mmr_value->dest); + if (ret != 0) { + dev_err(xpc_part, "gru_create_message_queue() returned " + "error=%d\n", ret); + ret = -EINVAL; + goto out_6; + } + + /* allow other partitions to access this GRU mq */ + xp_ret = xp_expand_memprotect(xp_pa(mq->address), mq_size); + if (xp_ret != xpSuccess) { + ret = -EACCES; + goto out_6; + } + + return mq; + + /* something went wrong */ +out_6: + free_irq(mq->irq, NULL); +out_5: + xpc_release_gru_mq_irq_uv(mq); +out_4: + xpc_gru_mq_watchlist_free_uv(mq); +out_3: + free_pages((unsigned long)mq->address, pg_order); +out_2: + kfree(mq->gru_mq_desc); +out_1: + kfree(mq); +out_0: + return ERR_PTR(ret); +} + +static void +xpc_destroy_gru_mq_uv(struct xpc_gru_mq_uv *mq) +{ + unsigned int mq_size; + int pg_order; + int ret; + + /* disallow other partitions to access GRU mq */ + mq_size = 1UL << mq->order; + ret = xp_restrict_memprotect(xp_pa(mq->address), mq_size); + BUG_ON(ret != xpSuccess); + + /* unregister irq handler and release mq irq/vector mapping */ + free_irq(mq->irq, NULL); + xpc_release_gru_mq_irq_uv(mq); + + /* disable generation of irq when GRU mq op occurs to this mq */ + xpc_gru_mq_watchlist_free_uv(mq); + + pg_order = mq->order - PAGE_SHIFT; + free_pages((unsigned long)mq->address, pg_order); + + kfree(mq); +} + +static enum xp_retval +xpc_send_gru_msg(struct gru_message_queue_desc *gru_mq_desc, void *msg, + size_t msg_size) +{ + enum xp_retval xp_ret; + int ret; + + while (1) { + ret = gru_send_message_gpa(gru_mq_desc, msg, msg_size); + if (ret == MQE_OK) { + xp_ret = xpSuccess; + break; + } + + if (ret == MQE_QUEUE_FULL) { + dev_dbg(xpc_chan, "gru_send_message_gpa() returned " + "error=MQE_QUEUE_FULL\n"); + /* !!! handle QLimit reached; delay & try again */ + /* ??? Do we add a limit to the number of retries? */ + (void)msleep_interruptible(10); + } else if (ret == MQE_CONGESTION) { + dev_dbg(xpc_chan, "gru_send_message_gpa() returned " + "error=MQE_CONGESTION\n"); + /* !!! handle LB Overflow; simply try again */ + /* ??? Do we add a limit to the number of retries? */ + } else { + /* !!! Currently this is MQE_UNEXPECTED_CB_ERR */ + dev_err(xpc_chan, "gru_send_message_gpa() returned " + "error=%d\n", ret); + xp_ret = xpGruSendMqError; + break; + } + } + return xp_ret; +} + +static void +xpc_process_activate_IRQ_rcvd_uv(void) +{ + unsigned long irq_flags; + short partid; + struct xpc_partition *part; + u8 act_state_req; + + DBUG_ON(xpc_activate_IRQ_rcvd == 0); + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) { + part = &xpc_partitions[partid]; + + if (part->sn.uv.act_state_req == 0) + continue; + + xpc_activate_IRQ_rcvd--; + BUG_ON(xpc_activate_IRQ_rcvd < 0); + + act_state_req = part->sn.uv.act_state_req; + part->sn.uv.act_state_req = 0; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + if (act_state_req == XPC_P_ASR_ACTIVATE_UV) { + if (part->act_state == XPC_P_AS_INACTIVE) + xpc_activate_partition(part); + else if (part->act_state == XPC_P_AS_DEACTIVATING) + XPC_DEACTIVATE_PARTITION(part, xpReactivating); + + } else if (act_state_req == XPC_P_ASR_REACTIVATE_UV) { + if (part->act_state == XPC_P_AS_INACTIVE) + xpc_activate_partition(part); + else + XPC_DEACTIVATE_PARTITION(part, xpReactivating); + + } else if (act_state_req == XPC_P_ASR_DEACTIVATE_UV) { + XPC_DEACTIVATE_PARTITION(part, part->sn.uv.reason); + + } else { + BUG(); + } + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (xpc_activate_IRQ_rcvd == 0) + break; + } + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + +} + +static void +xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, + struct xpc_activate_mq_msghdr_uv *msg_hdr, + int part_setup, + int *wakeup_hb_checker) +{ + unsigned long irq_flags; + struct xpc_partition_uv *part_uv = &part->sn.uv; + struct xpc_openclose_args *args; + + part_uv->remote_act_state = msg_hdr->act_state; + + switch (msg_hdr->type) { + case XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV: + /* syncing of remote_act_state was just done above */ + break; + + case XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV: { + struct xpc_activate_mq_msg_activate_req_uv *msg; + + /* + * ??? Do we deal here with ts_jiffies being different + * ??? if act_state != XPC_P_AS_INACTIVE instead of + * ??? below? + */ + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_activate_req_uv, hdr); + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (part_uv->act_state_req == 0) + xpc_activate_IRQ_rcvd++; + part_uv->act_state_req = XPC_P_ASR_ACTIVATE_UV; + part->remote_rp_pa = msg->rp_gpa; /* !!! _pa is _gpa */ + part->remote_rp_ts_jiffies = msg_hdr->rp_ts_jiffies; + part_uv->heartbeat_gpa = msg->heartbeat_gpa; + + if (msg->activate_gru_mq_desc_gpa != + part_uv->activate_gru_mq_desc_gpa) { + spin_lock(&part_uv->flags_lock); + part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV; + spin_unlock(&part_uv->flags_lock); + part_uv->activate_gru_mq_desc_gpa = + msg->activate_gru_mq_desc_gpa; + } + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + (*wakeup_hb_checker)++; + break; + } + case XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV: { + struct xpc_activate_mq_msg_deactivate_req_uv *msg; + + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_deactivate_req_uv, hdr); + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (part_uv->act_state_req == 0) + xpc_activate_IRQ_rcvd++; + part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV; + part_uv->reason = msg->reason; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + (*wakeup_hb_checker)++; + return; + } + case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV: { + struct xpc_activate_mq_msg_chctl_closerequest_uv *msg; + + if (!part_setup) + break; + + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_chctl_closerequest_uv, + hdr); + args = &part->remote_openclose_args[msg->ch_number]; + args->reason = msg->reason; + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREQUEST; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + xpc_wakeup_channel_mgr(part); + break; + } + case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV: { + struct xpc_activate_mq_msg_chctl_closereply_uv *msg; + + if (!part_setup) + break; + + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_chctl_closereply_uv, + hdr); + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREPLY; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + xpc_wakeup_channel_mgr(part); + break; + } + case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV: { + struct xpc_activate_mq_msg_chctl_openrequest_uv *msg; + + if (!part_setup) + break; + + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_chctl_openrequest_uv, + hdr); + args = &part->remote_openclose_args[msg->ch_number]; + args->entry_size = msg->entry_size; + args->local_nentries = msg->local_nentries; + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREQUEST; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + xpc_wakeup_channel_mgr(part); + break; + } + case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV: { + struct xpc_activate_mq_msg_chctl_openreply_uv *msg; + + if (!part_setup) + break; + + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_chctl_openreply_uv, hdr); + args = &part->remote_openclose_args[msg->ch_number]; + args->remote_nentries = msg->remote_nentries; + args->local_nentries = msg->local_nentries; + args->local_msgqueue_pa = msg->notify_gru_mq_desc_gpa; + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREPLY; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + xpc_wakeup_channel_mgr(part); + break; + } + case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV: { + struct xpc_activate_mq_msg_chctl_opencomplete_uv *msg; + + if (!part_setup) + break; + + msg = container_of(msg_hdr, struct + xpc_activate_mq_msg_chctl_opencomplete_uv, hdr); + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENCOMPLETE; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + xpc_wakeup_channel_mgr(part); + } + case XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV: + spin_lock_irqsave(&part_uv->flags_lock, irq_flags); + part_uv->flags |= XPC_P_ENGAGED_UV; + spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); + break; + + case XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV: + spin_lock_irqsave(&part_uv->flags_lock, irq_flags); + part_uv->flags &= ~XPC_P_ENGAGED_UV; + spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); + break; + + default: + dev_err(xpc_part, "received unknown activate_mq msg type=%d " + "from partition=%d\n", msg_hdr->type, XPC_PARTID(part)); + + /* get hb checker to deactivate from the remote partition */ + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (part_uv->act_state_req == 0) + xpc_activate_IRQ_rcvd++; + part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV; + part_uv->reason = xpBadMsgType; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + (*wakeup_hb_checker)++; + return; + } + + if (msg_hdr->rp_ts_jiffies != part->remote_rp_ts_jiffies && + part->remote_rp_ts_jiffies != 0) { + /* + * ??? Does what we do here need to be sensitive to + * ??? act_state or remote_act_state? + */ + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (part_uv->act_state_req == 0) + xpc_activate_IRQ_rcvd++; + part_uv->act_state_req = XPC_P_ASR_REACTIVATE_UV; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + (*wakeup_hb_checker)++; + } +} + +static irqreturn_t +xpc_handle_activate_IRQ_uv(int irq, void *dev_id) +{ + struct xpc_activate_mq_msghdr_uv *msg_hdr; + short partid; + struct xpc_partition *part; + int wakeup_hb_checker = 0; + int part_referenced; + + while (1) { + msg_hdr = gru_get_next_message(xpc_activate_mq_uv->gru_mq_desc); + if (msg_hdr == NULL) + break; + + partid = msg_hdr->partid; + if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) { + dev_err(xpc_part, "xpc_handle_activate_IRQ_uv() " + "received invalid partid=0x%x in message\n", + partid); + } else { + part = &xpc_partitions[partid]; + + part_referenced = xpc_part_ref(part); + xpc_handle_activate_mq_msg_uv(part, msg_hdr, + part_referenced, + &wakeup_hb_checker); + if (part_referenced) + xpc_part_deref(part); + } + + gru_free_message(xpc_activate_mq_uv->gru_mq_desc, msg_hdr); + } + + if (wakeup_hb_checker) + wake_up_interruptible(&xpc_activate_IRQ_wq); + + return IRQ_HANDLED; +} + +static enum xp_retval +xpc_cache_remote_gru_mq_desc_uv(struct gru_message_queue_desc *gru_mq_desc, + unsigned long gru_mq_desc_gpa) +{ + enum xp_retval ret; + + ret = xp_remote_memcpy(uv_gpa(gru_mq_desc), gru_mq_desc_gpa, + sizeof(struct gru_message_queue_desc)); + if (ret == xpSuccess) + gru_mq_desc->mq = NULL; + + return ret; +} + +static enum xp_retval +xpc_send_activate_IRQ_uv(struct xpc_partition *part, void *msg, size_t msg_size, + int msg_type) +{ + struct xpc_activate_mq_msghdr_uv *msg_hdr = msg; + struct xpc_partition_uv *part_uv = &part->sn.uv; + struct gru_message_queue_desc *gru_mq_desc; + unsigned long irq_flags; + enum xp_retval ret; + + DBUG_ON(msg_size > XPC_ACTIVATE_MSG_SIZE_UV); + + msg_hdr->type = msg_type; + msg_hdr->partid = xp_partition_id; + msg_hdr->act_state = part->act_state; + msg_hdr->rp_ts_jiffies = xpc_rsvd_page->ts_jiffies; + + mutex_lock(&part_uv->cached_activate_gru_mq_desc_mutex); +again: + if (!(part_uv->flags & XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV)) { + gru_mq_desc = part_uv->cached_activate_gru_mq_desc; + if (gru_mq_desc == NULL) { + gru_mq_desc = kmalloc(sizeof(struct + gru_message_queue_desc), + GFP_KERNEL); + if (gru_mq_desc == NULL) { + ret = xpNoMemory; + goto done; + } + part_uv->cached_activate_gru_mq_desc = gru_mq_desc; + } + + ret = xpc_cache_remote_gru_mq_desc_uv(gru_mq_desc, + part_uv-> + activate_gru_mq_desc_gpa); + if (ret != xpSuccess) + goto done; + + spin_lock_irqsave(&part_uv->flags_lock, irq_flags); + part_uv->flags |= XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV; + spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); + } + + /* ??? Is holding a spin_lock (ch->lock) during this call a bad idea? */ + ret = xpc_send_gru_msg(part_uv->cached_activate_gru_mq_desc, msg, + msg_size); + if (ret != xpSuccess) { + smp_rmb(); /* ensure a fresh copy of part_uv->flags */ + if (!(part_uv->flags & XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV)) + goto again; + } +done: + mutex_unlock(&part_uv->cached_activate_gru_mq_desc_mutex); + return ret; +} + +static void +xpc_send_activate_IRQ_part_uv(struct xpc_partition *part, void *msg, + size_t msg_size, int msg_type) +{ + enum xp_retval ret; + + ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type); + if (unlikely(ret != xpSuccess)) + XPC_DEACTIVATE_PARTITION(part, ret); +} + +static void +xpc_send_activate_IRQ_ch_uv(struct xpc_channel *ch, unsigned long *irq_flags, + void *msg, size_t msg_size, int msg_type) +{ + struct xpc_partition *part = &xpc_partitions[ch->partid]; + enum xp_retval ret; + + ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type); + if (unlikely(ret != xpSuccess)) { + if (irq_flags != NULL) + spin_unlock_irqrestore(&ch->lock, *irq_flags); + + XPC_DEACTIVATE_PARTITION(part, ret); + + if (irq_flags != NULL) + spin_lock_irqsave(&ch->lock, *irq_flags); + } +} + +static void +xpc_send_local_activate_IRQ_uv(struct xpc_partition *part, int act_state_req) +{ + unsigned long irq_flags; + struct xpc_partition_uv *part_uv = &part->sn.uv; + + /* + * !!! Make our side think that the remote partition sent an activate + * !!! mq message our way by doing what the activate IRQ handler would + * !!! do had one really been sent. + */ + + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (part_uv->act_state_req == 0) + xpc_activate_IRQ_rcvd++; + part_uv->act_state_req = act_state_req; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + wake_up_interruptible(&xpc_activate_IRQ_wq); +} + +static enum xp_retval +xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa, + size_t *len) +{ + s64 status; + enum xp_retval ret; + +#if defined CONFIG_X86_64 + status = uv_bios_reserved_page_pa((u64)buf, cookie, (u64 *)rp_pa, + (u64 *)len); + if (status == BIOS_STATUS_SUCCESS) + ret = xpSuccess; + else if (status == BIOS_STATUS_MORE_PASSES) + ret = xpNeedMoreInfo; + else + ret = xpBiosError; + +#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV + status = sn_partition_reserved_page_pa((u64)buf, cookie, rp_pa, len); + if (status == SALRET_OK) + ret = xpSuccess; + else if (status == SALRET_MORE_PASSES) + ret = xpNeedMoreInfo; + else + ret = xpSalError; + +#else + #error not a supported configuration +#endif + + return ret; +} + +static int +xpc_setup_rsvd_page_uv(struct xpc_rsvd_page *rp) +{ + xpc_heartbeat_uv = + &xpc_partitions[sn_partition_id].sn.uv.cached_heartbeat; + rp->sn.uv.heartbeat_gpa = uv_gpa(xpc_heartbeat_uv); + rp->sn.uv.activate_gru_mq_desc_gpa = + uv_gpa(xpc_activate_mq_uv->gru_mq_desc); + return 0; +} + +static void +xpc_allow_hb_uv(short partid) +{ +} + +static void +xpc_disallow_hb_uv(short partid) +{ +} + +static void +xpc_disallow_all_hbs_uv(void) +{ +} + +static void +xpc_increment_heartbeat_uv(void) +{ + xpc_heartbeat_uv->value++; +} + +static void +xpc_offline_heartbeat_uv(void) +{ + xpc_increment_heartbeat_uv(); + xpc_heartbeat_uv->offline = 1; +} + +static void +xpc_online_heartbeat_uv(void) +{ + xpc_increment_heartbeat_uv(); + xpc_heartbeat_uv->offline = 0; +} + +static void +xpc_heartbeat_init_uv(void) +{ + xpc_heartbeat_uv->value = 1; + xpc_heartbeat_uv->offline = 0; +} + +static void +xpc_heartbeat_exit_uv(void) +{ + xpc_offline_heartbeat_uv(); +} + +static enum xp_retval +xpc_get_remote_heartbeat_uv(struct xpc_partition *part) +{ + struct xpc_partition_uv *part_uv = &part->sn.uv; + enum xp_retval ret; + + ret = xp_remote_memcpy(uv_gpa(&part_uv->cached_heartbeat), + part_uv->heartbeat_gpa, + sizeof(struct xpc_heartbeat_uv)); + if (ret != xpSuccess) + return ret; + + if (part_uv->cached_heartbeat.value == part->last_heartbeat && + !part_uv->cached_heartbeat.offline) { + + ret = xpNoHeartbeat; + } else { + part->last_heartbeat = part_uv->cached_heartbeat.value; + } + return ret; +} + +static void +xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp, + unsigned long remote_rp_gpa, int nasid) +{ + short partid = remote_rp->SAL_partid; + struct xpc_partition *part = &xpc_partitions[partid]; + struct xpc_activate_mq_msg_activate_req_uv msg; + + part->remote_rp_pa = remote_rp_gpa; /* !!! _pa here is really _gpa */ + part->remote_rp_ts_jiffies = remote_rp->ts_jiffies; + part->sn.uv.heartbeat_gpa = remote_rp->sn.uv.heartbeat_gpa; + part->sn.uv.activate_gru_mq_desc_gpa = + remote_rp->sn.uv.activate_gru_mq_desc_gpa; + + /* + * ??? Is it a good idea to make this conditional on what is + * ??? potentially stale state information? + */ + if (part->sn.uv.remote_act_state == XPC_P_AS_INACTIVE) { + msg.rp_gpa = uv_gpa(xpc_rsvd_page); + msg.heartbeat_gpa = xpc_rsvd_page->sn.uv.heartbeat_gpa; + msg.activate_gru_mq_desc_gpa = + xpc_rsvd_page->sn.uv.activate_gru_mq_desc_gpa; + xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV); + } + + if (part->act_state == XPC_P_AS_INACTIVE) + xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV); +} + +static void +xpc_request_partition_reactivation_uv(struct xpc_partition *part) +{ + xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV); +} + +static void +xpc_request_partition_deactivation_uv(struct xpc_partition *part) +{ + struct xpc_activate_mq_msg_deactivate_req_uv msg; + + /* + * ??? Is it a good idea to make this conditional on what is + * ??? potentially stale state information? + */ + if (part->sn.uv.remote_act_state != XPC_P_AS_DEACTIVATING && + part->sn.uv.remote_act_state != XPC_P_AS_INACTIVE) { + + msg.reason = part->reason; + xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV); + } +} + +static void +xpc_cancel_partition_deactivation_request_uv(struct xpc_partition *part) +{ + /* nothing needs to be done */ + return; +} + +static void +xpc_init_fifo_uv(struct xpc_fifo_head_uv *head) +{ + head->first = NULL; + head->last = NULL; + spin_lock_init(&head->lock); + head->n_entries = 0; +} + +static void * +xpc_get_fifo_entry_uv(struct xpc_fifo_head_uv *head) +{ + unsigned long irq_flags; + struct xpc_fifo_entry_uv *first; + + spin_lock_irqsave(&head->lock, irq_flags); + first = head->first; + if (head->first != NULL) { + head->first = first->next; + if (head->first == NULL) + head->last = NULL; + + head->n_entries--; + BUG_ON(head->n_entries < 0); + + first->next = NULL; + } + spin_unlock_irqrestore(&head->lock, irq_flags); + return first; +} + +static void +xpc_put_fifo_entry_uv(struct xpc_fifo_head_uv *head, + struct xpc_fifo_entry_uv *last) +{ + unsigned long irq_flags; + + last->next = NULL; + spin_lock_irqsave(&head->lock, irq_flags); + if (head->last != NULL) + head->last->next = last; + else + head->first = last; + head->last = last; + head->n_entries++; + spin_unlock_irqrestore(&head->lock, irq_flags); +} + +static int +xpc_n_of_fifo_entries_uv(struct xpc_fifo_head_uv *head) +{ + return head->n_entries; +} + +/* + * Setup the channel structures that are uv specific. + */ +static enum xp_retval +xpc_setup_ch_structures_uv(struct xpc_partition *part) +{ + struct xpc_channel_uv *ch_uv; + int ch_number; + + for (ch_number = 0; ch_number < part->nchannels; ch_number++) { + ch_uv = &part->channels[ch_number].sn.uv; + + xpc_init_fifo_uv(&ch_uv->msg_slot_free_list); + xpc_init_fifo_uv(&ch_uv->recv_msg_list); + } + + return xpSuccess; +} + +/* + * Teardown the channel structures that are uv specific. + */ +static void +xpc_teardown_ch_structures_uv(struct xpc_partition *part) +{ + /* nothing needs to be done */ + return; +} + +static enum xp_retval +xpc_make_first_contact_uv(struct xpc_partition *part) +{ + struct xpc_activate_mq_msg_uv msg; + + /* + * We send a sync msg to get the remote partition's remote_act_state + * updated to our current act_state which at this point should + * be XPC_P_AS_ACTIVATING. + */ + xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV); + + while (!((part->sn.uv.remote_act_state == XPC_P_AS_ACTIVATING) || + (part->sn.uv.remote_act_state == XPC_P_AS_ACTIVE))) { + + dev_dbg(xpc_part, "waiting to make first contact with " + "partition %d\n", XPC_PARTID(part)); + + /* wait a 1/4 of a second or so */ + (void)msleep_interruptible(250); + + if (part->act_state == XPC_P_AS_DEACTIVATING) + return part->reason; + } + + return xpSuccess; +} + +static u64 +xpc_get_chctl_all_flags_uv(struct xpc_partition *part) +{ + unsigned long irq_flags; + union xpc_channel_ctl_flags chctl; + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + chctl = part->chctl; + if (chctl.all_flags != 0) + part->chctl.all_flags = 0; + + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + return chctl.all_flags; +} + +static enum xp_retval +xpc_allocate_send_msg_slot_uv(struct xpc_channel *ch) +{ + struct xpc_channel_uv *ch_uv = &ch->sn.uv; + struct xpc_send_msg_slot_uv *msg_slot; + unsigned long irq_flags; + int nentries; + int entry; + size_t nbytes; + + for (nentries = ch->local_nentries; nentries > 0; nentries--) { + nbytes = nentries * sizeof(struct xpc_send_msg_slot_uv); + ch_uv->send_msg_slots = kzalloc(nbytes, GFP_KERNEL); + if (ch_uv->send_msg_slots == NULL) + continue; + + for (entry = 0; entry < nentries; entry++) { + msg_slot = &ch_uv->send_msg_slots[entry]; + + msg_slot->msg_slot_number = entry; + xpc_put_fifo_entry_uv(&ch_uv->msg_slot_free_list, + &msg_slot->next); + } + + spin_lock_irqsave(&ch->lock, irq_flags); + if (nentries < ch->local_nentries) + ch->local_nentries = nentries; + spin_unlock_irqrestore(&ch->lock, irq_flags); + return xpSuccess; + } + + return xpNoMemory; +} + +static enum xp_retval +xpc_allocate_recv_msg_slot_uv(struct xpc_channel *ch) +{ + struct xpc_channel_uv *ch_uv = &ch->sn.uv; + struct xpc_notify_mq_msg_uv *msg_slot; + unsigned long irq_flags; + int nentries; + int entry; + size_t nbytes; + + for (nentries = ch->remote_nentries; nentries > 0; nentries--) { + nbytes = nentries * ch->entry_size; + ch_uv->recv_msg_slots = kzalloc(nbytes, GFP_KERNEL); + if (ch_uv->recv_msg_slots == NULL) + continue; + + for (entry = 0; entry < nentries; entry++) { + msg_slot = ch_uv->recv_msg_slots + + entry * ch->entry_size; + + msg_slot->hdr.msg_slot_number = entry; + } + + spin_lock_irqsave(&ch->lock, irq_flags); + if (nentries < ch->remote_nentries) + ch->remote_nentries = nentries; + spin_unlock_irqrestore(&ch->lock, irq_flags); + return xpSuccess; + } + + return xpNoMemory; +} + +/* + * Allocate msg_slots associated with the channel. + */ +static enum xp_retval +xpc_setup_msg_structures_uv(struct xpc_channel *ch) +{ + static enum xp_retval ret; + struct xpc_channel_uv *ch_uv = &ch->sn.uv; + + DBUG_ON(ch->flags & XPC_C_SETUP); + + ch_uv->cached_notify_gru_mq_desc = kmalloc(sizeof(struct + gru_message_queue_desc), + GFP_KERNEL); + if (ch_uv->cached_notify_gru_mq_desc == NULL) + return xpNoMemory; + + ret = xpc_allocate_send_msg_slot_uv(ch); + if (ret == xpSuccess) { + + ret = xpc_allocate_recv_msg_slot_uv(ch); + if (ret != xpSuccess) { + kfree(ch_uv->send_msg_slots); + xpc_init_fifo_uv(&ch_uv->msg_slot_free_list); + } + } + return ret; +} + +/* + * Free up msg_slots and clear other stuff that were setup for the specified + * channel. + */ +static void +xpc_teardown_msg_structures_uv(struct xpc_channel *ch) +{ + struct xpc_channel_uv *ch_uv = &ch->sn.uv; + + DBUG_ON(!spin_is_locked(&ch->lock)); + + kfree(ch_uv->cached_notify_gru_mq_desc); + ch_uv->cached_notify_gru_mq_desc = NULL; + + if (ch->flags & XPC_C_SETUP) { + xpc_init_fifo_uv(&ch_uv->msg_slot_free_list); + kfree(ch_uv->send_msg_slots); + xpc_init_fifo_uv(&ch_uv->recv_msg_list); + kfree(ch_uv->recv_msg_slots); + } +} + +static void +xpc_send_chctl_closerequest_uv(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_activate_mq_msg_chctl_closerequest_uv msg; + + msg.ch_number = ch->number; + msg.reason = ch->reason; + xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV); +} + +static void +xpc_send_chctl_closereply_uv(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_activate_mq_msg_chctl_closereply_uv msg; + + msg.ch_number = ch->number; + xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV); +} + +static void +xpc_send_chctl_openrequest_uv(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_activate_mq_msg_chctl_openrequest_uv msg; + + msg.ch_number = ch->number; + msg.entry_size = ch->entry_size; + msg.local_nentries = ch->local_nentries; + xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV); +} + +static void +xpc_send_chctl_openreply_uv(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_activate_mq_msg_chctl_openreply_uv msg; + + msg.ch_number = ch->number; + msg.local_nentries = ch->local_nentries; + msg.remote_nentries = ch->remote_nentries; + msg.notify_gru_mq_desc_gpa = uv_gpa(xpc_notify_mq_uv->gru_mq_desc); + xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV); +} + +static void +xpc_send_chctl_opencomplete_uv(struct xpc_channel *ch, unsigned long *irq_flags) +{ + struct xpc_activate_mq_msg_chctl_opencomplete_uv msg; + + msg.ch_number = ch->number; + xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV); +} + +static void +xpc_send_chctl_local_msgrequest_uv(struct xpc_partition *part, int ch_number) +{ + unsigned long irq_flags; + + spin_lock_irqsave(&part->chctl_lock, irq_flags); + part->chctl.flags[ch_number] |= XPC_CHCTL_MSGREQUEST; + spin_unlock_irqrestore(&part->chctl_lock, irq_flags); + + xpc_wakeup_channel_mgr(part); +} + +static enum xp_retval +xpc_save_remote_msgqueue_pa_uv(struct xpc_channel *ch, + unsigned long gru_mq_desc_gpa) +{ + struct xpc_channel_uv *ch_uv = &ch->sn.uv; + + DBUG_ON(ch_uv->cached_notify_gru_mq_desc == NULL); + return xpc_cache_remote_gru_mq_desc_uv(ch_uv->cached_notify_gru_mq_desc, + gru_mq_desc_gpa); +} + +static void +xpc_indicate_partition_engaged_uv(struct xpc_partition *part) +{ + struct xpc_activate_mq_msg_uv msg; + + xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV); +} + +static void +xpc_indicate_partition_disengaged_uv(struct xpc_partition *part) +{ + struct xpc_activate_mq_msg_uv msg; + + xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg), + XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV); +} + +static void +xpc_assume_partition_disengaged_uv(short partid) +{ + struct xpc_partition_uv *part_uv = &xpc_partitions[partid].sn.uv; + unsigned long irq_flags; + + spin_lock_irqsave(&part_uv->flags_lock, irq_flags); + part_uv->flags &= ~XPC_P_ENGAGED_UV; + spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); +} + +static int +xpc_partition_engaged_uv(short partid) +{ + return (xpc_partitions[partid].sn.uv.flags & XPC_P_ENGAGED_UV) != 0; +} + +static int +xpc_any_partition_engaged_uv(void) +{ + struct xpc_partition_uv *part_uv; + short partid; + + for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) { + part_uv = &xpc_partitions[partid].sn.uv; + if ((part_uv->flags & XPC_P_ENGAGED_UV) != 0) + return 1; + } + return 0; +} + +static enum xp_retval +xpc_allocate_msg_slot_uv(struct xpc_channel *ch, u32 flags, + struct xpc_send_msg_slot_uv **address_of_msg_slot) +{ + enum xp_retval ret; + struct xpc_send_msg_slot_uv *msg_slot; + struct xpc_fifo_entry_uv *entry; + + while (1) { + entry = xpc_get_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list); + if (entry != NULL) + break; + + if (flags & XPC_NOWAIT) + return xpNoWait; + + ret = xpc_allocate_msg_wait(ch); + if (ret != xpInterrupted && ret != xpTimeout) + return ret; + } + + msg_slot = container_of(entry, struct xpc_send_msg_slot_uv, next); + *address_of_msg_slot = msg_slot; + return xpSuccess; +} + +static void +xpc_free_msg_slot_uv(struct xpc_channel *ch, + struct xpc_send_msg_slot_uv *msg_slot) +{ + xpc_put_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list, &msg_slot->next); + + /* wakeup anyone waiting for a free msg slot */ + if (atomic_read(&ch->n_on_msg_allocate_wq) > 0) + wake_up(&ch->msg_allocate_wq); +} + +static void +xpc_notify_sender_uv(struct xpc_channel *ch, + struct xpc_send_msg_slot_uv *msg_slot, + enum xp_retval reason) +{ + xpc_notify_func func = msg_slot->func; + + if (func != NULL && cmpxchg(&msg_slot->func, func, NULL) == func) { + + atomic_dec(&ch->n_to_notify); + + dev_dbg(xpc_chan, "msg_slot->func() called, msg_slot=0x%p " + "msg_slot_number=%d partid=%d channel=%d\n", msg_slot, + msg_slot->msg_slot_number, ch->partid, ch->number); + + func(reason, ch->partid, ch->number, msg_slot->key); + + dev_dbg(xpc_chan, "msg_slot->func() returned, msg_slot=0x%p " + "msg_slot_number=%d partid=%d channel=%d\n", msg_slot, + msg_slot->msg_slot_number, ch->partid, ch->number); + } +} + +static void +xpc_handle_notify_mq_ack_uv(struct xpc_channel *ch, + struct xpc_notify_mq_msg_uv *msg) +{ + struct xpc_send_msg_slot_uv *msg_slot; + int entry = msg->hdr.msg_slot_number % ch->local_nentries; + + msg_slot = &ch->sn.uv.send_msg_slots[entry]; + + BUG_ON(msg_slot->msg_slot_number != msg->hdr.msg_slot_number); + msg_slot->msg_slot_number += ch->local_nentries; + + if (msg_slot->func != NULL) + xpc_notify_sender_uv(ch, msg_slot, xpMsgDelivered); + + xpc_free_msg_slot_uv(ch, msg_slot); +} + +static void +xpc_handle_notify_mq_msg_uv(struct xpc_partition *part, + struct xpc_notify_mq_msg_uv *msg) +{ + struct xpc_partition_uv *part_uv = &part->sn.uv; + struct xpc_channel *ch; + struct xpc_channel_uv *ch_uv; + struct xpc_notify_mq_msg_uv *msg_slot; + unsigned long irq_flags; + int ch_number = msg->hdr.ch_number; + + if (unlikely(ch_number >= part->nchannels)) { + dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received invalid " + "channel number=0x%x in message from partid=%d\n", + ch_number, XPC_PARTID(part)); + + /* get hb checker to deactivate from the remote partition */ + spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags); + if (part_uv->act_state_req == 0) + xpc_activate_IRQ_rcvd++; + part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV; + part_uv->reason = xpBadChannelNumber; + spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags); + + wake_up_interruptible(&xpc_activate_IRQ_wq); + return; + } + + ch = &part->channels[ch_number]; + xpc_msgqueue_ref(ch); + + if (!(ch->flags & XPC_C_CONNECTED)) { + xpc_msgqueue_deref(ch); + return; + } + + /* see if we're really dealing with an ACK for a previously sent msg */ + if (msg->hdr.size == 0) { + xpc_handle_notify_mq_ack_uv(ch, msg); + xpc_msgqueue_deref(ch); + return; + } + + /* we're dealing with a normal message sent via the notify_mq */ + ch_uv = &ch->sn.uv; + + msg_slot = ch_uv->recv_msg_slots + + (msg->hdr.msg_slot_number % ch->remote_nentries) * ch->entry_size; + + BUG_ON(msg_slot->hdr.size != 0); + + memcpy(msg_slot, msg, msg->hdr.size); + + xpc_put_fifo_entry_uv(&ch_uv->recv_msg_list, &msg_slot->hdr.u.next); + + if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) { + /* + * If there is an existing idle kthread get it to deliver + * the payload, otherwise we'll have to get the channel mgr + * for this partition to create a kthread to do the delivery. + */ + if (atomic_read(&ch->kthreads_idle) > 0) + wake_up_nr(&ch->idle_wq, 1); + else + xpc_send_chctl_local_msgrequest_uv(part, ch->number); + } + xpc_msgqueue_deref(ch); +} + +static irqreturn_t +xpc_handle_notify_IRQ_uv(int irq, void *dev_id) +{ + struct xpc_notify_mq_msg_uv *msg; + short partid; + struct xpc_partition *part; + + while ((msg = gru_get_next_message(xpc_notify_mq_uv->gru_mq_desc)) != + NULL) { + + partid = msg->hdr.partid; + if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) { + dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received " + "invalid partid=0x%x in message\n", partid); + } else { + part = &xpc_partitions[partid]; + + if (xpc_part_ref(part)) { + xpc_handle_notify_mq_msg_uv(part, msg); + xpc_part_deref(part); + } + } + + gru_free_message(xpc_notify_mq_uv->gru_mq_desc, msg); + } + + return IRQ_HANDLED; +} + +static int +xpc_n_of_deliverable_payloads_uv(struct xpc_channel *ch) +{ + return xpc_n_of_fifo_entries_uv(&ch->sn.uv.recv_msg_list); +} + +static void +xpc_process_msg_chctl_flags_uv(struct xpc_partition *part, int ch_number) +{ + struct xpc_channel *ch = &part->channels[ch_number]; + int ndeliverable_payloads; + + xpc_msgqueue_ref(ch); + + ndeliverable_payloads = xpc_n_of_deliverable_payloads_uv(ch); + + if (ndeliverable_payloads > 0 && + (ch->flags & XPC_C_CONNECTED) && + (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)) { + + xpc_activate_kthreads(ch, ndeliverable_payloads); + } + + xpc_msgqueue_deref(ch); +} + +static enum xp_retval +xpc_send_payload_uv(struct xpc_channel *ch, u32 flags, void *payload, + u16 payload_size, u8 notify_type, xpc_notify_func func, + void *key) +{ + enum xp_retval ret = xpSuccess; + struct xpc_send_msg_slot_uv *msg_slot = NULL; + struct xpc_notify_mq_msg_uv *msg; + u8 msg_buffer[XPC_NOTIFY_MSG_SIZE_UV]; + size_t msg_size; + + DBUG_ON(notify_type != XPC_N_CALL); + + msg_size = sizeof(struct xpc_notify_mq_msghdr_uv) + payload_size; + if (msg_size > ch->entry_size) + return xpPayloadTooBig; + + xpc_msgqueue_ref(ch); + + if (ch->flags & XPC_C_DISCONNECTING) { + ret = ch->reason; + goto out_1; + } + if (!(ch->flags & XPC_C_CONNECTED)) { + ret = xpNotConnected; + goto out_1; + } + + ret = xpc_allocate_msg_slot_uv(ch, flags, &msg_slot); + if (ret != xpSuccess) + goto out_1; + + if (func != NULL) { + atomic_inc(&ch->n_to_notify); + + msg_slot->key = key; + smp_wmb(); /* a non-NULL func must hit memory after the key */ + msg_slot->func = func; + + if (ch->flags & XPC_C_DISCONNECTING) { + ret = ch->reason; + goto out_2; + } + } + + msg = (struct xpc_notify_mq_msg_uv *)&msg_buffer; + msg->hdr.partid = xp_partition_id; + msg->hdr.ch_number = ch->number; + msg->hdr.size = msg_size; + msg->hdr.msg_slot_number = msg_slot->msg_slot_number; + memcpy(&msg->payload, payload, payload_size); + + ret = xpc_send_gru_msg(ch->sn.uv.cached_notify_gru_mq_desc, msg, + msg_size); + if (ret == xpSuccess) + goto out_1; + + XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret); +out_2: + if (func != NULL) { + /* + * Try to NULL the msg_slot's func field. If we fail, then + * xpc_notify_senders_of_disconnect_uv() beat us to it, in which + * case we need to pretend we succeeded to send the message + * since the user will get a callout for the disconnect error + * by xpc_notify_senders_of_disconnect_uv(), and to also get an + * error returned here will confuse them. Additionally, since + * in this case the channel is being disconnected we don't need + * to put the the msg_slot back on the free list. + */ + if (cmpxchg(&msg_slot->func, func, NULL) != func) { + ret = xpSuccess; + goto out_1; + } + + msg_slot->key = NULL; + atomic_dec(&ch->n_to_notify); + } + xpc_free_msg_slot_uv(ch, msg_slot); +out_1: + xpc_msgqueue_deref(ch); + return ret; +} + +/* + * Tell the callers of xpc_send_notify() that the status of their payloads + * is unknown because the channel is now disconnecting. + * + * We don't worry about putting these msg_slots on the free list since the + * msg_slots themselves are about to be kfree'd. + */ +static void +xpc_notify_senders_of_disconnect_uv(struct xpc_channel *ch) +{ + struct xpc_send_msg_slot_uv *msg_slot; + int entry; + + DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING)); + + for (entry = 0; entry < ch->local_nentries; entry++) { + + if (atomic_read(&ch->n_to_notify) == 0) + break; + + msg_slot = &ch->sn.uv.send_msg_slots[entry]; + if (msg_slot->func != NULL) + xpc_notify_sender_uv(ch, msg_slot, ch->reason); + } +} + +/* + * Get the next deliverable message's payload. + */ +static void * +xpc_get_deliverable_payload_uv(struct xpc_channel *ch) +{ + struct xpc_fifo_entry_uv *entry; + struct xpc_notify_mq_msg_uv *msg; + void *payload = NULL; + + if (!(ch->flags & XPC_C_DISCONNECTING)) { + entry = xpc_get_fifo_entry_uv(&ch->sn.uv.recv_msg_list); + if (entry != NULL) { + msg = container_of(entry, struct xpc_notify_mq_msg_uv, + hdr.u.next); + payload = &msg->payload; + } + } + return payload; +} + +static void +xpc_received_payload_uv(struct xpc_channel *ch, void *payload) +{ + struct xpc_notify_mq_msg_uv *msg; + enum xp_retval ret; + + msg = container_of(payload, struct xpc_notify_mq_msg_uv, payload); + + /* return an ACK to the sender of this message */ + + msg->hdr.partid = xp_partition_id; + msg->hdr.size = 0; /* size of zero indicates this is an ACK */ + + ret = xpc_send_gru_msg(ch->sn.uv.cached_notify_gru_mq_desc, msg, + sizeof(struct xpc_notify_mq_msghdr_uv)); + if (ret != xpSuccess) + XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret); +} + +static struct xpc_arch_operations xpc_arch_ops_uv = { + .setup_partitions = xpc_setup_partitions_uv, + .teardown_partitions = xpc_teardown_partitions_uv, + .process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_uv, + .get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_uv, + .setup_rsvd_page = xpc_setup_rsvd_page_uv, + + .allow_hb = xpc_allow_hb_uv, + .disallow_hb = xpc_disallow_hb_uv, + .disallow_all_hbs = xpc_disallow_all_hbs_uv, + .increment_heartbeat = xpc_increment_heartbeat_uv, + .offline_heartbeat = xpc_offline_heartbeat_uv, + .online_heartbeat = xpc_online_heartbeat_uv, + .heartbeat_init = xpc_heartbeat_init_uv, + .heartbeat_exit = xpc_heartbeat_exit_uv, + .get_remote_heartbeat = xpc_get_remote_heartbeat_uv, + + .request_partition_activation = + xpc_request_partition_activation_uv, + .request_partition_reactivation = + xpc_request_partition_reactivation_uv, + .request_partition_deactivation = + xpc_request_partition_deactivation_uv, + .cancel_partition_deactivation_request = + xpc_cancel_partition_deactivation_request_uv, + + .setup_ch_structures = xpc_setup_ch_structures_uv, + .teardown_ch_structures = xpc_teardown_ch_structures_uv, + + .make_first_contact = xpc_make_first_contact_uv, + + .get_chctl_all_flags = xpc_get_chctl_all_flags_uv, + .send_chctl_closerequest = xpc_send_chctl_closerequest_uv, + .send_chctl_closereply = xpc_send_chctl_closereply_uv, + .send_chctl_openrequest = xpc_send_chctl_openrequest_uv, + .send_chctl_openreply = xpc_send_chctl_openreply_uv, + .send_chctl_opencomplete = xpc_send_chctl_opencomplete_uv, + .process_msg_chctl_flags = xpc_process_msg_chctl_flags_uv, + + .save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_uv, + + .setup_msg_structures = xpc_setup_msg_structures_uv, + .teardown_msg_structures = xpc_teardown_msg_structures_uv, + + .indicate_partition_engaged = xpc_indicate_partition_engaged_uv, + .indicate_partition_disengaged = xpc_indicate_partition_disengaged_uv, + .assume_partition_disengaged = xpc_assume_partition_disengaged_uv, + .partition_engaged = xpc_partition_engaged_uv, + .any_partition_engaged = xpc_any_partition_engaged_uv, + + .n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_uv, + .send_payload = xpc_send_payload_uv, + .get_deliverable_payload = xpc_get_deliverable_payload_uv, + .received_payload = xpc_received_payload_uv, + .notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv, +}; + +static int +xpc_init_mq_node(int nid) +{ + int cpu; + + get_online_cpus(); + + for_each_cpu(cpu, cpumask_of_node(nid)) { + xpc_activate_mq_uv = + xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, nid, + XPC_ACTIVATE_IRQ_NAME, + xpc_handle_activate_IRQ_uv); + if (!IS_ERR(xpc_activate_mq_uv)) + break; + } + if (IS_ERR(xpc_activate_mq_uv)) { + put_online_cpus(); + return PTR_ERR(xpc_activate_mq_uv); + } + + for_each_cpu(cpu, cpumask_of_node(nid)) { + xpc_notify_mq_uv = + xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, nid, + XPC_NOTIFY_IRQ_NAME, + xpc_handle_notify_IRQ_uv); + if (!IS_ERR(xpc_notify_mq_uv)) + break; + } + if (IS_ERR(xpc_notify_mq_uv)) { + xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); + put_online_cpus(); + return PTR_ERR(xpc_notify_mq_uv); + } + + put_online_cpus(); + return 0; +} + +int +xpc_init_uv(void) +{ + int nid; + int ret = 0; + + xpc_arch_ops = xpc_arch_ops_uv; + + if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) { + dev_err(xpc_part, "xpc_notify_mq_msghdr_uv is larger than %d\n", + XPC_MSG_HDR_MAX_SIZE); + return -E2BIG; + } + + if (xpc_mq_node < 0) + for_each_online_node(nid) { + ret = xpc_init_mq_node(nid); + + if (!ret) + break; + } + else + ret = xpc_init_mq_node(xpc_mq_node); + + if (ret < 0) + dev_err(xpc_part, "xpc_init_mq_node() returned error=%d\n", + -ret); + + return ret; +} + +void +xpc_exit_uv(void) +{ + xpc_destroy_gru_mq_uv(xpc_notify_mq_uv); + xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); +} + +module_param(xpc_mq_node, int, 0); +MODULE_PARM_DESC(xpc_mq_node, "Node number on which to allocate message queues."); diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c new file mode 100644 index 000000000..44d750d98 --- /dev/null +++ b/drivers/misc/sgi-xp/xpnet.c @@ -0,0 +1,596 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1999-2009 Silicon Graphics, Inc. All rights reserved. + */ + +/* + * Cross Partition Network Interface (XPNET) support + * + * XPNET provides a virtual network layered on top of the Cross + * Partition communication layer. + * + * XPNET provides direct point-to-point and broadcast-like support + * for an ethernet-like device. The ethernet broadcast medium is + * replaced with a point-to-point message structure which passes + * pointers to a DMA-capable block that a remote partition should + * retrieve and pass to the upper level networking layer. + * + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include "xp.h" + +/* + * The message payload transferred by XPC. + * + * buf_pa is the physical address where the DMA should pull from. + * + * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a + * cacheline boundary. To accomplish this, we record the number of + * bytes from the beginning of the first cacheline to the first useful + * byte of the skb (leadin_ignore) and the number of bytes from the + * last useful byte of the skb to the end of the last cacheline + * (tailout_ignore). + * + * size is the number of bytes to transfer which includes the skb->len + * (useful bytes of the senders skb) plus the leadin and tailout + */ +struct xpnet_message { + u16 version; /* Version for this message */ + u16 embedded_bytes; /* #of bytes embedded in XPC message */ + u32 magic; /* Special number indicating this is xpnet */ + unsigned long buf_pa; /* phys address of buffer to retrieve */ + u32 size; /* #of bytes in buffer */ + u8 leadin_ignore; /* #of bytes to ignore at the beginning */ + u8 tailout_ignore; /* #of bytes to ignore at the end */ + unsigned char data; /* body of small packets */ +}; + +/* + * Determine the size of our message, the cacheline aligned size, + * and then the number of message will request from XPC. + * + * XPC expects each message to exist in an individual cacheline. + */ +#define XPNET_MSG_SIZE XPC_MSG_PAYLOAD_MAX_SIZE +#define XPNET_MSG_DATA_MAX \ + (XPNET_MSG_SIZE - offsetof(struct xpnet_message, data)) +#define XPNET_MSG_NENTRIES (PAGE_SIZE / XPC_MSG_MAX_SIZE) + +#define XPNET_MAX_KTHREADS (XPNET_MSG_NENTRIES + 1) +#define XPNET_MAX_IDLE_KTHREADS (XPNET_MSG_NENTRIES + 1) + +/* + * Version number of XPNET implementation. XPNET can always talk to versions + * with same major #, and never talk to versions with a different version. + */ +#define _XPNET_VERSION(_major, _minor) (((_major) << 4) | (_minor)) +#define XPNET_VERSION_MAJOR(_v) ((_v) >> 4) +#define XPNET_VERSION_MINOR(_v) ((_v) & 0xf) + +#define XPNET_VERSION _XPNET_VERSION(1, 0) /* version 1.0 */ +#define XPNET_VERSION_EMBED _XPNET_VERSION(1, 1) /* version 1.1 */ +#define XPNET_MAGIC 0x88786984 /* "XNET" */ + +#define XPNET_VALID_MSG(_m) \ + ((XPNET_VERSION_MAJOR(_m->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \ + && (msg->magic == XPNET_MAGIC)) + +#define XPNET_DEVICE_NAME "xp0" + +/* + * When messages are queued with xpc_send_notify, a kmalloc'd buffer + * of the following type is passed as a notification cookie. When the + * notification function is called, we use the cookie to decide + * whether all outstanding message sends have completed. The skb can + * then be released. + */ +struct xpnet_pending_msg { + struct sk_buff *skb; + atomic_t use_count; +}; + +struct net_device *xpnet_device; + +/* + * When we are notified of other partitions activating, we add them to + * our bitmask of partitions to which we broadcast. + */ +static unsigned long *xpnet_broadcast_partitions; +/* protect above */ +static DEFINE_SPINLOCK(xpnet_broadcast_lock); + +/* + * Since the Block Transfer Engine (BTE) is being used for the transfer + * and it relies upon cache-line size transfers, we need to reserve at + * least one cache-line for head and tail alignment. The BTE is + * limited to 8MB transfers. + * + * Testing has shown that changing MTU to greater than 64KB has no effect + * on TCP as the two sides negotiate a Max Segment Size that is limited + * to 64K. Other protocols May use packets greater than this, but for + * now, the default is 64KB. + */ +#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES) +/* 68 comes from min TCP+IP+MAC header */ +#define XPNET_MIN_MTU 68 +/* 32KB has been determined to be the ideal */ +#define XPNET_DEF_MTU (0x8000UL) + +/* + * The partid is encapsulated in the MAC address beginning in the following + * octet and it consists of two octets. + */ +#define XPNET_PARTID_OCTET 2 + +/* Define the XPNET debug device structures to be used with dev_dbg() et al */ + +struct device_driver xpnet_dbg_name = { + .name = "xpnet" +}; + +struct device xpnet_dbg_subname = { + .init_name = "", /* set to "" */ + .driver = &xpnet_dbg_name +}; + +struct device *xpnet = &xpnet_dbg_subname; + +/* + * Packet was recevied by XPC and forwarded to us. + */ +static void +xpnet_receive(short partid, int channel, struct xpnet_message *msg) +{ + struct sk_buff *skb; + void *dst; + enum xp_retval ret; + + if (!XPNET_VALID_MSG(msg)) { + /* + * Packet with a different XPC version. Ignore. + */ + xpc_received(partid, channel, (void *)msg); + + xpnet_device->stats.rx_errors++; + + return; + } + dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size, + msg->leadin_ignore, msg->tailout_ignore); + + /* reserve an extra cache line */ + skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES); + if (!skb) { + dev_err(xpnet, "failed on dev_alloc_skb(%d)\n", + msg->size + L1_CACHE_BYTES); + + xpc_received(partid, channel, (void *)msg); + + xpnet_device->stats.rx_errors++; + + return; + } + + /* + * The allocated skb has some reserved space. + * In order to use xp_remote_memcpy(), we need to get the + * skb->data pointer moved forward. + */ + skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data & + (L1_CACHE_BYTES - 1)) + + msg->leadin_ignore)); + + /* + * Update the tail pointer to indicate data actually + * transferred. + */ + skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore)); + + /* + * Move the data over from the other side. + */ + if ((XPNET_VERSION_MINOR(msg->version) == 1) && + (msg->embedded_bytes != 0)) { + dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, " + "%lu)\n", skb->data, &msg->data, + (size_t)msg->embedded_bytes); + + skb_copy_to_linear_data(skb, &msg->data, + (size_t)msg->embedded_bytes); + } else { + dst = (void *)((u64)skb->data & ~(L1_CACHE_BYTES - 1)); + dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t" + "xp_remote_memcpy(0x%p, 0x%p, %hu)\n", dst, + (void *)msg->buf_pa, msg->size); + + ret = xp_remote_memcpy(xp_pa(dst), msg->buf_pa, msg->size); + if (ret != xpSuccess) { + /* + * !!! Need better way of cleaning skb. Currently skb + * !!! appears in_use and we can't just call + * !!! dev_kfree_skb. + */ + dev_err(xpnet, "xp_remote_memcpy(0x%p, 0x%p, 0x%hx) " + "returned error=0x%x\n", dst, + (void *)msg->buf_pa, msg->size, ret); + + xpc_received(partid, channel, (void *)msg); + + xpnet_device->stats.rx_errors++; + + return; + } + } + + dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p " + "skb->end=0x%p skb->len=%d\n", (void *)skb->head, + (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb), + skb->len); + + skb->protocol = eth_type_trans(skb, xpnet_device); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + dev_dbg(xpnet, "passing skb to network layer\n" + "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p " + "skb->end=0x%p skb->len=%d\n", + (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb), + skb_end_pointer(skb), skb->len); + + xpnet_device->stats.rx_packets++; + xpnet_device->stats.rx_bytes += skb->len + ETH_HLEN; + + netif_rx_ni(skb); + xpc_received(partid, channel, (void *)msg); +} + +/* + * This is the handler which XPC calls during any sort of change in + * state or message reception on a connection. + */ +static void +xpnet_connection_activity(enum xp_retval reason, short partid, int channel, + void *data, void *key) +{ + DBUG_ON(partid < 0 || partid >= xp_max_npartitions); + DBUG_ON(channel != XPC_NET_CHANNEL); + + switch (reason) { + case xpMsgReceived: /* message received */ + DBUG_ON(data == NULL); + + xpnet_receive(partid, channel, (struct xpnet_message *)data); + break; + + case xpConnected: /* connection completed to a partition */ + spin_lock_bh(&xpnet_broadcast_lock); + __set_bit(partid, xpnet_broadcast_partitions); + spin_unlock_bh(&xpnet_broadcast_lock); + + netif_carrier_on(xpnet_device); + + dev_dbg(xpnet, "%s connected to partition %d\n", + xpnet_device->name, partid); + break; + + default: + spin_lock_bh(&xpnet_broadcast_lock); + __clear_bit(partid, xpnet_broadcast_partitions); + spin_unlock_bh(&xpnet_broadcast_lock); + + if (bitmap_empty((unsigned long *)xpnet_broadcast_partitions, + xp_max_npartitions)) { + netif_carrier_off(xpnet_device); + } + + dev_dbg(xpnet, "%s disconnected from partition %d\n", + xpnet_device->name, partid); + break; + } +} + +static int +xpnet_dev_open(struct net_device *dev) +{ + enum xp_retval ret; + + dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, " + "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity, + (unsigned long)XPNET_MSG_SIZE, + (unsigned long)XPNET_MSG_NENTRIES, + (unsigned long)XPNET_MAX_KTHREADS, + (unsigned long)XPNET_MAX_IDLE_KTHREADS); + + ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL, + XPNET_MSG_SIZE, XPNET_MSG_NENTRIES, + XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS); + if (ret != xpSuccess) { + dev_err(xpnet, "ifconfig up of %s failed on XPC connect, " + "ret=%d\n", dev->name, ret); + + return -ENOMEM; + } + + dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name); + + return 0; +} + +static int +xpnet_dev_stop(struct net_device *dev) +{ + xpc_disconnect(XPC_NET_CHANNEL); + + dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name); + + return 0; +} + +/* + * Notification that the other end has received the message and + * DMA'd the skb information. At this point, they are done with + * our side. When all recipients are done processing, we + * release the skb and then release our pending message structure. + */ +static void +xpnet_send_completed(enum xp_retval reason, short partid, int channel, + void *__qm) +{ + struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm; + + DBUG_ON(queued_msg == NULL); + + dev_dbg(xpnet, "message to %d notified with reason %d\n", + partid, reason); + + if (atomic_dec_return(&queued_msg->use_count) == 0) { + dev_dbg(xpnet, "all acks for skb->head=-x%p\n", + (void *)queued_msg->skb->head); + + dev_kfree_skb_any(queued_msg->skb); + kfree(queued_msg); + } +} + +static void +xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg, + u64 start_addr, u64 end_addr, u16 embedded_bytes, int dest_partid) +{ + u8 msg_buffer[XPNET_MSG_SIZE]; + struct xpnet_message *msg = (struct xpnet_message *)&msg_buffer; + u16 msg_size = sizeof(struct xpnet_message); + enum xp_retval ret; + + msg->embedded_bytes = embedded_bytes; + if (unlikely(embedded_bytes != 0)) { + msg->version = XPNET_VERSION_EMBED; + dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n", + &msg->data, skb->data, (size_t)embedded_bytes); + skb_copy_from_linear_data(skb, &msg->data, + (size_t)embedded_bytes); + msg_size += embedded_bytes - 1; + } else { + msg->version = XPNET_VERSION; + } + msg->magic = XPNET_MAGIC; + msg->size = end_addr - start_addr; + msg->leadin_ignore = (u64)skb->data - start_addr; + msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb); + msg->buf_pa = xp_pa((void *)start_addr); + + dev_dbg(xpnet, "sending XPC message to %d:%d\n" + "msg->buf_pa=0x%lx, msg->size=%u, " + "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n", + dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size, + msg->leadin_ignore, msg->tailout_ignore); + + atomic_inc(&queued_msg->use_count); + + ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, XPC_NOWAIT, msg, + msg_size, xpnet_send_completed, queued_msg); + if (unlikely(ret != xpSuccess)) + atomic_dec(&queued_msg->use_count); +} + +/* + * Network layer has formatted a packet (skb) and is ready to place it + * "on the wire". Prepare and send an xpnet_message to all partitions + * which have connected with us and are targets of this packet. + * + * MAC-NOTE: For the XPNET driver, the MAC address contains the + * destination partid. If the destination partid octets are 0xffff, + * this packet is to be broadcast to all connected partitions. + */ +static netdev_tx_t +xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct xpnet_pending_msg *queued_msg; + u64 start_addr, end_addr; + short dest_partid; + u16 embedded_bytes = 0; + + dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p " + "skb->end=0x%p skb->len=%d\n", (void *)skb->head, + (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb), + skb->len); + + if (skb->data[0] == 0x33) { + dev_kfree_skb(skb); + return NETDEV_TX_OK; /* nothing needed to be done */ + } + + /* + * The xpnet_pending_msg tracks how many outstanding + * xpc_send_notifies are relying on this skb. When none + * remain, release the skb. + */ + queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC); + if (queued_msg == NULL) { + dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping " + "packet\n", sizeof(struct xpnet_pending_msg)); + + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + /* get the beginning of the first cacheline and end of last */ + start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1)); + end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb)); + + /* calculate how many bytes to embed in the XPC message */ + if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) { + /* skb->data does fit so embed */ + embedded_bytes = skb->len; + } + + /* + * Since the send occurs asynchronously, we set the count to one + * and begin sending. Any sends that happen to complete before + * we are done sending will not free the skb. We will be left + * with that task during exit. This also handles the case of + * a packet destined for a partition which is no longer up. + */ + atomic_set(&queued_msg->use_count, 1); + queued_msg->skb = skb; + + if (skb->data[0] == 0xff) { + /* we are being asked to broadcast to all partitions */ + for_each_set_bit(dest_partid, xpnet_broadcast_partitions, + xp_max_npartitions) { + + xpnet_send(skb, queued_msg, start_addr, end_addr, + embedded_bytes, dest_partid); + } + } else { + dest_partid = (short)skb->data[XPNET_PARTID_OCTET + 1]; + dest_partid |= (short)skb->data[XPNET_PARTID_OCTET + 0] << 8; + + if (dest_partid >= 0 && + dest_partid < xp_max_npartitions && + test_bit(dest_partid, xpnet_broadcast_partitions) != 0) { + + xpnet_send(skb, queued_msg, start_addr, end_addr, + embedded_bytes, dest_partid); + } + } + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (atomic_dec_return(&queued_msg->use_count) == 0) { + dev_kfree_skb(skb); + kfree(queued_msg); + } + + return NETDEV_TX_OK; +} + +/* + * Deal with transmit timeouts coming from the network layer. + */ +static void +xpnet_dev_tx_timeout(struct net_device *dev) +{ + dev->stats.tx_errors++; +} + +static const struct net_device_ops xpnet_netdev_ops = { + .ndo_open = xpnet_dev_open, + .ndo_stop = xpnet_dev_stop, + .ndo_start_xmit = xpnet_dev_hard_start_xmit, + .ndo_tx_timeout = xpnet_dev_tx_timeout, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; + +static int __init +xpnet_init(void) +{ + int result; + + if (!is_shub() && !is_uv()) + return -ENODEV; + + dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME); + + xpnet_broadcast_partitions = kcalloc(BITS_TO_LONGS(xp_max_npartitions), + sizeof(long), + GFP_KERNEL); + if (xpnet_broadcast_partitions == NULL) + return -ENOMEM; + + /* + * use ether_setup() to init the majority of our device + * structure and then override the necessary pieces. + */ + xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_UNKNOWN, + ether_setup); + if (xpnet_device == NULL) { + kfree(xpnet_broadcast_partitions); + return -ENOMEM; + } + + netif_carrier_off(xpnet_device); + + xpnet_device->netdev_ops = &xpnet_netdev_ops; + xpnet_device->mtu = XPNET_DEF_MTU; + xpnet_device->min_mtu = XPNET_MIN_MTU; + xpnet_device->max_mtu = XPNET_MAX_MTU; + + /* + * Multicast assumes the LSB of the first octet is set for multicast + * MAC addresses. We chose the first octet of the MAC to be unlikely + * to collide with any vendor's officially issued MAC. + */ + xpnet_device->dev_addr[0] = 0x02; /* locally administered, no OUI */ + + xpnet_device->dev_addr[XPNET_PARTID_OCTET + 1] = xp_partition_id; + xpnet_device->dev_addr[XPNET_PARTID_OCTET + 0] = (xp_partition_id >> 8); + + /* + * ether_setup() sets this to a multicast device. We are + * really not supporting multicast at this time. + */ + xpnet_device->flags &= ~IFF_MULTICAST; + + /* + * No need to checksum as it is a DMA transfer. The BTE will + * report an error if the data is not retrievable and the + * packet will be dropped. + */ + xpnet_device->features = NETIF_F_HW_CSUM; + + result = register_netdev(xpnet_device); + if (result != 0) { + free_netdev(xpnet_device); + kfree(xpnet_broadcast_partitions); + } + + return result; +} + +module_init(xpnet_init); + +static void __exit +xpnet_exit(void) +{ + dev_info(xpnet, "unregistering network device %s\n", + xpnet_device[0].name); + + unregister_netdev(xpnet_device); + free_netdev(xpnet_device); + kfree(xpnet_broadcast_partitions); +} + +module_exit(xpnet_exit); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)"); +MODULE_LICENSE("GPL"); |