13 files changed, 9844 insertions, 0 deletions
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
new file mode 100644
index 000000000..bbb622c19
--- /dev/null
+++ b/drivers/misc/sgi-xp/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for SGI's XP devices.
+#
+
+obj-$(CONFIG_SGI_XP)		+= xp.o
+xp-y				:= xp_main.o
+xp-$(CONFIG_IA64_SGI_SN2)	+= xp_sn2.o xp_nofault.o
+xp-$(CONFIG_IA64_GENERIC)	+= xp_sn2.o xp_nofault.o
+xp-$(CONFIG_IA64_SGI_UV)	+= xp_uv.o
+xp-$(CONFIG_X86_64)		+= xp_uv.o
+
+obj-$(CONFIG_SGI_XP)		+= xpc.o
+xpc-y				:= xpc_main.o xpc_channel.o xpc_partition.o
+xpc-$(CONFIG_IA64_SGI_SN2)	+= xpc_sn2.o
+xpc-$(CONFIG_IA64_GENERIC)	+= xpc_sn2.o
+xpc-$(CONFIG_IA64_SGI_UV) 	+= xpc_uv.o
+xpc-$(CONFIG_X86_64)		+= xpc_uv.o
+
+obj-$(CONFIG_SGI_XP)		+= xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
new file mode 100644
index 000000000..b8069eec1
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp.h
@@ -0,0 +1,368 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+/*
+ * External Cross Partition (XP) structures and defines.
+ */
+
+#ifndef _DRIVERS_MISC_SGIXP_XP_H
+#define _DRIVERS_MISC_SGIXP_XP_H
+
+#include <linux/mutex.h>
+
+#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV
+#include <asm/uv/uv.h>
+#define is_uv()		is_uv_system()
+#endif
+
+#ifndef is_uv
+#define is_uv()		0
+#endif
+
+#if defined CONFIG_IA64
+#include <asm/sn/arch.h>	/* defines is_shub1() and is_shub2() */
+#define is_shub()	ia64_platform_is("sn2")
+#endif
+
+#ifndef is_shub1
+#define is_shub1()	0
+#endif
+
+#ifndef is_shub2
+#define is_shub2()	0
+#endif
+
+#ifndef is_shub
+#define is_shub()	0
+#endif
+
+#ifdef USE_DBUG_ON
+#define DBUG_ON(condition)	BUG_ON(condition)
+#else
+#define DBUG_ON(condition)
+#endif
+
+/*
+ * Define the maximum number of partitions the system can possibly support.
+ * It is based on the maximum number of hardware partitionable regions. The
+ * term 'region' in this context refers to the minimum number of nodes that
+ * can comprise an access protection grouping. The access protection is in
+ * regards to memory, IPI and IOI.
+ *
+ * The maximum number of hardware partitionable regions is equal to the
+ * maximum number of nodes in the entire system divided by the minimum number
+ * of nodes that comprise an access protection grouping.
+ */
+#define XP_MAX_NPARTITIONS_SN2	64
+#define XP_MAX_NPARTITIONS_UV	256
+
+/*
+ * XPC establishes channel connections between the local partition and any
+ * other partition that is currently up. Over these channels, kernel-level
+ * `users' can communicate with their counterparts on the other partitions.
+ *
+ * If the need for additional channels arises, one can simply increase
+ * XPC_MAX_NCHANNELS accordingly. If the day should come where that number
+ * exceeds the absolute MAXIMUM number of channels possible (eight), then one
+ * will need to make changes to the XPC code to accommodate for this.
+ *
+ * The absolute maximum number of channels possible is limited to eight for
+ * performance reasons on sn2 hardware. The internal cross partition structures
+ * require sixteen bytes per channel, and eight allows all of this
+ * interface-shared info to fit in one 128-byte cacheline.
+ */
+#define XPC_MEM_CHANNEL		0	/* memory channel number */
+#define	XPC_NET_CHANNEL		1	/* network channel number */
+
+#define XPC_MAX_NCHANNELS	2	/* max #of channels allowed */
+
+#if XPC_MAX_NCHANNELS > 8
+#error	XPC_MAX_NCHANNELS exceeds absolute MAXIMUM possible.
+#endif
+
+/*
+ * Define macro, XPC_MSG_SIZE(), is provided for the user
+ * that wants to fit as many msg entries as possible in a given memory size
+ * (e.g. a memory page).
+ */
+#define XPC_MSG_MAX_SIZE	128
+#define XPC_MSG_HDR_MAX_SIZE	16
+#define XPC_MSG_PAYLOAD_MAX_SIZE (XPC_MSG_MAX_SIZE - XPC_MSG_HDR_MAX_SIZE)
+
+#define XPC_MSG_SIZE(_payload_size) \
+				ALIGN(XPC_MSG_HDR_MAX_SIZE + (_payload_size), \
+				      is_uv() ? 64 : 128)
+
+
+/*
+ * Define the return values and values passed to user's callout functions.
+ * (It is important to add new value codes at the end just preceding
+ * xpUnknownReason, which must have the highest numerical value.)
+ */
+enum xp_retval {
+	xpSuccess = 0,
+
+	xpNotConnected,		/*  1: channel is not connected */
+	xpConnected,		/*  2: channel connected (opened) */
+	xpRETIRED1,		/*  3: (formerly xpDisconnected) */
+
+	xpMsgReceived,		/*  4: message received */
+	xpMsgDelivered,		/*  5: message delivered and acknowledged */
+
+	xpRETIRED2,		/*  6: (formerly xpTransferFailed) */
+
+	xpNoWait,		/*  7: operation would require wait */
+	xpRetry,		/*  8: retry operation */
+	xpTimeout,		/*  9: timeout in xpc_allocate_msg_wait() */
+	xpInterrupted,		/* 10: interrupted wait */
+
+	xpUnequalMsgSizes,	/* 11: message size disparity between sides */
+	xpInvalidAddress,	/* 12: invalid address */
+
+	xpNoMemory,		/* 13: no memory available for XPC structures */
+	xpLackOfResources,	/* 14: insufficient resources for operation */
+	xpUnregistered,		/* 15: channel is not registered */
+	xpAlreadyRegistered,	/* 16: channel is already registered */
+
+	xpPartitionDown,	/* 17: remote partition is down */
+	xpNotLoaded,		/* 18: XPC module is not loaded */
+	xpUnloading,		/* 19: this side is unloading XPC module */
+
+	xpBadMagic,		/* 20: XPC MAGIC string not found */
+
+	xpReactivating,		/* 21: remote partition was reactivated */
+
+	xpUnregistering,	/* 22: this side is unregistering channel */
+	xpOtherUnregistering,	/* 23: other side is unregistering channel */
+
+	xpCloneKThread,		/* 24: cloning kernel thread */
+	xpCloneKThreadFailed,	/* 25: cloning kernel thread failed */
+
+	xpNoHeartbeat,		/* 26: remote partition has no heartbeat */
+
+	xpPioReadError,		/* 27: PIO read error */
+	xpPhysAddrRegFailed,	/* 28: registration of phys addr range failed */
+
+	xpRETIRED3,		/* 29: (formerly xpBteDirectoryError) */
+	xpRETIRED4,		/* 30: (formerly xpBtePoisonError) */
+	xpRETIRED5,		/* 31: (formerly xpBteWriteError) */
+	xpRETIRED6,		/* 32: (formerly xpBteAccessError) */
+	xpRETIRED7,		/* 33: (formerly xpBtePWriteError) */
+	xpRETIRED8,		/* 34: (formerly xpBtePReadError) */
+	xpRETIRED9,		/* 35: (formerly xpBteTimeOutError) */
+	xpRETIRED10,		/* 36: (formerly xpBteXtalkError) */
+	xpRETIRED11,		/* 37: (formerly xpBteNotAvailable) */
+	xpRETIRED12,		/* 38: (formerly xpBteUnmappedError) */
+
+	xpBadVersion,		/* 39: bad version number */
+	xpVarsNotSet,		/* 40: the XPC variables are not set up */
+	xpNoRsvdPageAddr,	/* 41: unable to get rsvd page's phys addr */
+	xpInvalidPartid,	/* 42: invalid partition ID */
+	xpLocalPartid,		/* 43: local partition ID */
+
+	xpOtherGoingDown,	/* 44: other side going down, reason unknown */
+	xpSystemGoingDown,	/* 45: system is going down, reason unknown */
+	xpSystemHalt,		/* 46: system is being halted */
+	xpSystemReboot,		/* 47: system is being rebooted */
+	xpSystemPoweroff,	/* 48: system is being powered off */
+
+	xpDisconnecting,	/* 49: channel disconnecting (closing) */
+
+	xpOpenCloseError,	/* 50: channel open/close protocol error */
+
+	xpDisconnected,		/* 51: channel disconnected (closed) */
+
+	xpBteCopyError,		/* 52: bte_copy() returned error */
+	xpSalError,		/* 53: sn SAL error */
+	xpRsvdPageNotSet,	/* 54: the reserved page is not set up */
+	xpPayloadTooBig,	/* 55: payload too large for message slot */
+
+	xpUnsupported,		/* 56: unsupported functionality or resource */
+	xpNeedMoreInfo,		/* 57: more info is needed by SAL */
+
+	xpGruCopyError,		/* 58: gru_copy_gru() returned error */
+	xpGruSendMqError,	/* 59: gru send message queue related error */
+
+	xpBadChannelNumber,	/* 60: invalid channel number */
+	xpBadMsgType,		/* 61: invalid message type */
+	xpBiosError,		/* 62: BIOS error */
+
+	xpUnknownReason		/* 63: unknown reason - must be last in enum */
+};
+
+/*
+ * Define the callout function type used by XPC to update the user on
+ * connection activity and state changes via the user function registered
+ * by xpc_connect().
+ *
+ * Arguments:
+ *
+ *	reason - reason code.
+ *	partid - partition ID associated with condition.
+ *	ch_number - channel # associated with condition.
+ *	data - pointer to optional data.
+ *	key - pointer to optional user-defined value provided as the "key"
+ *	      argument to xpc_connect().
+ *
+ * A reason code of xpConnected indicates that a connection has been
+ * established to the specified partition on the specified channel. The data
+ * argument indicates the max number of entries allowed in the message queue.
+ *
+ * A reason code of xpMsgReceived indicates that a XPC message arrived from
+ * the specified partition on the specified channel. The data argument
+ * specifies the address of the message's payload. The user must call
+ * xpc_received() when finished with the payload.
+ *
+ * All other reason codes indicate failure. The data argmument is NULL.
+ * When a failure reason code is received, one can assume that the channel
+ * is not connected.
+ */
+typedef void (*xpc_channel_func) (enum xp_retval reason, short partid,
+				  int ch_number, void *data, void *key);
+
+/*
+ * Define the callout function type used by XPC to notify the user of
+ * messages received and delivered via the user function registered by
+ * xpc_send_notify().
+ *
+ * Arguments:
+ *
+ *	reason - reason code.
+ *	partid - partition ID associated with condition.
+ *	ch_number - channel # associated with condition.
+ *	key - pointer to optional user-defined value provided as the "key"
+ *	      argument to xpc_send_notify().
+ *
+ * A reason code of xpMsgDelivered indicates that the message was delivered
+ * to the intended recipient and that they have acknowledged its receipt by
+ * calling xpc_received().
+ *
+ * All other reason codes indicate failure.
+ *
+ * NOTE: The user defined function must be callable by an interrupt handler
+ *       and thus cannot block.
+ */
+typedef void (*xpc_notify_func) (enum xp_retval reason, short partid,
+				 int ch_number, void *key);
+
+/*
+ * The following is a registration entry. There is a global array of these,
+ * one per channel. It is used to record the connection registration made
+ * by the users of XPC. As long as a registration entry exists, for any
+ * partition that comes up, XPC will attempt to establish a connection on
+ * that channel. Notification that a connection has been made will occur via
+ * the xpc_channel_func function.
+ *
+ * The 'func' field points to the function to call when aynchronous
+ * notification is required for such events as: a connection established/lost,
+ * or an incoming message received, or an error condition encountered. A
+ * non-NULL 'func' field indicates that there is an active registration for
+ * the channel.
+ */
+struct xpc_registration {
+	struct mutex mutex;
+	xpc_channel_func func;	/* function to call */
+	void *key;		/* pointer to user's key */
+	u16 nentries;		/* #of msg entries in local msg queue */
+	u16 entry_size;		/* message queue's message entry size */
+	u32 assigned_limit;	/* limit on #of assigned kthreads */
+	u32 idle_limit;		/* limit on #of idle kthreads */
+} ____cacheline_aligned;
+
+#define XPC_CHANNEL_REGISTERED(_c)	(xpc_registrations[_c].func != NULL)
+
+/* the following are valid xpc_send() or xpc_send_notify() flags */
+#define XPC_WAIT	0	/* wait flag */
+#define XPC_NOWAIT	1	/* no wait flag */
+
+struct xpc_interface {
+	void (*connect) (int);
+	void (*disconnect) (int);
+	enum xp_retval (*send) (short, int, u32, void *, u16);
+	enum xp_retval (*send_notify) (short, int, u32, void *, u16,
+					xpc_notify_func, void *);
+	void (*received) (short, int, void *);
+	enum xp_retval (*partid_to_nasids) (short, void *);
+};
+
+extern struct xpc_interface xpc_interface;
+
+extern void xpc_set_interface(void (*)(int),
+			      void (*)(int),
+			      enum xp_retval (*)(short, int, u32, void *, u16),
+			      enum xp_retval (*)(short, int, u32, void *, u16,
+						 xpc_notify_func, void *),
+			      void (*)(short, int, void *),
+			      enum xp_retval (*)(short, void *));
+extern void xpc_clear_interface(void);
+
+extern enum xp_retval xpc_connect(int, xpc_channel_func, void *, u16,
+				   u16, u32, u32);
+extern void xpc_disconnect(int);
+
+static inline enum xp_retval
+xpc_send(short partid, int ch_number, u32 flags, void *payload,
+	 u16 payload_size)
+{
+	if (!xpc_interface.send)
+		return xpNotLoaded;
+
+	return xpc_interface.send(partid, ch_number, flags, payload,
+				  payload_size);
+}
+
+static inline enum xp_retval
+xpc_send_notify(short partid, int ch_number, u32 flags, void *payload,
+		u16 payload_size, xpc_notify_func func, void *key)
+{
+	if (!xpc_interface.send_notify)
+		return xpNotLoaded;
+
+	return xpc_interface.send_notify(partid, ch_number, flags, payload,
+					 payload_size, func, key);
+}
+
+static inline void
+xpc_received(short partid, int ch_number, void *payload)
+{
+	if (xpc_interface.received)
+		xpc_interface.received(partid, ch_number, payload);
+}
+
+static inline enum xp_retval
+xpc_partid_to_nasids(short partid, void *nasids)
+{
+	if (!xpc_interface.partid_to_nasids)
+		return xpNotLoaded;
+
+	return xpc_interface.partid_to_nasids(partid, nasids);
+}
+
+extern short xp_max_npartitions;
+extern short xp_partition_id;
+extern u8 xp_region_size;
+
+extern unsigned long (*xp_pa) (void *);
+extern unsigned long (*xp_socket_pa) (unsigned long);
+extern enum xp_retval (*xp_remote_memcpy) (unsigned long, const unsigned long,
+		       size_t);
+extern int (*xp_cpu_to_nasid) (int);
+extern enum xp_retval (*xp_expand_memprotect) (unsigned long, unsigned long);
+extern enum xp_retval (*xp_restrict_memprotect) (unsigned long, unsigned long);
+
+extern u64 xp_nofault_PIOR_target;
+extern int xp_nofault_PIOR(void *);
+extern int xp_error_PIOR(void);
+
+extern struct device *xp;
+extern enum xp_retval xp_init_sn2(void);
+extern enum xp_retval xp_init_uv(void);
+extern void xp_exit_sn2(void);
+extern void xp_exit_uv(void);
+
+#endif /* _DRIVERS_MISC_SGIXP_XP_H */
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
new file mode 100644
index 000000000..6d7f557fd
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -0,0 +1,264 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition (XP) base.
+ *
+ *	XP provides a base from which its users can interact
+ *	with XPC, yet not be dependent on XPC.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include "xp.h"
+
+/* define the XP debug device structures to be used with dev_dbg() et al */
+
+struct device_driver xp_dbg_name = {
+	.name = "xp"
+};
+
+struct device xp_dbg_subname = {
+	.init_name = "",		/* set to "" */
+	.driver = &xp_dbg_name
+};
+
+struct device *xp = &xp_dbg_subname;
+
+/* max #of partitions possible */
+short xp_max_npartitions;
+EXPORT_SYMBOL_GPL(xp_max_npartitions);
+
+short xp_partition_id;
+EXPORT_SYMBOL_GPL(xp_partition_id);
+
+u8 xp_region_size;
+EXPORT_SYMBOL_GPL(xp_region_size);
+
+unsigned long (*xp_pa) (void *addr);
+EXPORT_SYMBOL_GPL(xp_pa);
+
+unsigned long (*xp_socket_pa) (unsigned long gpa);
+EXPORT_SYMBOL_GPL(xp_socket_pa);
+
+enum xp_retval (*xp_remote_memcpy) (unsigned long dst_gpa,
+				    const unsigned long src_gpa, size_t len);
+EXPORT_SYMBOL_GPL(xp_remote_memcpy);
+
+int (*xp_cpu_to_nasid) (int cpuid);
+EXPORT_SYMBOL_GPL(xp_cpu_to_nasid);
+
+enum xp_retval (*xp_expand_memprotect) (unsigned long phys_addr,
+					unsigned long size);
+EXPORT_SYMBOL_GPL(xp_expand_memprotect);
+enum xp_retval (*xp_restrict_memprotect) (unsigned long phys_addr,
+					  unsigned long size);
+EXPORT_SYMBOL_GPL(xp_restrict_memprotect);
+
+/*
+ * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
+ * users of XPC.
+ */
+struct xpc_registration xpc_registrations[XPC_MAX_NCHANNELS];
+EXPORT_SYMBOL_GPL(xpc_registrations);
+
+/*
+ * Initialize the XPC interface to NULL to indicate that XPC isn't loaded.
+ */
+struct xpc_interface xpc_interface = { };
+EXPORT_SYMBOL_GPL(xpc_interface);
+
+/*
+ * XPC calls this when it (the XPC module) has been loaded.
+ */
+void
+xpc_set_interface(void (*connect) (int),
+		  void (*disconnect) (int),
+		  enum xp_retval (*send) (short, int, u32, void *, u16),
+		  enum xp_retval (*send_notify) (short, int, u32, void *, u16,
+						  xpc_notify_func, void *),
+		  void (*received) (short, int, void *),
+		  enum xp_retval (*partid_to_nasids) (short, void *))
+{
+	xpc_interface.connect = connect;
+	xpc_interface.disconnect = disconnect;
+	xpc_interface.send = send;
+	xpc_interface.send_notify = send_notify;
+	xpc_interface.received = received;
+	xpc_interface.partid_to_nasids = partid_to_nasids;
+}
+EXPORT_SYMBOL_GPL(xpc_set_interface);
+
+/*
+ * XPC calls this when it (the XPC module) is being unloaded.
+ */
+void
+xpc_clear_interface(void)
+{
+	memset(&xpc_interface, 0, sizeof(xpc_interface));
+}
+EXPORT_SYMBOL_GPL(xpc_clear_interface);
+
+/*
+ * Register for automatic establishment of a channel connection whenever
+ * a partition comes up.
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to register for connection.
+ *	func - function to call for asynchronous notification of channel
+ *	       state changes (i.e., connection, disconnection, error) and
+ *	       the arrival of incoming messages.
+ *      key - pointer to optional user-defined value that gets passed back
+ *	      to the user on any callouts made to func.
+ *	payload_size - size in bytes of the XPC message's payload area which
+ *		       contains a user-defined message. The user should make
+ *		       this large enough to hold their largest message.
+ *	nentries - max #of XPC message entries a message queue can contain.
+ *		   The actual number, which is determined when a connection
+ * 		   is established and may be less then requested, will be
+ *		   passed to the user via the xpConnected callout.
+ *	assigned_limit - max number of kthreads allowed to be processing
+ * 			 messages (per connection) at any given instant.
+ *	idle_limit - max number of kthreads allowed to be idle at any given
+ * 		     instant.
+ */
+enum xp_retval
+xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
+	    u16 nentries, u32 assigned_limit, u32 idle_limit)
+{
+	struct xpc_registration *registration;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+	DBUG_ON(payload_size == 0 || nentries == 0);
+	DBUG_ON(func == NULL);
+	DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
+
+	if (XPC_MSG_SIZE(payload_size) > XPC_MSG_MAX_SIZE)
+		return xpPayloadTooBig;
+
+	registration = &xpc_registrations[ch_number];
+
+	if (mutex_lock_interruptible(&registration->mutex) != 0)
+		return xpInterrupted;
+
+	/* if XPC_CHANNEL_REGISTERED(ch_number) */
+	if (registration->func != NULL) {
+		mutex_unlock(&registration->mutex);
+		return xpAlreadyRegistered;
+	}
+
+	/* register the channel for connection */
+	registration->entry_size = XPC_MSG_SIZE(payload_size);
+	registration->nentries = nentries;
+	registration->assigned_limit = assigned_limit;
+	registration->idle_limit = idle_limit;
+	registration->key = key;
+	registration->func = func;
+
+	mutex_unlock(&registration->mutex);
+
+	if (xpc_interface.connect)
+		xpc_interface.connect(ch_number);
+
+	return xpSuccess;
+}
+EXPORT_SYMBOL_GPL(xpc_connect);
+
+/*
+ * Remove the registration for automatic connection of the specified channel
+ * when a partition comes up.
+ *
+ * Before returning this xpc_disconnect() will wait for all connections on the
+ * specified channel have been closed/torndown. So the caller can be assured
+ * that they will not be receiving any more callouts from XPC to their
+ * function registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to unregister.
+ */
+void
+xpc_disconnect(int ch_number)
+{
+	struct xpc_registration *registration;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+
+	registration = &xpc_registrations[ch_number];
+
+	/*
+	 * We've decided not to make this a down_interruptible(), since we
+	 * figured XPC's users will just turn around and call xpc_disconnect()
+	 * again anyways, so we might as well wait, if need be.
+	 */
+	mutex_lock(&registration->mutex);
+
+	/* if !XPC_CHANNEL_REGISTERED(ch_number) */
+	if (registration->func == NULL) {
+		mutex_unlock(&registration->mutex);
+		return;
+	}
+
+	/* remove the connection registration for the specified channel */
+	registration->func = NULL;
+	registration->key = NULL;
+	registration->nentries = 0;
+	registration->entry_size = 0;
+	registration->assigned_limit = 0;
+	registration->idle_limit = 0;
+
+	if (xpc_interface.disconnect)
+		xpc_interface.disconnect(ch_number);
+
+	mutex_unlock(&registration->mutex);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(xpc_disconnect);
+
+int __init
+xp_init(void)
+{
+	enum xp_retval ret;
+	int ch_number;
+
+	/* initialize the connection registration mutex */
+	for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++)
+		mutex_init(&xpc_registrations[ch_number].mutex);
+
+	if (is_shub())
+		ret = xp_init_sn2();
+	else if (is_uv())
+		ret = xp_init_uv();
+	else
+		ret = 0;
+
+	if (ret != xpSuccess)
+		return ret;
+
+	return 0;
+}
+
+module_init(xp_init);
+
+void __exit
+xp_exit(void)
+{
+	if (is_shub())
+		xp_exit_sn2();
+	else if (is_uv())
+		xp_exit_uv();
+}
+
+module_exit(xp_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition (XP) base");
+MODULE_LICENSE("GPL");
diff --git a/drivers/misc/sgi-xp/xp_nofault.S b/drivers/misc/sgi-xp/xp_nofault.S
new file mode 100644
index 000000000..e38d43319
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_nofault.S
@@ -0,0 +1,35 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * The xp_nofault_PIOR function takes a pointer to a remote PIO register
+ * and attempts to load and consume a value from it.  This function
+ * will be registered as a nofault code block.  In the event that the
+ * PIO read fails, the MCA handler will force the error to look
+ * corrected and vector to the xp_error_PIOR which will return an error.
+ *
+ * The definition of "consumption" and the time it takes for an MCA
+ * to surface is processor implementation specific.  This code
+ * is sufficient on Itanium through the Montvale processor family.
+ * It may need to be adjusted for future processor implementations.
+ *
+ *	extern int xp_nofault_PIOR(void *remote_register);
+ */
+
+	.global xp_nofault_PIOR
+xp_nofault_PIOR:
+	mov	r8=r0			// Stage a success return value
+	ld8.acq	r9=[r32];;		// PIO Read the specified register
+	adds	r9=1,r9;;		// Add to force consumption
+	srlz.i;;			// Allow time for MCA to surface
+	br.ret.sptk.many b0;;		// Return success
+
+	.global xp_error_PIOR
+xp_error_PIOR:
+	mov	r8=1			// Return value of 1
+	br.ret.sptk.many b0;;		// Return failure
diff --git a/drivers/misc/sgi-xp/xp_sn2.c b/drivers/misc/sgi-xp/xp_sn2.c
new file mode 100644
index 000000000..d8e463f87
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_sn2.c
@@ -0,0 +1,190 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition (XP) sn2-based functions.
+ *
+ *      Architecture specific implementation of common functions.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <asm/sn/bte.h>
+#include <asm/sn/sn_sal.h>
+#include "xp.h"
+
+/*
+ * The export of xp_nofault_PIOR needs to happen here since it is defined
+ * in drivers/misc/sgi-xp/xp_nofault.S. The target of the nofault read is
+ * defined here.
+ */
+EXPORT_SYMBOL_GPL(xp_nofault_PIOR);
+
+u64 xp_nofault_PIOR_target;
+EXPORT_SYMBOL_GPL(xp_nofault_PIOR_target);
+
+/*
+ * Register a nofault code region which performs a cross-partition PIO read.
+ * If the PIO read times out, the MCA handler will consume the error and
+ * return to a kernel-provided instruction to indicate an error. This PIO read
+ * exists because it is guaranteed to timeout if the destination is down
+ * (amo operations do not timeout on at least some CPUs on Shubs <= v1.2,
+ * which unfortunately we have to work around).
+ */
+static enum xp_retval
+xp_register_nofault_code_sn2(void)
+{
+	int ret;
+	u64 func_addr;
+	u64 err_func_addr;
+
+	func_addr = *(u64 *)xp_nofault_PIOR;
+	err_func_addr = *(u64 *)xp_error_PIOR;
+	ret = sn_register_nofault_code(func_addr, err_func_addr, err_func_addr,
+				       1, 1);
+	if (ret != 0) {
+		dev_err(xp, "can't register nofault code, error=%d\n", ret);
+		return xpSalError;
+	}
+	/*
+	 * Setup the nofault PIO read target. (There is no special reason why
+	 * SH_IPI_ACCESS was selected.)
+	 */
+	if (is_shub1())
+		xp_nofault_PIOR_target = SH1_IPI_ACCESS;
+	else if (is_shub2())
+		xp_nofault_PIOR_target = SH2_IPI_ACCESS0;
+
+	return xpSuccess;
+}
+
+static void
+xp_unregister_nofault_code_sn2(void)
+{
+	u64 func_addr = *(u64 *)xp_nofault_PIOR;
+	u64 err_func_addr = *(u64 *)xp_error_PIOR;
+
+	/* unregister the PIO read nofault code region */
+	(void)sn_register_nofault_code(func_addr, err_func_addr,
+				       err_func_addr, 1, 0);
+}
+
+/*
+ * Convert a virtual memory address to a physical memory address.
+ */
+static unsigned long
+xp_pa_sn2(void *addr)
+{
+	return __pa(addr);
+}
+
+/*
+ * Convert a global physical to a socket physical address.
+ */
+static unsigned long
+xp_socket_pa_sn2(unsigned long gpa)
+{
+	return gpa;
+}
+
+/*
+ * Wrapper for bte_copy().
+ *
+ *	dst_pa - physical address of the destination of the transfer.
+ *	src_pa - physical address of the source of the transfer.
+ *	len - number of bytes to transfer from source to destination.
+ *
+ * Note: xp_remote_memcpy_sn2() should never be called while holding a spinlock.
+ */
+static enum xp_retval
+xp_remote_memcpy_sn2(unsigned long dst_pa, const unsigned long src_pa,
+		     size_t len)
+{
+	bte_result_t ret;
+
+	ret = bte_copy(src_pa, dst_pa, len, (BTE_NOTIFY | BTE_WACQUIRE), NULL);
+	if (ret == BTE_SUCCESS)
+		return xpSuccess;
+
+	if (is_shub2()) {
+		dev_err(xp, "bte_copy() on shub2 failed, error=0x%x dst_pa="
+			"0x%016lx src_pa=0x%016lx len=%ld\\n", ret, dst_pa,
+			src_pa, len);
+	} else {
+		dev_err(xp, "bte_copy() failed, error=%d dst_pa=0x%016lx "
+			"src_pa=0x%016lx len=%ld\\n", ret, dst_pa, src_pa, len);
+	}
+
+	return xpBteCopyError;
+}
+
+static int
+xp_cpu_to_nasid_sn2(int cpuid)
+{
+	return cpuid_to_nasid(cpuid);
+}
+
+static enum xp_retval
+xp_expand_memprotect_sn2(unsigned long phys_addr, unsigned long size)
+{
+	u64 nasid_array = 0;
+	int ret;
+
+	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1,
+				   &nasid_array);
+	if (ret != 0) {
+		dev_err(xp, "sn_change_memprotect(,, "
+			"SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret);
+		return xpSalError;
+	}
+	return xpSuccess;
+}
+
+static enum xp_retval
+xp_restrict_memprotect_sn2(unsigned long phys_addr, unsigned long size)
+{
+	u64 nasid_array = 0;
+	int ret;
+
+	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0,
+				   &nasid_array);
+	if (ret != 0) {
+		dev_err(xp, "sn_change_memprotect(,, "
+			"SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret);
+		return xpSalError;
+	}
+	return xpSuccess;
+}
+
+enum xp_retval
+xp_init_sn2(void)
+{
+	BUG_ON(!is_shub());
+
+	xp_max_npartitions = XP_MAX_NPARTITIONS_SN2;
+	xp_partition_id = sn_partition_id;
+	xp_region_size = sn_region_size;
+
+	xp_pa = xp_pa_sn2;
+	xp_socket_pa = xp_socket_pa_sn2;
+	xp_remote_memcpy = xp_remote_memcpy_sn2;
+	xp_cpu_to_nasid = xp_cpu_to_nasid_sn2;
+	xp_expand_memprotect = xp_expand_memprotect_sn2;
+	xp_restrict_memprotect = xp_restrict_memprotect_sn2;
+
+	return xp_register_nofault_code_sn2();
+}
+
+void
+xp_exit_sn2(void)
+{
+	BUG_ON(!is_shub());
+
+	xp_unregister_nofault_code_sn2();
+}
+
diff --git a/drivers/misc/sgi-xp/xp_uv.c b/drivers/misc/sgi-xp/xp_uv.c
new file mode 100644
index 000000000..a0d093274
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_uv.c
@@ -0,0 +1,171 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition (XP) uv-based functions.
+ *
+ *      Architecture specific implementation of common functions.
+ *
+ */
+
+#include <linux/device.h>
+#include <asm/uv/uv_hub.h>
+#if defined CONFIG_X86_64
+#include <asm/uv/bios.h>
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+#include <asm/sn/sn_sal.h>
+#endif
+#include "../sgi-gru/grukservices.h"
+#include "xp.h"
+
+/*
+ * Convert a virtual memory address to a physical memory address.
+ */
+static unsigned long
+xp_pa_uv(void *addr)
+{
+	return uv_gpa(addr);
+}
+
+/*
+ * Convert a global physical to socket physical address.
+ */
+static unsigned long
+xp_socket_pa_uv(unsigned long gpa)
+{
+	return uv_gpa_to_soc_phys_ram(gpa);
+}
+
+static enum xp_retval
+xp_remote_mmr_read(unsigned long dst_gpa, const unsigned long src_gpa,
+		   size_t len)
+{
+	int ret;
+	unsigned long *dst_va = __va(uv_gpa_to_soc_phys_ram(dst_gpa));
+
+	BUG_ON(!uv_gpa_in_mmr_space(src_gpa));
+	BUG_ON(len != 8);
+
+	ret = gru_read_gpa(dst_va, src_gpa);
+	if (ret == 0)
+		return xpSuccess;
+
+	dev_err(xp, "gru_read_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx "
+		"len=%ld\n", dst_gpa, src_gpa, len);
+	return xpGruCopyError;
+}
+
+
+static enum xp_retval
+xp_remote_memcpy_uv(unsigned long dst_gpa, const unsigned long src_gpa,
+		    size_t len)
+{
+	int ret;
+
+	if (uv_gpa_in_mmr_space(src_gpa))
+		return xp_remote_mmr_read(dst_gpa, src_gpa, len);
+
+	ret = gru_copy_gpa(dst_gpa, src_gpa, len);
+	if (ret == 0)
+		return xpSuccess;
+
+	dev_err(xp, "gru_copy_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx "
+		"len=%ld\n", dst_gpa, src_gpa, len);
+	return xpGruCopyError;
+}
+
+static int
+xp_cpu_to_nasid_uv(int cpuid)
+{
+	/* ??? Is this same as sn2 nasid in mach/part bitmaps set up by SAL? */
+	return UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpuid));
+}
+
+static enum xp_retval
+xp_expand_memprotect_uv(unsigned long phys_addr, unsigned long size)
+{
+	int ret;
+
+#if defined CONFIG_X86_64
+	ret = uv_bios_change_memprotect(phys_addr, size, UV_MEMPROT_ALLOW_RW);
+	if (ret != BIOS_STATUS_SUCCESS) {
+		dev_err(xp, "uv_bios_change_memprotect(,, "
+			"UV_MEMPROT_ALLOW_RW) failed, ret=%d\n", ret);
+		return xpBiosError;
+	}
+
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	u64 nasid_array;
+
+	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1,
+				   &nasid_array);
+	if (ret != 0) {
+		dev_err(xp, "sn_change_memprotect(,, "
+			"SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret);
+		return xpSalError;
+	}
+#else
+	#error not a supported configuration
+#endif
+	return xpSuccess;
+}
+
+static enum xp_retval
+xp_restrict_memprotect_uv(unsigned long phys_addr, unsigned long size)
+{
+	int ret;
+
+#if defined CONFIG_X86_64
+	ret = uv_bios_change_memprotect(phys_addr, size,
+					UV_MEMPROT_RESTRICT_ACCESS);
+	if (ret != BIOS_STATUS_SUCCESS) {
+		dev_err(xp, "uv_bios_change_memprotect(,, "
+			"UV_MEMPROT_RESTRICT_ACCESS) failed, ret=%d\n", ret);
+		return xpBiosError;
+	}
+
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	u64 nasid_array;
+
+	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0,
+				   &nasid_array);
+	if (ret != 0) {
+		dev_err(xp, "sn_change_memprotect(,, "
+			"SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret);
+		return xpSalError;
+	}
+#else
+	#error not a supported configuration
+#endif
+	return xpSuccess;
+}
+
+enum xp_retval
+xp_init_uv(void)
+{
+	BUG_ON(!is_uv());
+
+	xp_max_npartitions = XP_MAX_NPARTITIONS_UV;
+	xp_partition_id = sn_partition_id;
+	xp_region_size = sn_region_size;
+
+	xp_pa = xp_pa_uv;
+	xp_socket_pa = xp_socket_pa_uv;
+	xp_remote_memcpy = xp_remote_memcpy_uv;
+	xp_cpu_to_nasid = xp_cpu_to_nasid_uv;
+	xp_expand_memprotect = xp_expand_memprotect_uv;
+	xp_restrict_memprotect = xp_restrict_memprotect_uv;
+
+	return xpSuccess;
+}
+
+void
+xp_exit_uv(void)
+{
+	BUG_ON(!is_uv());
+}
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
new file mode 100644
index 000000000..b94d5f767
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -0,0 +1,1004 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) structures and macros.
+ */
+
+#ifndef _DRIVERS_MISC_SGIXP_XPC_H
+#define _DRIVERS_MISC_SGIXP_XPC_H
+
+#include <linux/wait.h>
+#include <linux/completion.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include "xp.h"
+
+/*
+ * XPC Version numbers consist of a major and minor number. XPC can always
+ * talk to versions with same major #, and never talk to versions with a
+ * different major #.
+ */
+#define _XPC_VERSION(_maj, _min)	(((_maj) << 4) | ((_min) & 0xf))
+#define XPC_VERSION_MAJOR(_v)		((_v) >> 4)
+#define XPC_VERSION_MINOR(_v)		((_v) & 0xf)
+
+/* define frequency of the heartbeat and frequency how often it's checked */
+#define XPC_HB_DEFAULT_INTERVAL		5	/* incr HB every x secs */
+#define XPC_HB_CHECK_DEFAULT_INTERVAL	20	/* check HB every x secs */
+
+/* define the process name of HB checker and the CPU it is pinned to */
+#define XPC_HB_CHECK_THREAD_NAME	"xpc_hb"
+#define XPC_HB_CHECK_CPU		0
+
+/* define the process name of the discovery thread */
+#define XPC_DISCOVERY_THREAD_NAME	"xpc_discovery"
+
+/*
+ * the reserved page
+ *
+ *   SAL reserves one page of memory per partition for XPC. Though a full page
+ *   in length (16384 bytes), its starting address is not page aligned, but it
+ *   is cacheline aligned. The reserved page consists of the following:
+ *
+ *   reserved page header
+ *
+ *     The first two 64-byte cachelines of the reserved page contain the
+ *     header (struct xpc_rsvd_page). Before SAL initialization has completed,
+ *     SAL has set up the following fields of the reserved page header:
+ *     SAL_signature, SAL_version, SAL_partid, and SAL_nasids_size. The
+ *     other fields are set up by XPC. (xpc_rsvd_page points to the local
+ *     partition's reserved page.)
+ *
+ *   part_nasids mask
+ *   mach_nasids mask
+ *
+ *     SAL also sets up two bitmaps (or masks), one that reflects the actual
+ *     nasids in this partition (part_nasids), and the other that reflects
+ *     the actual nasids in the entire machine (mach_nasids). We're only
+ *     interested in the even numbered nasids (which contain the processors
+ *     and/or memory), so we only need half as many bits to represent the
+ *     nasids. When mapping nasid to bit in a mask (or bit to nasid) be sure
+ *     to either divide or multiply by 2. The part_nasids mask is located
+ *     starting at the first cacheline following the reserved page header. The
+ *     mach_nasids mask follows right after the part_nasids mask. The size in
+ *     bytes of each mask is reflected by the reserved page header field
+ *     'SAL_nasids_size'. (Local partition's mask pointers are xpc_part_nasids
+ *     and xpc_mach_nasids.)
+ *
+ *   vars	(ia64-sn2 only)
+ *   vars part	(ia64-sn2 only)
+ *
+ *     Immediately following the mach_nasids mask are the XPC variables
+ *     required by other partitions. First are those that are generic to all
+ *     partitions (vars), followed on the next available cacheline by those
+ *     which are partition specific (vars part). These are setup by XPC.
+ *     (Local partition's vars pointers are xpc_vars and xpc_vars_part.)
+ *
+ * Note: Until 'ts_jiffies' is set non-zero, the partition XPC code has not been
+ *       initialized.
+ */
+struct xpc_rsvd_page {
+	u64 SAL_signature;	/* SAL: unique signature */
+	u64 SAL_version;	/* SAL: version */
+	short SAL_partid;	/* SAL: partition ID */
+	short max_npartitions;	/* value of XPC_MAX_PARTITIONS */
+	u8 version;
+	u8 pad1[3];		/* align to next u64 in 1st 64-byte cacheline */
+	unsigned long ts_jiffies; /* timestamp when rsvd pg was setup by XPC */
+	union {
+		struct {
+			unsigned long vars_pa;	/* phys addr */
+		} sn2;
+		struct {
+			unsigned long heartbeat_gpa; /* phys addr */
+			unsigned long activate_gru_mq_desc_gpa; /* phys addr */
+		} uv;
+	} sn;
+	u64 pad2[9];		/* align to last u64 in 2nd 64-byte cacheline */
+	u64 SAL_nasids_size;	/* SAL: size of each nasid mask in bytes */
+};
+
+#define XPC_RP_VERSION _XPC_VERSION(3, 0) /* version 3.0 of the reserved page */
+
+/*
+ * Define the structures by which XPC variables can be exported to other
+ * partitions. (There are two: struct xpc_vars and struct xpc_vars_part)
+ */
+
+/*
+ * The following structure describes the partition generic variables
+ * needed by other partitions in order to properly initialize.
+ *
+ * struct xpc_vars version number also applies to struct xpc_vars_part.
+ * Changes to either structure and/or related functionality should be
+ * reflected by incrementing either the major or minor version numbers
+ * of struct xpc_vars.
+ */
+struct xpc_vars_sn2 {
+	u8 version;
+	u64 heartbeat;
+	DECLARE_BITMAP(heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2);
+	u64 heartbeat_offline;	/* if 0, heartbeat should be changing */
+	int activate_IRQ_nasid;
+	int activate_IRQ_phys_cpuid;
+	unsigned long vars_part_pa;
+	unsigned long amos_page_pa;/* paddr of page of amos from MSPEC driver */
+	struct amo *amos_page;	/* vaddr of page of amos from MSPEC driver */
+};
+
+#define XPC_V_VERSION _XPC_VERSION(3, 1)    /* version 3.1 of the cross vars */
+
+/*
+ * The following structure describes the per partition specific variables.
+ *
+ * An array of these structures, one per partition, will be defined. As a
+ * partition becomes active XPC will copy the array entry corresponding to
+ * itself from that partition. It is desirable that the size of this structure
+ * evenly divides into a 128-byte cacheline, such that none of the entries in
+ * this array crosses a 128-byte cacheline boundary. As it is now, each entry
+ * occupies 64-bytes.
+ */
+struct xpc_vars_part_sn2 {
+	u64 magic;
+
+	unsigned long openclose_args_pa; /* phys addr of open and close args */
+	unsigned long GPs_pa;	/* physical address of Get/Put values */
+
+	unsigned long chctl_amo_pa; /* physical address of chctl flags' amo */
+
+	int notify_IRQ_nasid;	/* nasid of where to send notify IRQs */
+	int notify_IRQ_phys_cpuid;	/* CPUID of where to send notify IRQs */
+
+	u8 nchannels;		/* #of defined channels supported */
+
+	u8 reserved[23];	/* pad to a full 64 bytes */
+};
+
+/*
+ * The vars_part MAGIC numbers play a part in the first contact protocol.
+ *
+ * MAGIC1 indicates that the per partition specific variables for a remote
+ * partition have been initialized by this partition.
+ *
+ * MAGIC2 indicates that this partition has pulled the remote partititions
+ * per partition variables that pertain to this partition.
+ */
+#define XPC_VP_MAGIC1_SN2 0x0053524156435058L /* 'XPCVARS\0'L (little endian) */
+#define XPC_VP_MAGIC2_SN2 0x0073726176435058L /* 'XPCvars\0'L (little endian) */
+
+/* the reserved page sizes and offsets */
+
+#define XPC_RP_HEADER_SIZE	L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
+#define XPC_RP_VARS_SIZE	L1_CACHE_ALIGN(sizeof(struct xpc_vars_sn2))
+
+#define XPC_RP_PART_NASIDS(_rp) ((unsigned long *)((u8 *)(_rp) + \
+				 XPC_RP_HEADER_SIZE))
+#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + \
+				 xpc_nasid_mask_nlongs)
+#define XPC_RP_VARS(_rp)	((struct xpc_vars_sn2 *) \
+				 (XPC_RP_MACH_NASIDS(_rp) + \
+				  xpc_nasid_mask_nlongs))
+
+
+/*
+ * The following structure describes the partition's heartbeat info which
+ * will be periodically read by other partitions to determine whether this
+ * XPC is still 'alive'.
+ */
+struct xpc_heartbeat_uv {
+	unsigned long value;
+	unsigned long offline;	/* if 0, heartbeat should be changing */
+};
+
+/*
+ * Info pertinent to a GRU message queue using a watch list for irq generation.
+ */
+struct xpc_gru_mq_uv {
+	void *address;		/* address of GRU message queue */
+	unsigned int order;	/* size of GRU message queue as a power of 2 */
+	int irq;		/* irq raised when message is received in mq */
+	int mmr_blade;		/* blade where watchlist was allocated from */
+	unsigned long mmr_offset; /* offset of irq mmr located on mmr_blade */
+	unsigned long mmr_value; /* value of irq mmr located on mmr_blade */
+	int watchlist_num;	/* number of watchlist allocatd by BIOS */
+	void *gru_mq_desc;	/* opaque structure used by the GRU driver */
+};
+
+/*
+ * The activate_mq is used to send/receive GRU messages that affect XPC's
+ * partition active state and channel state. This is uv only.
+ */
+struct xpc_activate_mq_msghdr_uv {
+	unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */
+	short partid;		/* sender's partid */
+	u8 act_state;		/* sender's act_state at time msg sent */
+	u8 type;		/* message's type */
+	unsigned long rp_ts_jiffies; /* timestamp of sender's rp setup by XPC */
+};
+
+/* activate_mq defined message types */
+#define XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV		0
+
+#define XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV		1
+#define XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV		2
+
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV	3
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV		4
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV	5
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV		6
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV	7
+
+#define XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV		8
+#define XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV		9
+
+struct xpc_activate_mq_msg_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+};
+
+struct xpc_activate_mq_msg_activate_req_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	unsigned long rp_gpa;
+	unsigned long heartbeat_gpa;
+	unsigned long activate_gru_mq_desc_gpa;
+};
+
+struct xpc_activate_mq_msg_deactivate_req_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	enum xp_retval reason;
+};
+
+struct xpc_activate_mq_msg_chctl_closerequest_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	short ch_number;
+	enum xp_retval reason;
+};
+
+struct xpc_activate_mq_msg_chctl_closereply_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	short ch_number;
+};
+
+struct xpc_activate_mq_msg_chctl_openrequest_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	short ch_number;
+	short entry_size;	/* size of notify_mq's GRU messages */
+	short local_nentries;	/* ??? Is this needed? What is? */
+};
+
+struct xpc_activate_mq_msg_chctl_openreply_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	short ch_number;
+	short remote_nentries;	/* ??? Is this needed? What is? */
+	short local_nentries;	/* ??? Is this needed? What is? */
+	unsigned long notify_gru_mq_desc_gpa;
+};
+
+struct xpc_activate_mq_msg_chctl_opencomplete_uv {
+	struct xpc_activate_mq_msghdr_uv hdr;
+	short ch_number;
+};
+
+/*
+ * Functions registered by add_timer() or called by kernel_thread() only
+ * allow for a single 64-bit argument. The following macros can be used to
+ * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
+ * the passed argument.
+ */
+#define XPC_PACK_ARGS(_arg1, _arg2) \
+			((((u64)_arg1) & 0xffffffff) | \
+			((((u64)_arg2) & 0xffffffff) << 32))
+
+#define XPC_UNPACK_ARG1(_args)	(((u64)_args) & 0xffffffff)
+#define XPC_UNPACK_ARG2(_args)	((((u64)_args) >> 32) & 0xffffffff)
+
+/*
+ * Define a Get/Put value pair (pointers) used with a message queue.
+ */
+struct xpc_gp_sn2 {
+	s64 get;		/* Get value */
+	s64 put;		/* Put value */
+};
+
+#define XPC_GP_SIZE \
+		L1_CACHE_ALIGN(sizeof(struct xpc_gp_sn2) * XPC_MAX_NCHANNELS)
+
+/*
+ * Define a structure that contains arguments associated with opening and
+ * closing a channel.
+ */
+struct xpc_openclose_args {
+	u16 reason;		/* reason why channel is closing */
+	u16 entry_size;		/* sizeof each message entry */
+	u16 remote_nentries;	/* #of message entries in remote msg queue */
+	u16 local_nentries;	/* #of message entries in local msg queue */
+	unsigned long local_msgqueue_pa; /* phys addr of local message queue */
+};
+
+#define XPC_OPENCLOSE_ARGS_SIZE \
+	      L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * \
+	      XPC_MAX_NCHANNELS)
+
+
+/*
+ * Structures to define a fifo singly-linked list.
+ */
+
+struct xpc_fifo_entry_uv {
+	struct xpc_fifo_entry_uv *next;
+};
+
+struct xpc_fifo_head_uv {
+	struct xpc_fifo_entry_uv *first;
+	struct xpc_fifo_entry_uv *last;
+	spinlock_t lock;
+	int n_entries;
+};
+
+/*
+ * Define a sn2 styled message.
+ *
+ * A user-defined message resides in the payload area. The max size of the
+ * payload is defined by the user via xpc_connect().
+ *
+ * The size of a message entry (within a message queue) must be a 128-byte
+ * cacheline sized multiple in order to facilitate the BTE transfer of messages
+ * from one message queue to another.
+ */
+struct xpc_msg_sn2 {
+	u8 flags;		/* FOR XPC INTERNAL USE ONLY */
+	u8 reserved[7];		/* FOR XPC INTERNAL USE ONLY */
+	s64 number;		/* FOR XPC INTERNAL USE ONLY */
+
+	u64 payload;		/* user defined portion of message */
+};
+
+/* struct xpc_msg_sn2 flags */
+
+#define	XPC_M_SN2_DONE		0x01	/* msg has been received/consumed */
+#define	XPC_M_SN2_READY		0x02	/* msg is ready to be sent */
+#define	XPC_M_SN2_INTERRUPT	0x04	/* send interrupt when msg consumed */
+
+/*
+ * The format of a uv XPC notify_mq GRU message is as follows:
+ *
+ * A user-defined message resides in the payload area. The max size of the
+ * payload is defined by the user via xpc_connect().
+ *
+ * The size of a message (payload and header) sent via the GRU must be either 1
+ * or 2 GRU_CACHE_LINE_BYTES in length.
+ */
+
+struct xpc_notify_mq_msghdr_uv {
+	union {
+		unsigned int gru_msg_hdr;	/* FOR GRU INTERNAL USE ONLY */
+		struct xpc_fifo_entry_uv next;	/* FOR XPC INTERNAL USE ONLY */
+	} u;
+	short partid;		/* FOR XPC INTERNAL USE ONLY */
+	u8 ch_number;		/* FOR XPC INTERNAL USE ONLY */
+	u8 size;		/* FOR XPC INTERNAL USE ONLY */
+	unsigned int msg_slot_number;	/* FOR XPC INTERNAL USE ONLY */
+};
+
+struct xpc_notify_mq_msg_uv {
+	struct xpc_notify_mq_msghdr_uv hdr;
+	unsigned long payload;
+};
+
+/*
+ * Define sn2's notify entry.
+ *
+ * This is used to notify a message's sender that their message was received
+ * and consumed by the intended recipient.
+ */
+struct xpc_notify_sn2 {
+	u8 type;		/* type of notification */
+
+	/* the following two fields are only used if type == XPC_N_CALL */
+	xpc_notify_func func;	/* user's notify function */
+	void *key;		/* pointer to user's key */
+};
+
+/* struct xpc_notify_sn2 type of notification */
+
+#define	XPC_N_CALL	0x01	/* notify function provided by user */
+
+/*
+ * Define uv's version of the notify entry. It additionally is used to allocate
+ * a msg slot on the remote partition into which is copied a sent message.
+ */
+struct xpc_send_msg_slot_uv {
+	struct xpc_fifo_entry_uv next;
+	unsigned int msg_slot_number;
+	xpc_notify_func func;	/* user's notify function */
+	void *key;		/* pointer to user's key */
+};
+
+/*
+ * Define the structure that manages all the stuff required by a channel. In
+ * particular, they are used to manage the messages sent across the channel.
+ *
+ * This structure is private to a partition, and is NOT shared across the
+ * partition boundary.
+ *
+ * There is an array of these structures for each remote partition. It is
+ * allocated at the time a partition becomes active. The array contains one
+ * of these structures for each potential channel connection to that partition.
+ */
+
+/*
+ * The following is sn2 only.
+ *
+ * Each channel structure manages two message queues (circular buffers).
+ * They are allocated at the time a channel connection is made. One of
+ * these message queues (local_msgqueue) holds the locally created messages
+ * that are destined for the remote partition. The other of these message
+ * queues (remote_msgqueue) is a locally cached copy of the remote partition's
+ * own local_msgqueue.
+ *
+ * The following is a description of the Get/Put pointers used to manage these
+ * two message queues. Consider the local_msgqueue to be on one partition
+ * and the remote_msgqueue to be its cached copy on another partition. A
+ * description of what each of the lettered areas contains is included.
+ *
+ *
+ *                     local_msgqueue      remote_msgqueue
+ *
+ *                        |/////////|      |/////////|
+ *    w_remote_GP.get --> +---------+      |/////////|
+ *                        |    F    |      |/////////|
+ *     remote_GP.get  --> +---------+      +---------+ <-- local_GP->get
+ *                        |         |      |         |
+ *                        |         |      |    E    |
+ *                        |         |      |         |
+ *                        |         |      +---------+ <-- w_local_GP.get
+ *                        |    B    |      |/////////|
+ *                        |         |      |////D////|
+ *                        |         |      |/////////|
+ *                        |         |      +---------+ <-- w_remote_GP.put
+ *                        |         |      |////C////|
+ *      local_GP->put --> +---------+      +---------+ <-- remote_GP.put
+ *                        |         |      |/////////|
+ *                        |    A    |      |/////////|
+ *                        |         |      |/////////|
+ *     w_local_GP.put --> +---------+      |/////////|
+ *                        |/////////|      |/////////|
+ *
+ *
+ *	    ( remote_GP.[get|put] are cached copies of the remote
+ *	      partition's local_GP->[get|put], and thus their values can
+ *	      lag behind their counterparts on the remote partition. )
+ *
+ *
+ *  A - Messages that have been allocated, but have not yet been sent to the
+ *	remote partition.
+ *
+ *  B - Messages that have been sent, but have not yet been acknowledged by the
+ *      remote partition as having been received.
+ *
+ *  C - Area that needs to be prepared for the copying of sent messages, by
+ *	the clearing of the message flags of any previously received messages.
+ *
+ *  D - Area into which sent messages are to be copied from the remote
+ *	partition's local_msgqueue and then delivered to their intended
+ *	recipients. [ To allow for a multi-message copy, another pointer
+ *	(next_msg_to_pull) has been added to keep track of the next message
+ *	number needing to be copied (pulled). It chases after w_remote_GP.put.
+ *	Any messages lying between w_local_GP.get and next_msg_to_pull have
+ *	been copied and are ready to be delivered. ]
+ *
+ *  E - Messages that have been copied and delivered, but have not yet been
+ *	acknowledged by the recipient as having been received.
+ *
+ *  F - Messages that have been acknowledged, but XPC has not yet notified the
+ *	sender that the message was received by its intended recipient.
+ *	This is also an area that needs to be prepared for the allocating of
+ *	new messages, by the clearing of the message flags of the acknowledged
+ *	messages.
+ */
+
+struct xpc_channel_sn2 {
+	struct xpc_openclose_args *local_openclose_args; /* args passed on */
+					     /* opening or closing of channel */
+
+	void *local_msgqueue_base;	/* base address of kmalloc'd space */
+	struct xpc_msg_sn2 *local_msgqueue;	/* local message queue */
+	void *remote_msgqueue_base;	/* base address of kmalloc'd space */
+	struct xpc_msg_sn2 *remote_msgqueue; /* cached copy of remote */
+					   /* partition's local message queue */
+	unsigned long remote_msgqueue_pa; /* phys addr of remote partition's */
+					  /* local message queue */
+
+	struct xpc_notify_sn2 *notify_queue;/* notify queue for messages sent */
+
+	/* various flavors of local and remote Get/Put values */
+
+	struct xpc_gp_sn2 *local_GP;	/* local Get/Put values */
+	struct xpc_gp_sn2 remote_GP;	/* remote Get/Put values */
+	struct xpc_gp_sn2 w_local_GP;	/* working local Get/Put values */
+	struct xpc_gp_sn2 w_remote_GP;	/* working remote Get/Put values */
+	s64 next_msg_to_pull;	/* Put value of next msg to pull */
+
+	struct mutex msg_to_pull_mutex;	/* next msg to pull serialization */
+};
+
+struct xpc_channel_uv {
+	void *cached_notify_gru_mq_desc; /* remote partition's notify mq's */
+					 /* gru mq descriptor */
+
+	struct xpc_send_msg_slot_uv *send_msg_slots;
+	void *recv_msg_slots;	/* each slot will hold a xpc_notify_mq_msg_uv */
+				/* structure plus the user's payload */
+
+	struct xpc_fifo_head_uv msg_slot_free_list;
+	struct xpc_fifo_head_uv recv_msg_list;	/* deliverable payloads */
+};
+
+struct xpc_channel {
+	short partid;		/* ID of remote partition connected */
+	spinlock_t lock;	/* lock for updating this structure */
+	unsigned int flags;	/* general flags */
+
+	enum xp_retval reason;	/* reason why channel is disconnect'g */
+	int reason_line;	/* line# disconnect initiated from */
+
+	u16 number;		/* channel # */
+
+	u16 entry_size;		/* sizeof each msg entry */
+	u16 local_nentries;	/* #of msg entries in local msg queue */
+	u16 remote_nentries;	/* #of msg entries in remote msg queue */
+
+	atomic_t references;	/* #of external references to queues */
+
+	atomic_t n_on_msg_allocate_wq;	/* #on msg allocation wait queue */
+	wait_queue_head_t msg_allocate_wq;	/* msg allocation wait queue */
+
+	u8 delayed_chctl_flags;	/* chctl flags received, but delayed */
+				/* action until channel disconnected */
+
+	atomic_t n_to_notify;	/* #of msg senders to notify */
+
+	xpc_channel_func func;	/* user's channel function */
+	void *key;		/* pointer to user's key */
+
+	struct completion wdisconnect_wait;    /* wait for channel disconnect */
+
+	/* kthread management related fields */
+
+	atomic_t kthreads_assigned;	/* #of kthreads assigned to channel */
+	u32 kthreads_assigned_limit;	/* limit on #of kthreads assigned */
+	atomic_t kthreads_idle;	/* #of kthreads idle waiting for work */
+	u32 kthreads_idle_limit;	/* limit on #of kthreads idle */
+	atomic_t kthreads_active;	/* #of kthreads actively working */
+
+	wait_queue_head_t idle_wq;	/* idle kthread wait queue */
+
+	union {
+		struct xpc_channel_sn2 sn2;
+		struct xpc_channel_uv uv;
+	} sn;
+
+} ____cacheline_aligned;
+
+/* struct xpc_channel flags */
+
+#define	XPC_C_WASCONNECTED	0x00000001	/* channel was connected */
+
+#define XPC_C_ROPENCOMPLETE	0x00000002    /* remote open channel complete */
+#define XPC_C_OPENCOMPLETE	0x00000004     /* local open channel complete */
+#define	XPC_C_ROPENREPLY	0x00000008	/* remote open channel reply */
+#define	XPC_C_OPENREPLY		0x00000010	/* local open channel reply */
+#define	XPC_C_ROPENREQUEST	0x00000020     /* remote open channel request */
+#define	XPC_C_OPENREQUEST	0x00000040	/* local open channel request */
+
+#define	XPC_C_SETUP		0x00000080 /* channel's msgqueues are alloc'd */
+#define	XPC_C_CONNECTEDCALLOUT	0x00000100     /* connected callout initiated */
+#define	XPC_C_CONNECTEDCALLOUT_MADE \
+				0x00000200     /* connected callout completed */
+#define	XPC_C_CONNECTED		0x00000400	/* local channel is connected */
+#define	XPC_C_CONNECTING	0x00000800	/* channel is being connected */
+
+#define	XPC_C_RCLOSEREPLY	0x00001000	/* remote close channel reply */
+#define	XPC_C_CLOSEREPLY	0x00002000	/* local close channel reply */
+#define	XPC_C_RCLOSEREQUEST	0x00004000    /* remote close channel request */
+#define	XPC_C_CLOSEREQUEST	0x00008000     /* local close channel request */
+
+#define	XPC_C_DISCONNECTED	0x00010000	/* channel is disconnected */
+#define	XPC_C_DISCONNECTING	0x00020000   /* channel is being disconnected */
+#define	XPC_C_DISCONNECTINGCALLOUT \
+				0x00040000 /* disconnecting callout initiated */
+#define	XPC_C_DISCONNECTINGCALLOUT_MADE \
+				0x00080000 /* disconnecting callout completed */
+#define	XPC_C_WDISCONNECT	0x00100000  /* waiting for channel disconnect */
+
+/*
+ * The channel control flags (chctl) union consists of a 64-bit variable which
+ * is divided up into eight bytes, ordered from right to left. Byte zero
+ * pertains to channel 0, byte one to channel 1, and so on. Each channel's byte
+ * can have one or more of the chctl flags set in it.
+ */
+
+union xpc_channel_ctl_flags {
+	u64 all_flags;
+	u8 flags[XPC_MAX_NCHANNELS];
+};
+
+/* chctl flags */
+#define	XPC_CHCTL_CLOSEREQUEST	0x01
+#define	XPC_CHCTL_CLOSEREPLY	0x02
+#define	XPC_CHCTL_OPENREQUEST	0x04
+#define	XPC_CHCTL_OPENREPLY	0x08
+#define XPC_CHCTL_OPENCOMPLETE	0x10
+#define	XPC_CHCTL_MSGREQUEST	0x20
+
+#define XPC_OPENCLOSE_CHCTL_FLAGS \
+			(XPC_CHCTL_CLOSEREQUEST | XPC_CHCTL_CLOSEREPLY | \
+			 XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY | \
+			 XPC_CHCTL_OPENCOMPLETE)
+#define XPC_MSG_CHCTL_FLAGS	XPC_CHCTL_MSGREQUEST
+
+static inline int
+xpc_any_openclose_chctl_flags_set(union xpc_channel_ctl_flags *chctl)
+{
+	int ch_number;
+
+	for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) {
+		if (chctl->flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS)
+			return 1;
+	}
+	return 0;
+}
+
+static inline int
+xpc_any_msg_chctl_flags_set(union xpc_channel_ctl_flags *chctl)
+{
+	int ch_number;
+
+	for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) {
+		if (chctl->flags[ch_number] & XPC_MSG_CHCTL_FLAGS)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Manage channels on a partition basis. There is one of these structures
+ * for each partition (a partition will never utilize the structure that
+ * represents itself).
+ */
+
+struct xpc_partition_sn2 {
+	unsigned long remote_amos_page_pa; /* paddr of partition's amos page */
+	int activate_IRQ_nasid;	/* active partition's act/deact nasid */
+	int activate_IRQ_phys_cpuid;	/* active part's act/deact phys cpuid */
+
+	unsigned long remote_vars_pa;	/* phys addr of partition's vars */
+	unsigned long remote_vars_part_pa; /* paddr of partition's vars part */
+	u8 remote_vars_version;	/* version# of partition's vars */
+
+	void *local_GPs_base;	/* base address of kmalloc'd space */
+	struct xpc_gp_sn2 *local_GPs;	/* local Get/Put values */
+	void *remote_GPs_base;	/* base address of kmalloc'd space */
+	struct xpc_gp_sn2 *remote_GPs;	/* copy of remote partition's local */
+					/* Get/Put values */
+	unsigned long remote_GPs_pa; /* phys addr of remote partition's local */
+				     /* Get/Put values */
+
+	void *local_openclose_args_base;   /* base address of kmalloc'd space */
+	struct xpc_openclose_args *local_openclose_args;      /* local's args */
+	unsigned long remote_openclose_args_pa;	/* phys addr of remote's args */
+
+	int notify_IRQ_nasid;	/* nasid of where to send notify IRQs */
+	int notify_IRQ_phys_cpuid;	/* CPUID of where to send notify IRQs */
+	char notify_IRQ_owner[8];	/* notify IRQ's owner's name */
+
+	struct amo *remote_chctl_amo_va; /* addr of remote chctl flags' amo */
+	struct amo *local_chctl_amo_va;	/* address of chctl flags' amo */
+
+	struct timer_list dropped_notify_IRQ_timer;	/* dropped IRQ timer */
+};
+
+struct xpc_partition_uv {
+	unsigned long heartbeat_gpa; /* phys addr of partition's heartbeat */
+	struct xpc_heartbeat_uv cached_heartbeat; /* cached copy of */
+						  /* partition's heartbeat */
+	unsigned long activate_gru_mq_desc_gpa;	/* phys addr of parititon's */
+						/* activate mq's gru mq */
+						/* descriptor */
+	void *cached_activate_gru_mq_desc; /* cached copy of partition's */
+					   /* activate mq's gru mq descriptor */
+	struct mutex cached_activate_gru_mq_desc_mutex;
+	spinlock_t flags_lock;	/* protect updating of flags */
+	unsigned int flags;	/* general flags */
+	u8 remote_act_state;	/* remote partition's act_state */
+	u8 act_state_req;	/* act_state request from remote partition */
+	enum xp_retval reason;	/* reason for deactivate act_state request */
+};
+
+/* struct xpc_partition_uv flags */
+
+#define XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV	0x00000001
+#define XPC_P_ENGAGED_UV			0x00000002
+
+/* struct xpc_partition_uv act_state change requests */
+
+#define XPC_P_ASR_ACTIVATE_UV		0x01
+#define XPC_P_ASR_REACTIVATE_UV		0x02
+#define XPC_P_ASR_DEACTIVATE_UV		0x03
+
+struct xpc_partition {
+
+	/* XPC HB infrastructure */
+
+	u8 remote_rp_version;	/* version# of partition's rsvd pg */
+	unsigned long remote_rp_ts_jiffies; /* timestamp when rsvd pg setup */
+	unsigned long remote_rp_pa;	/* phys addr of partition's rsvd pg */
+	u64 last_heartbeat;	/* HB at last read */
+	u32 activate_IRQ_rcvd;	/* IRQs since activation */
+	spinlock_t act_lock;	/* protect updating of act_state */
+	u8 act_state;		/* from XPC HB viewpoint */
+	enum xp_retval reason;	/* reason partition is deactivating */
+	int reason_line;	/* line# deactivation initiated from */
+
+	unsigned long disengage_timeout;	/* timeout in jiffies */
+	struct timer_list disengage_timer;
+
+	/* XPC infrastructure referencing and teardown control */
+
+	u8 setup_state;		/* infrastructure setup state */
+	wait_queue_head_t teardown_wq;	/* kthread waiting to teardown infra */
+	atomic_t references;	/* #of references to infrastructure */
+
+	u8 nchannels;		/* #of defined channels supported */
+	atomic_t nchannels_active;  /* #of channels that are not DISCONNECTED */
+	atomic_t nchannels_engaged;  /* #of channels engaged with remote part */
+	struct xpc_channel *channels;	/* array of channel structures */
+
+	/* fields used for managing channel avialability and activity */
+
+	union xpc_channel_ctl_flags chctl; /* chctl flags yet to be processed */
+	spinlock_t chctl_lock;	/* chctl flags lock */
+
+	void *remote_openclose_args_base;  /* base address of kmalloc'd space */
+	struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
+							  /* args */
+
+	/* channel manager related fields */
+
+	atomic_t channel_mgr_requests;	/* #of requests to activate chan mgr */
+	wait_queue_head_t channel_mgr_wq;	/* channel mgr's wait queue */
+
+	union {
+		struct xpc_partition_sn2 sn2;
+		struct xpc_partition_uv uv;
+	} sn;
+
+} ____cacheline_aligned;
+
+struct xpc_arch_operations {
+	int (*setup_partitions) (void);
+	void (*teardown_partitions) (void);
+	void (*process_activate_IRQ_rcvd) (void);
+	enum xp_retval (*get_partition_rsvd_page_pa)
+		(void *, u64 *, unsigned long *, size_t *);
+	int (*setup_rsvd_page) (struct xpc_rsvd_page *);
+
+	void (*allow_hb) (short);
+	void (*disallow_hb) (short);
+	void (*disallow_all_hbs) (void);
+	void (*increment_heartbeat) (void);
+	void (*offline_heartbeat) (void);
+	void (*online_heartbeat) (void);
+	void (*heartbeat_init) (void);
+	void (*heartbeat_exit) (void);
+	enum xp_retval (*get_remote_heartbeat) (struct xpc_partition *);
+
+	void (*request_partition_activation) (struct xpc_rsvd_page *,
+						 unsigned long, int);
+	void (*request_partition_reactivation) (struct xpc_partition *);
+	void (*request_partition_deactivation) (struct xpc_partition *);
+	void (*cancel_partition_deactivation_request) (struct xpc_partition *);
+	enum xp_retval (*setup_ch_structures) (struct xpc_partition *);
+	void (*teardown_ch_structures) (struct xpc_partition *);
+
+	enum xp_retval (*make_first_contact) (struct xpc_partition *);
+
+	u64 (*get_chctl_all_flags) (struct xpc_partition *);
+	void (*send_chctl_closerequest) (struct xpc_channel *, unsigned long *);
+	void (*send_chctl_closereply) (struct xpc_channel *, unsigned long *);
+	void (*send_chctl_openrequest) (struct xpc_channel *, unsigned long *);
+	void (*send_chctl_openreply) (struct xpc_channel *, unsigned long *);
+	void (*send_chctl_opencomplete) (struct xpc_channel *, unsigned long *);
+	void (*process_msg_chctl_flags) (struct xpc_partition *, int);
+
+	enum xp_retval (*save_remote_msgqueue_pa) (struct xpc_channel *,
+						      unsigned long);
+
+	enum xp_retval (*setup_msg_structures) (struct xpc_channel *);
+	void (*teardown_msg_structures) (struct xpc_channel *);
+
+	void (*indicate_partition_engaged) (struct xpc_partition *);
+	void (*indicate_partition_disengaged) (struct xpc_partition *);
+	void (*assume_partition_disengaged) (short);
+	int (*partition_engaged) (short);
+	int (*any_partition_engaged) (void);
+
+	int (*n_of_deliverable_payloads) (struct xpc_channel *);
+	enum xp_retval (*send_payload) (struct xpc_channel *, u32, void *,
+					   u16, u8, xpc_notify_func, void *);
+	void *(*get_deliverable_payload) (struct xpc_channel *);
+	void (*received_payload) (struct xpc_channel *, void *);
+	void (*notify_senders_of_disconnect) (struct xpc_channel *);
+};
+
+/* struct xpc_partition act_state values (for XPC HB) */
+
+#define	XPC_P_AS_INACTIVE	0x00	/* partition is not active */
+#define XPC_P_AS_ACTIVATION_REQ	0x01	/* created thread to activate */
+#define XPC_P_AS_ACTIVATING	0x02	/* activation thread started */
+#define XPC_P_AS_ACTIVE		0x03	/* xpc_partition_up() was called */
+#define XPC_P_AS_DEACTIVATING	0x04	/* partition deactivation initiated */
+
+#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
+			xpc_deactivate_partition(__LINE__, (_p), (_reason))
+
+/* struct xpc_partition setup_state values */
+
+#define XPC_P_SS_UNSET		0x00	/* infrastructure was never setup */
+#define XPC_P_SS_SETUP		0x01	/* infrastructure is setup */
+#define XPC_P_SS_WTEARDOWN	0x02	/* waiting to teardown infrastructure */
+#define XPC_P_SS_TORNDOWN	0x03	/* infrastructure is torndown */
+
+/*
+ * struct xpc_partition_sn2's dropped notify IRQ timer is set to wait the
+ * following interval #of seconds before checking for dropped notify IRQs.
+ * These can occur whenever an IRQ's associated amo write doesn't complete
+ * until after the IRQ was received.
+ */
+#define XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL	(0.25 * HZ)
+
+/* number of seconds to wait for other partitions to disengage */
+#define XPC_DISENGAGE_DEFAULT_TIMELIMIT		90
+
+/* interval in seconds to print 'waiting deactivation' messages */
+#define XPC_DEACTIVATE_PRINTMSG_INTERVAL	10
+
+#define XPC_PARTID(_p)	((short)((_p) - &xpc_partitions[0]))
+
+/* found in xp_main.c */
+extern struct xpc_registration xpc_registrations[];
+
+/* found in xpc_main.c */
+extern struct device *xpc_part;
+extern struct device *xpc_chan;
+extern struct xpc_arch_operations xpc_arch_ops;
+extern int xpc_disengage_timelimit;
+extern int xpc_disengage_timedout;
+extern int xpc_activate_IRQ_rcvd;
+extern spinlock_t xpc_activate_IRQ_rcvd_lock;
+extern wait_queue_head_t xpc_activate_IRQ_wq;
+extern void *xpc_kzalloc_cacheline_aligned(size_t, gfp_t, void **);
+extern void xpc_activate_partition(struct xpc_partition *);
+extern void xpc_activate_kthreads(struct xpc_channel *, int);
+extern void xpc_create_kthreads(struct xpc_channel *, int, int);
+extern void xpc_disconnect_wait(int);
+
+/* found in xpc_sn2.c */
+extern int xpc_init_sn2(void);
+extern void xpc_exit_sn2(void);
+
+/* found in xpc_uv.c */
+extern int xpc_init_uv(void);
+extern void xpc_exit_uv(void);
+
+/* found in xpc_partition.c */
+extern int xpc_exiting;
+extern int xpc_nasid_mask_nlongs;
+extern struct xpc_rsvd_page *xpc_rsvd_page;
+extern unsigned long *xpc_mach_nasids;
+extern struct xpc_partition *xpc_partitions;
+extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
+extern int xpc_setup_rsvd_page(void);
+extern void xpc_teardown_rsvd_page(void);
+extern int xpc_identify_activate_IRQ_sender(void);
+extern int xpc_partition_disengaged(struct xpc_partition *);
+extern enum xp_retval xpc_mark_partition_active(struct xpc_partition *);
+extern void xpc_mark_partition_inactive(struct xpc_partition *);
+extern void xpc_discovery(void);
+extern enum xp_retval xpc_get_remote_rp(int, unsigned long *,
+					struct xpc_rsvd_page *,
+					unsigned long *);
+extern void xpc_deactivate_partition(const int, struct xpc_partition *,
+				     enum xp_retval);
+extern enum xp_retval xpc_initiate_partid_to_nasids(short, void *);
+
+/* found in xpc_channel.c */
+extern void xpc_initiate_connect(int);
+extern void xpc_initiate_disconnect(int);
+extern enum xp_retval xpc_allocate_msg_wait(struct xpc_channel *);
+extern enum xp_retval xpc_initiate_send(short, int, u32, void *, u16);
+extern enum xp_retval xpc_initiate_send_notify(short, int, u32, void *, u16,
+					       xpc_notify_func, void *);
+extern void xpc_initiate_received(short, int, void *);
+extern void xpc_process_sent_chctl_flags(struct xpc_partition *);
+extern void xpc_connected_callout(struct xpc_channel *);
+extern void xpc_deliver_payload(struct xpc_channel *);
+extern void xpc_disconnect_channel(const int, struct xpc_channel *,
+				   enum xp_retval, unsigned long *);
+extern void xpc_disconnect_callout(struct xpc_channel *, enum xp_retval);
+extern void xpc_partition_going_down(struct xpc_partition *, enum xp_retval);
+
+static inline void
+xpc_wakeup_channel_mgr(struct xpc_partition *part)
+{
+	if (atomic_inc_return(&part->channel_mgr_requests) == 1)
+		wake_up(&part->channel_mgr_wq);
+}
+
+/*
+ * These next two inlines are used to keep us from tearing down a channel's
+ * msg queues while a thread may be referencing them.
+ */
+static inline void
+xpc_msgqueue_ref(struct xpc_channel *ch)
+{
+	atomic_inc(&ch->references);
+}
+
+static inline void
+xpc_msgqueue_deref(struct xpc_channel *ch)
+{
+	s32 refs = atomic_dec_return(&ch->references);
+
+	DBUG_ON(refs < 0);
+	if (refs == 0)
+		xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]);
+}
+
+#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \
+		xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs)
+
+/*
+ * These two inlines are used to keep us from tearing down a partition's
+ * setup infrastructure while a thread may be referencing it.
+ */
+static inline void
+xpc_part_deref(struct xpc_partition *part)
+{
+	s32 refs = atomic_dec_return(&part->references);
+
+	DBUG_ON(refs < 0);
+	if (refs == 0 && part->setup_state == XPC_P_SS_WTEARDOWN)
+		wake_up(&part->teardown_wq);
+}
+
+static inline int
+xpc_part_ref(struct xpc_partition *part)
+{
+	int setup;
+
+	atomic_inc(&part->references);
+	setup = (part->setup_state == XPC_P_SS_SETUP);
+	if (!setup)
+		xpc_part_deref(part);
+
+	return setup;
+}
+
+/*
+ * The following macro is to be used for the setting of the reason and
+ * reason_line fields in both the struct xpc_channel and struct xpc_partition
+ * structures.
+ */
+#define XPC_SET_REASON(_p, _reason, _line) \
+	{ \
+		(_p)->reason = _reason; \
+		(_p)->reason_line = _line; \
+	}
+
+#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
new file mode 100644
index 000000000..05a890ce2
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -0,0 +1,1011 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) channel support.
+ *
+ *	This is the part of XPC that manages the channels and
+ *	sends/receives messages across them to/from other partitions.
+ *
+ */
+
+#include <linux/device.h>
+#include "xpc.h"
+
+/*
+ * Process a connect message from a remote partition.
+ *
+ * Note: xpc_process_connect() is expecting to be called with the
+ * spin_lock_irqsave held and will leave it locked upon return.
+ */
+static void
+xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	enum xp_retval ret;
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	if (!(ch->flags & XPC_C_OPENREQUEST) ||
+	    !(ch->flags & XPC_C_ROPENREQUEST)) {
+		/* nothing more to do for now */
+		return;
+	}
+	DBUG_ON(!(ch->flags & XPC_C_CONNECTING));
+
+	if (!(ch->flags & XPC_C_SETUP)) {
+		spin_unlock_irqrestore(&ch->lock, *irq_flags);
+		ret = xpc_arch_ops.setup_msg_structures(ch);
+		spin_lock_irqsave(&ch->lock, *irq_flags);
+
+		if (ret != xpSuccess)
+			XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
+		else
+			ch->flags |= XPC_C_SETUP;
+
+		if (ch->flags & XPC_C_DISCONNECTING)
+			return;
+	}
+
+	if (!(ch->flags & XPC_C_OPENREPLY)) {
+		ch->flags |= XPC_C_OPENREPLY;
+		xpc_arch_ops.send_chctl_openreply(ch, irq_flags);
+	}
+
+	if (!(ch->flags & XPC_C_ROPENREPLY))
+		return;
+
+	if (!(ch->flags & XPC_C_OPENCOMPLETE)) {
+		ch->flags |= (XPC_C_OPENCOMPLETE | XPC_C_CONNECTED);
+		xpc_arch_ops.send_chctl_opencomplete(ch, irq_flags);
+	}
+
+	if (!(ch->flags & XPC_C_ROPENCOMPLETE))
+		return;
+
+	dev_info(xpc_chan, "channel %d to partition %d connected\n",
+		 ch->number, ch->partid);
+
+	ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP);	/* clear all else */
+}
+
+/*
+ * spin_lock_irqsave() is expected to be held on entry.
+ */
+static void
+xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	if (!(ch->flags & XPC_C_DISCONNECTING))
+		return;
+
+	DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+
+	/* make sure all activity has settled down first */
+
+	if (atomic_read(&ch->kthreads_assigned) > 0 ||
+	    atomic_read(&ch->references) > 0) {
+		return;
+	}
+	DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+		!(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
+
+	if (part->act_state == XPC_P_AS_DEACTIVATING) {
+		/* can't proceed until the other side disengages from us */
+		if (xpc_arch_ops.partition_engaged(ch->partid))
+			return;
+
+	} else {
+
+		/* as long as the other side is up do the full protocol */
+
+		if (!(ch->flags & XPC_C_RCLOSEREQUEST))
+			return;
+
+		if (!(ch->flags & XPC_C_CLOSEREPLY)) {
+			ch->flags |= XPC_C_CLOSEREPLY;
+			xpc_arch_ops.send_chctl_closereply(ch, irq_flags);
+		}
+
+		if (!(ch->flags & XPC_C_RCLOSEREPLY))
+			return;
+	}
+
+	/* wake those waiting for notify completion */
+	if (atomic_read(&ch->n_to_notify) > 0) {
+		/* we do callout while holding ch->lock, callout can't block */
+		xpc_arch_ops.notify_senders_of_disconnect(ch);
+	}
+
+	/* both sides are disconnected now */
+
+	if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) {
+		spin_unlock_irqrestore(&ch->lock, *irq_flags);
+		xpc_disconnect_callout(ch, xpDisconnected);
+		spin_lock_irqsave(&ch->lock, *irq_flags);
+	}
+
+	DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
+
+	/* it's now safe to free the channel's message queues */
+	xpc_arch_ops.teardown_msg_structures(ch);
+
+	ch->func = NULL;
+	ch->key = NULL;
+	ch->entry_size = 0;
+	ch->local_nentries = 0;
+	ch->remote_nentries = 0;
+	ch->kthreads_assigned_limit = 0;
+	ch->kthreads_idle_limit = 0;
+
+	/*
+	 * Mark the channel disconnected and clear all other flags, including
+	 * XPC_C_SETUP (because of call to
+	 * xpc_arch_ops.teardown_msg_structures()) but not including
+	 * XPC_C_WDISCONNECT (if it was set).
+	 */
+	ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
+
+	atomic_dec(&part->nchannels_active);
+
+	if (channel_was_connected) {
+		dev_info(xpc_chan, "channel %d to partition %d disconnected, "
+			 "reason=%d\n", ch->number, ch->partid, ch->reason);
+	}
+
+	if (ch->flags & XPC_C_WDISCONNECT) {
+		/* we won't lose the CPU since we're holding ch->lock */
+		complete(&ch->wdisconnect_wait);
+	} else if (ch->delayed_chctl_flags) {
+		if (part->act_state != XPC_P_AS_DEACTIVATING) {
+			/* time to take action on any delayed chctl flags */
+			spin_lock(&part->chctl_lock);
+			part->chctl.flags[ch->number] |=
+			    ch->delayed_chctl_flags;
+			spin_unlock(&part->chctl_lock);
+		}
+		ch->delayed_chctl_flags = 0;
+	}
+}
+
+/*
+ * Process a change in the channel's remote connection state.
+ */
+static void
+xpc_process_openclose_chctl_flags(struct xpc_partition *part, int ch_number,
+				  u8 chctl_flags)
+{
+	unsigned long irq_flags;
+	struct xpc_openclose_args *args =
+	    &part->remote_openclose_args[ch_number];
+	struct xpc_channel *ch = &part->channels[ch_number];
+	enum xp_retval reason;
+	enum xp_retval ret;
+	int create_kthread = 0;
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+
+again:
+
+	if ((ch->flags & XPC_C_DISCONNECTED) &&
+	    (ch->flags & XPC_C_WDISCONNECT)) {
+		/*
+		 * Delay processing chctl flags until thread waiting disconnect
+		 * has had a chance to see that the channel is disconnected.
+		 */
+		ch->delayed_chctl_flags |= chctl_flags;
+		goto out;
+	}
+
+	if (chctl_flags & XPC_CHCTL_CLOSEREQUEST) {
+
+		dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREQUEST (reason=%d) received "
+			"from partid=%d, channel=%d\n", args->reason,
+			ch->partid, ch->number);
+
+		/*
+		 * If RCLOSEREQUEST is set, we're probably waiting for
+		 * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
+		 * with this RCLOSEREQUEST in the chctl_flags.
+		 */
+
+		if (ch->flags & XPC_C_RCLOSEREQUEST) {
+			DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
+			DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+			DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
+			DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
+
+			DBUG_ON(!(chctl_flags & XPC_CHCTL_CLOSEREPLY));
+			chctl_flags &= ~XPC_CHCTL_CLOSEREPLY;
+			ch->flags |= XPC_C_RCLOSEREPLY;
+
+			/* both sides have finished disconnecting */
+			xpc_process_disconnect(ch, &irq_flags);
+			DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+			goto again;
+		}
+
+		if (ch->flags & XPC_C_DISCONNECTED) {
+			if (!(chctl_flags & XPC_CHCTL_OPENREQUEST)) {
+				if (part->chctl.flags[ch_number] &
+				    XPC_CHCTL_OPENREQUEST) {
+
+					DBUG_ON(ch->delayed_chctl_flags != 0);
+					spin_lock(&part->chctl_lock);
+					part->chctl.flags[ch_number] |=
+					    XPC_CHCTL_CLOSEREQUEST;
+					spin_unlock(&part->chctl_lock);
+				}
+				goto out;
+			}
+
+			XPC_SET_REASON(ch, 0, 0);
+			ch->flags &= ~XPC_C_DISCONNECTED;
+
+			atomic_inc(&part->nchannels_active);
+			ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
+		}
+
+		chctl_flags &= ~(XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY |
+		    XPC_CHCTL_OPENCOMPLETE);
+
+		/*
+		 * The meaningful CLOSEREQUEST connection state fields are:
+		 *      reason = reason connection is to be closed
+		 */
+
+		ch->flags |= XPC_C_RCLOSEREQUEST;
+
+		if (!(ch->flags & XPC_C_DISCONNECTING)) {
+			reason = args->reason;
+			if (reason <= xpSuccess || reason > xpUnknownReason)
+				reason = xpUnknownReason;
+			else if (reason == xpUnregistering)
+				reason = xpOtherUnregistering;
+
+			XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
+
+			DBUG_ON(chctl_flags & XPC_CHCTL_CLOSEREPLY);
+			goto out;
+		}
+
+		xpc_process_disconnect(ch, &irq_flags);
+	}
+
+	if (chctl_flags & XPC_CHCTL_CLOSEREPLY) {
+
+		dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREPLY received from partid="
+			"%d, channel=%d\n", ch->partid, ch->number);
+
+		if (ch->flags & XPC_C_DISCONNECTED) {
+			DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING);
+			goto out;
+		}
+
+		DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+
+		if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
+			if (part->chctl.flags[ch_number] &
+			    XPC_CHCTL_CLOSEREQUEST) {
+
+				DBUG_ON(ch->delayed_chctl_flags != 0);
+				spin_lock(&part->chctl_lock);
+				part->chctl.flags[ch_number] |=
+				    XPC_CHCTL_CLOSEREPLY;
+				spin_unlock(&part->chctl_lock);
+			}
+			goto out;
+		}
+
+		ch->flags |= XPC_C_RCLOSEREPLY;
+
+		if (ch->flags & XPC_C_CLOSEREPLY) {
+			/* both sides have finished disconnecting */
+			xpc_process_disconnect(ch, &irq_flags);
+		}
+	}
+
+	if (chctl_flags & XPC_CHCTL_OPENREQUEST) {
+
+		dev_dbg(xpc_chan, "XPC_CHCTL_OPENREQUEST (entry_size=%d, "
+			"local_nentries=%d) received from partid=%d, "
+			"channel=%d\n", args->entry_size, args->local_nentries,
+			ch->partid, ch->number);
+
+		if (part->act_state == XPC_P_AS_DEACTIVATING ||
+		    (ch->flags & XPC_C_ROPENREQUEST)) {
+			goto out;
+		}
+
+		if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
+			ch->delayed_chctl_flags |= XPC_CHCTL_OPENREQUEST;
+			goto out;
+		}
+		DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED |
+				       XPC_C_OPENREQUEST)));
+		DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
+				     XPC_C_OPENREPLY | XPC_C_CONNECTED));
+
+		/*
+		 * The meaningful OPENREQUEST connection state fields are:
+		 *      entry_size = size of channel's messages in bytes
+		 *      local_nentries = remote partition's local_nentries
+		 */
+		if (args->entry_size == 0 || args->local_nentries == 0) {
+			/* assume OPENREQUEST was delayed by mistake */
+			goto out;
+		}
+
+		ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
+		ch->remote_nentries = args->local_nentries;
+
+		if (ch->flags & XPC_C_OPENREQUEST) {
+			if (args->entry_size != ch->entry_size) {
+				XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes,
+						       &irq_flags);
+				goto out;
+			}
+		} else {
+			ch->entry_size = args->entry_size;
+
+			XPC_SET_REASON(ch, 0, 0);
+			ch->flags &= ~XPC_C_DISCONNECTED;
+
+			atomic_inc(&part->nchannels_active);
+		}
+
+		xpc_process_connect(ch, &irq_flags);
+	}
+
+	if (chctl_flags & XPC_CHCTL_OPENREPLY) {
+
+		dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY (local_msgqueue_pa="
+			"0x%lx, local_nentries=%d, remote_nentries=%d) "
+			"received from partid=%d, channel=%d\n",
+			args->local_msgqueue_pa, args->local_nentries,
+			args->remote_nentries, ch->partid, ch->number);
+
+		if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+			goto out;
+
+		if (!(ch->flags & XPC_C_OPENREQUEST)) {
+			XPC_DISCONNECT_CHANNEL(ch, xpOpenCloseError,
+					       &irq_flags);
+			goto out;
+		}
+
+		DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
+		DBUG_ON(ch->flags & XPC_C_CONNECTED);
+
+		/*
+		 * The meaningful OPENREPLY connection state fields are:
+		 *      local_msgqueue_pa = physical address of remote
+		 *                          partition's local_msgqueue
+		 *      local_nentries = remote partition's local_nentries
+		 *      remote_nentries = remote partition's remote_nentries
+		 */
+		DBUG_ON(args->local_msgqueue_pa == 0);
+		DBUG_ON(args->local_nentries == 0);
+		DBUG_ON(args->remote_nentries == 0);
+
+		ret = xpc_arch_ops.save_remote_msgqueue_pa(ch,
+						      args->local_msgqueue_pa);
+		if (ret != xpSuccess) {
+			XPC_DISCONNECT_CHANNEL(ch, ret, &irq_flags);
+			goto out;
+		}
+		ch->flags |= XPC_C_ROPENREPLY;
+
+		if (args->local_nentries < ch->remote_nentries) {
+			dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new "
+				"remote_nentries=%d, old remote_nentries=%d, "
+				"partid=%d, channel=%d\n",
+				args->local_nentries, ch->remote_nentries,
+				ch->partid, ch->number);
+
+			ch->remote_nentries = args->local_nentries;
+		}
+		if (args->remote_nentries < ch->local_nentries) {
+			dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new "
+				"local_nentries=%d, old local_nentries=%d, "
+				"partid=%d, channel=%d\n",
+				args->remote_nentries, ch->local_nentries,
+				ch->partid, ch->number);
+
+			ch->local_nentries = args->remote_nentries;
+		}
+
+		xpc_process_connect(ch, &irq_flags);
+	}
+
+	if (chctl_flags & XPC_CHCTL_OPENCOMPLETE) {
+
+		dev_dbg(xpc_chan, "XPC_CHCTL_OPENCOMPLETE received from "
+			"partid=%d, channel=%d\n", ch->partid, ch->number);
+
+		if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+			goto out;
+
+		if (!(ch->flags & XPC_C_OPENREQUEST) ||
+		    !(ch->flags & XPC_C_OPENREPLY)) {
+			XPC_DISCONNECT_CHANNEL(ch, xpOpenCloseError,
+					       &irq_flags);
+			goto out;
+		}
+
+		DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
+		DBUG_ON(!(ch->flags & XPC_C_ROPENREPLY));
+		DBUG_ON(!(ch->flags & XPC_C_CONNECTED));
+
+		ch->flags |= XPC_C_ROPENCOMPLETE;
+
+		xpc_process_connect(ch, &irq_flags);
+		create_kthread = 1;
+	}
+
+out:
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+	if (create_kthread)
+		xpc_create_kthreads(ch, 1, 0);
+}
+
+/*
+ * Attempt to establish a channel connection to a remote partition.
+ */
+static enum xp_retval
+xpc_connect_channel(struct xpc_channel *ch)
+{
+	unsigned long irq_flags;
+	struct xpc_registration *registration = &xpc_registrations[ch->number];
+
+	if (mutex_trylock(&registration->mutex) == 0)
+		return xpRetry;
+
+	if (!XPC_CHANNEL_REGISTERED(ch->number)) {
+		mutex_unlock(&registration->mutex);
+		return xpUnregistered;
+	}
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+
+	DBUG_ON(ch->flags & XPC_C_CONNECTED);
+	DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		mutex_unlock(&registration->mutex);
+		return ch->reason;
+	}
+
+	/* add info from the channel connect registration to the channel */
+
+	ch->kthreads_assigned_limit = registration->assigned_limit;
+	ch->kthreads_idle_limit = registration->idle_limit;
+	DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
+	DBUG_ON(atomic_read(&ch->kthreads_idle) != 0);
+	DBUG_ON(atomic_read(&ch->kthreads_active) != 0);
+
+	ch->func = registration->func;
+	DBUG_ON(registration->func == NULL);
+	ch->key = registration->key;
+
+	ch->local_nentries = registration->nentries;
+
+	if (ch->flags & XPC_C_ROPENREQUEST) {
+		if (registration->entry_size != ch->entry_size) {
+			/* the local and remote sides aren't the same */
+
+			/*
+			 * Because XPC_DISCONNECT_CHANNEL() can block we're
+			 * forced to up the registration sema before we unlock
+			 * the channel lock. But that's okay here because we're
+			 * done with the part that required the registration
+			 * sema. XPC_DISCONNECT_CHANNEL() requires that the
+			 * channel lock be locked and will unlock and relock
+			 * the channel lock as needed.
+			 */
+			mutex_unlock(&registration->mutex);
+			XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes,
+					       &irq_flags);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			return xpUnequalMsgSizes;
+		}
+	} else {
+		ch->entry_size = registration->entry_size;
+
+		XPC_SET_REASON(ch, 0, 0);
+		ch->flags &= ~XPC_C_DISCONNECTED;
+
+		atomic_inc(&xpc_partitions[ch->partid].nchannels_active);
+	}
+
+	mutex_unlock(&registration->mutex);
+
+	/* initiate the connection */
+
+	ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
+	xpc_arch_ops.send_chctl_openrequest(ch, &irq_flags);
+
+	xpc_process_connect(ch, &irq_flags);
+
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+	return xpSuccess;
+}
+
+void
+xpc_process_sent_chctl_flags(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	union xpc_channel_ctl_flags chctl;
+	struct xpc_channel *ch;
+	int ch_number;
+	u32 ch_flags;
+
+	chctl.all_flags = xpc_arch_ops.get_chctl_all_flags(part);
+
+	/*
+	 * Initiate channel connections for registered channels.
+	 *
+	 * For each connected channel that has pending messages activate idle
+	 * kthreads and/or create new kthreads as needed.
+	 */
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch = &part->channels[ch_number];
+
+		/*
+		 * Process any open or close related chctl flags, and then deal
+		 * with connecting or disconnecting the channel as required.
+		 */
+
+		if (chctl.flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS) {
+			xpc_process_openclose_chctl_flags(part, ch_number,
+							chctl.flags[ch_number]);
+		}
+
+		ch_flags = ch->flags;	/* need an atomic snapshot of flags */
+
+		if (ch_flags & XPC_C_DISCONNECTING) {
+			spin_lock_irqsave(&ch->lock, irq_flags);
+			xpc_process_disconnect(ch, &irq_flags);
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+			continue;
+		}
+
+		if (part->act_state == XPC_P_AS_DEACTIVATING)
+			continue;
+
+		if (!(ch_flags & XPC_C_CONNECTED)) {
+			if (!(ch_flags & XPC_C_OPENREQUEST)) {
+				DBUG_ON(ch_flags & XPC_C_SETUP);
+				(void)xpc_connect_channel(ch);
+			}
+			continue;
+		}
+
+		/*
+		 * Process any message related chctl flags, this may involve
+		 * the activation of kthreads to deliver any pending messages
+		 * sent from the other partition.
+		 */
+
+		if (chctl.flags[ch_number] & XPC_MSG_CHCTL_FLAGS)
+			xpc_arch_ops.process_msg_chctl_flags(part, ch_number);
+	}
+}
+
+/*
+ * XPC's heartbeat code calls this function to inform XPC that a partition is
+ * going down.  XPC responds by tearing down the XPartition Communication
+ * infrastructure used for the just downed partition.
+ *
+ * XPC's heartbeat code will never call this function and xpc_partition_up()
+ * at the same time. Nor will it ever make multiple calls to either function
+ * at the same time.
+ */
+void
+xpc_partition_going_down(struct xpc_partition *part, enum xp_retval reason)
+{
+	unsigned long irq_flags;
+	int ch_number;
+	struct xpc_channel *ch;
+
+	dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n",
+		XPC_PARTID(part), reason);
+
+	if (!xpc_part_ref(part)) {
+		/* infrastructure for this partition isn't currently set up */
+		return;
+	}
+
+	/* disconnect channels associated with the partition going down */
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch = &part->channels[ch_number];
+
+		xpc_msgqueue_ref(ch);
+		spin_lock_irqsave(&ch->lock, irq_flags);
+
+		XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
+
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		xpc_msgqueue_deref(ch);
+	}
+
+	xpc_wakeup_channel_mgr(part);
+
+	xpc_part_deref(part);
+}
+
+/*
+ * Called by XP at the time of channel connection registration to cause
+ * XPC to establish connections to all currently active partitions.
+ */
+void
+xpc_initiate_connect(int ch_number)
+{
+	short partid;
+	struct xpc_partition *part;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+
+	for (partid = 0; partid < xp_max_npartitions; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (xpc_part_ref(part)) {
+			/*
+			 * Initiate the establishment of a connection on the
+			 * newly registered channel to the remote partition.
+			 */
+			xpc_wakeup_channel_mgr(part);
+			xpc_part_deref(part);
+		}
+	}
+}
+
+void
+xpc_connected_callout(struct xpc_channel *ch)
+{
+	/* let the registerer know that a connection has been established */
+
+	if (ch->func != NULL) {
+		dev_dbg(xpc_chan, "ch->func() called, reason=xpConnected, "
+			"partid=%d, channel=%d\n", ch->partid, ch->number);
+
+		ch->func(xpConnected, ch->partid, ch->number,
+			 (void *)(u64)ch->local_nentries, ch->key);
+
+		dev_dbg(xpc_chan, "ch->func() returned, reason=xpConnected, "
+			"partid=%d, channel=%d\n", ch->partid, ch->number);
+	}
+}
+
+/*
+ * Called by XP at the time of channel connection unregistration to cause
+ * XPC to teardown all current connections for the specified channel.
+ *
+ * Before returning xpc_initiate_disconnect() will wait until all connections
+ * on the specified channel have been closed/torndown. So the caller can be
+ * assured that they will not be receiving any more callouts from XPC to the
+ * function they registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ *	ch_number - channel # to unregister.
+ */
+void
+xpc_initiate_disconnect(int ch_number)
+{
+	unsigned long irq_flags;
+	short partid;
+	struct xpc_partition *part;
+	struct xpc_channel *ch;
+
+	DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+
+	/* initiate the channel disconnect for every active partition */
+	for (partid = 0; partid < xp_max_npartitions; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (xpc_part_ref(part)) {
+			ch = &part->channels[ch_number];
+			xpc_msgqueue_ref(ch);
+
+			spin_lock_irqsave(&ch->lock, irq_flags);
+
+			if (!(ch->flags & XPC_C_DISCONNECTED)) {
+				ch->flags |= XPC_C_WDISCONNECT;
+
+				XPC_DISCONNECT_CHANNEL(ch, xpUnregistering,
+						       &irq_flags);
+			}
+
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+			xpc_msgqueue_deref(ch);
+			xpc_part_deref(part);
+		}
+	}
+
+	xpc_disconnect_wait(ch_number);
+}
+
+/*
+ * To disconnect a channel, and reflect it back to all who may be waiting.
+ *
+ * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
+ * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
+ * xpc_disconnect_wait().
+ *
+ * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
+ */
+void
+xpc_disconnect_channel(const int line, struct xpc_channel *ch,
+		       enum xp_retval reason, unsigned long *irq_flags)
+{
+	u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+		return;
+
+	DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED)));
+
+	dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n",
+		reason, line, ch->partid, ch->number);
+
+	XPC_SET_REASON(ch, reason, line);
+
+	ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
+	/* some of these may not have been set */
+	ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
+		       XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
+		       XPC_C_CONNECTING | XPC_C_CONNECTED);
+
+	xpc_arch_ops.send_chctl_closerequest(ch, irq_flags);
+
+	if (channel_was_connected)
+		ch->flags |= XPC_C_WASCONNECTED;
+
+	spin_unlock_irqrestore(&ch->lock, *irq_flags);
+
+	/* wake all idle kthreads so they can exit */
+	if (atomic_read(&ch->kthreads_idle) > 0) {
+		wake_up_all(&ch->idle_wq);
+
+	} else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+		   !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
+		/* start a kthread that will do the xpDisconnecting callout */
+		xpc_create_kthreads(ch, 1, 1);
+	}
+
+	/* wake those waiting to allocate an entry from the local msg queue */
+	if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+		wake_up(&ch->msg_allocate_wq);
+
+	spin_lock_irqsave(&ch->lock, *irq_flags);
+}
+
+void
+xpc_disconnect_callout(struct xpc_channel *ch, enum xp_retval reason)
+{
+	/*
+	 * Let the channel's registerer know that the channel is being
+	 * disconnected. We don't want to do this if the registerer was never
+	 * informed of a connection being made.
+	 */
+
+	if (ch->func != NULL) {
+		dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
+			"channel=%d\n", reason, ch->partid, ch->number);
+
+		ch->func(reason, ch->partid, ch->number, NULL, ch->key);
+
+		dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
+			"channel=%d\n", reason, ch->partid, ch->number);
+	}
+}
+
+/*
+ * Wait for a message entry to become available for the specified channel,
+ * but don't wait any longer than 1 jiffy.
+ */
+enum xp_retval
+xpc_allocate_msg_wait(struct xpc_channel *ch)
+{
+	enum xp_retval ret;
+	DEFINE_WAIT(wait);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		DBUG_ON(ch->reason == xpInterrupted);
+		return ch->reason;
+	}
+
+	atomic_inc(&ch->n_on_msg_allocate_wq);
+	prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE);
+	ret = schedule_timeout(1);
+	finish_wait(&ch->msg_allocate_wq, &wait);
+	atomic_dec(&ch->n_on_msg_allocate_wq);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		ret = ch->reason;
+		DBUG_ON(ch->reason == xpInterrupted);
+	} else if (ret == 0) {
+		ret = xpTimeout;
+	} else {
+		ret = xpInterrupted;
+	}
+
+	return ret;
+}
+
+/*
+ * Send a message that contains the user's payload on the specified channel
+ * connected to the specified partition.
+ *
+ * NOTE that this routine can sleep waiting for a message entry to become
+ * available. To not sleep, pass in the XPC_NOWAIT flag.
+ *
+ * Once sent, this routine will not wait for the message to be received, nor
+ * will notification be given when it does happen.
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel # to send message on.
+ *	flags - see xp.h for valid flags.
+ *	payload - pointer to the payload which is to be sent.
+ *	payload_size - size of the payload in bytes.
+ */
+enum xp_retval
+xpc_initiate_send(short partid, int ch_number, u32 flags, void *payload,
+		  u16 payload_size)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	enum xp_retval ret = xpUnknownReason;
+
+	dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload,
+		partid, ch_number);
+
+	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+	DBUG_ON(payload == NULL);
+
+	if (xpc_part_ref(part)) {
+		ret = xpc_arch_ops.send_payload(&part->channels[ch_number],
+				  flags, payload, payload_size, 0, NULL, NULL);
+		xpc_part_deref(part);
+	}
+
+	return ret;
+}
+
+/*
+ * Send a message that contains the user's payload on the specified channel
+ * connected to the specified partition.
+ *
+ * NOTE that this routine can sleep waiting for a message entry to become
+ * available. To not sleep, pass in the XPC_NOWAIT flag.
+ *
+ * This routine will not wait for the message to be sent or received.
+ *
+ * Once the remote end of the channel has received the message, the function
+ * passed as an argument to xpc_initiate_send_notify() will be called. This
+ * allows the sender to free up or re-use any buffers referenced by the
+ * message, but does NOT mean the message has been processed at the remote
+ * end by a receiver.
+ *
+ * If this routine returns an error, the caller's function will NOT be called.
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel # to send message on.
+ *	flags - see xp.h for valid flags.
+ *	payload - pointer to the payload which is to be sent.
+ *	payload_size - size of the payload in bytes.
+ *	func - function to call with asynchronous notification of message
+ *		  receipt. THIS FUNCTION MUST BE NON-BLOCKING.
+ *	key - user-defined key to be passed to the function when it's called.
+ */
+enum xp_retval
+xpc_initiate_send_notify(short partid, int ch_number, u32 flags, void *payload,
+			 u16 payload_size, xpc_notify_func func, void *key)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	enum xp_retval ret = xpUnknownReason;
+
+	dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload,
+		partid, ch_number);
+
+	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+	DBUG_ON(payload == NULL);
+	DBUG_ON(func == NULL);
+
+	if (xpc_part_ref(part)) {
+		ret = xpc_arch_ops.send_payload(&part->channels[ch_number],
+			  flags, payload, payload_size, XPC_N_CALL, func, key);
+		xpc_part_deref(part);
+	}
+	return ret;
+}
+
+/*
+ * Deliver a message's payload to its intended recipient.
+ */
+void
+xpc_deliver_payload(struct xpc_channel *ch)
+{
+	void *payload;
+
+	payload = xpc_arch_ops.get_deliverable_payload(ch);
+	if (payload != NULL) {
+
+		/*
+		 * This ref is taken to protect the payload itself from being
+		 * freed before the user is finished with it, which the user
+		 * indicates by calling xpc_initiate_received().
+		 */
+		xpc_msgqueue_ref(ch);
+
+		atomic_inc(&ch->kthreads_active);
+
+		if (ch->func != NULL) {
+			dev_dbg(xpc_chan, "ch->func() called, payload=0x%p "
+				"partid=%d channel=%d\n", payload, ch->partid,
+				ch->number);
+
+			/* deliver the message to its intended recipient */
+			ch->func(xpMsgReceived, ch->partid, ch->number, payload,
+				 ch->key);
+
+			dev_dbg(xpc_chan, "ch->func() returned, payload=0x%p "
+				"partid=%d channel=%d\n", payload, ch->partid,
+				ch->number);
+		}
+
+		atomic_dec(&ch->kthreads_active);
+	}
+}
+
+/*
+ * Acknowledge receipt of a delivered message's payload.
+ *
+ * This function, although called by users, does not call xpc_part_ref() to
+ * ensure that the partition infrastructure is in place. It relies on the
+ * fact that we called xpc_msgqueue_ref() in xpc_deliver_payload().
+ *
+ * Arguments:
+ *
+ *	partid - ID of partition to which the channel is connected.
+ *	ch_number - channel # message received on.
+ *	payload - pointer to the payload area allocated via
+ *			xpc_initiate_send() or xpc_initiate_send_notify().
+ */
+void
+xpc_initiate_received(short partid, int ch_number, void *payload)
+{
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_channel *ch;
+
+	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+	DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+
+	ch = &part->channels[ch_number];
+	xpc_arch_ops.received_payload(ch, payload);
+
+	/* the call to xpc_msgqueue_ref() was done by xpc_deliver_payload()  */
+	xpc_msgqueue_deref(ch);
+}
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
new file mode 100644
index 000000000..83fc748a9
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -0,0 +1,1373 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) support - standard version.
+ *
+ *	XPC provides a message passing capability that crosses partition
+ *	boundaries. This module is made up of two parts:
+ *
+ *	    partition	This part detects the presence/absence of other
+ *			partitions. It provides a heartbeat and monitors
+ *			the heartbeats of other partitions.
+ *
+ *	    channel	This part manages the channels and sends/receives
+ *			messages across them to/from other partitions.
+ *
+ *	There are a couple of additional functions residing in XP, which
+ *	provide an interface to XPC for its users.
+ *
+ *
+ *	Caveats:
+ *
+ *	  . Currently on sn2, we have no way to determine which nasid an IRQ
+ *	    came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
+ *	    followed by an IPI. The amo indicates where data is to be pulled
+ *	    from, so after the IPI arrives, the remote partition checks the amo
+ *	    word. The IPI can actually arrive before the amo however, so other
+ *	    code must periodically check for this case. Also, remote amo
+ *	    operations do not reliably time out. Thus we do a remote PIO read
+ *	    solely to know whether the remote partition is down and whether we
+ *	    should stop sending IPIs to it. This remote PIO read operation is
+ *	    set up in a special nofault region so SAL knows to ignore (and
+ *	    cleanup) any errors due to the remote amo write, PIO read, and/or
+ *	    PIO write operations.
+ *
+ *	    If/when new hardware solves this IPI problem, we should abandon
+ *	    the current approach.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/kdebug.h>
+#include <linux/kthread.h>
+#include "xpc.h"
+
+#ifdef CONFIG_X86_64
+#include <asm/traps.h>
+#endif
+
+/* define two XPC debug device structures to be used with dev_dbg() et al */
+
+struct device_driver xpc_dbg_name = {
+	.name = "xpc"
+};
+
+struct device xpc_part_dbg_subname = {
+	.init_name = "",	/* set to "part" at xpc_init() time */
+	.driver = &xpc_dbg_name
+};
+
+struct device xpc_chan_dbg_subname = {
+	.init_name = "",	/* set to "chan" at xpc_init() time */
+	.driver = &xpc_dbg_name
+};
+
+struct device *xpc_part = &xpc_part_dbg_subname;
+struct device *xpc_chan = &xpc_chan_dbg_subname;
+
+static int xpc_kdebug_ignore;
+
+/* systune related variables for /proc/sys directories */
+
+static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
+static int xpc_hb_min_interval = 1;
+static int xpc_hb_max_interval = 10;
+
+static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
+static int xpc_hb_check_min_interval = 10;
+static int xpc_hb_check_max_interval = 120;
+
+int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
+static int xpc_disengage_min_timelimit;	/* = 0 */
+static int xpc_disengage_max_timelimit = 120;
+
+static struct ctl_table xpc_sys_xpc_hb_dir[] = {
+	{
+	 .procname = "hb_interval",
+	 .data = &xpc_hb_interval,
+	 .maxlen = sizeof(int),
+	 .mode = 0644,
+	 .proc_handler = proc_dointvec_minmax,
+	 .extra1 = &xpc_hb_min_interval,
+	 .extra2 = &xpc_hb_max_interval},
+	{
+	 .procname = "hb_check_interval",
+	 .data = &xpc_hb_check_interval,
+	 .maxlen = sizeof(int),
+	 .mode = 0644,
+	 .proc_handler = proc_dointvec_minmax,
+	 .extra1 = &xpc_hb_check_min_interval,
+	 .extra2 = &xpc_hb_check_max_interval},
+	{}
+};
+static struct ctl_table xpc_sys_xpc_dir[] = {
+	{
+	 .procname = "hb",
+	 .mode = 0555,
+	 .child = xpc_sys_xpc_hb_dir},
+	{
+	 .procname = "disengage_timelimit",
+	 .data = &xpc_disengage_timelimit,
+	 .maxlen = sizeof(int),
+	 .mode = 0644,
+	 .proc_handler = proc_dointvec_minmax,
+	 .extra1 = &xpc_disengage_min_timelimit,
+	 .extra2 = &xpc_disengage_max_timelimit},
+	{}
+};
+static struct ctl_table xpc_sys_dir[] = {
+	{
+	 .procname = "xpc",
+	 .mode = 0555,
+	 .child = xpc_sys_xpc_dir},
+	{}
+};
+static struct ctl_table_header *xpc_sysctl;
+
+/* non-zero if any remote partition disengage was timed out */
+int xpc_disengage_timedout;
+
+/* #of activate IRQs received and not yet processed */
+int xpc_activate_IRQ_rcvd;
+DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);
+
+/* IRQ handler notifies this wait queue on receipt of an IRQ */
+DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);
+
+static unsigned long xpc_hb_check_timeout;
+static struct timer_list xpc_hb_timer;
+
+/* notification that the xpc_hb_checker thread has exited */
+static DECLARE_COMPLETION(xpc_hb_checker_exited);
+
+/* notification that the xpc_discovery thread has exited */
+static DECLARE_COMPLETION(xpc_discovery_exited);
+
+static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
+
+static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_reboot_notifier = {
+	.notifier_call = xpc_system_reboot,
+};
+
+static int xpc_system_die(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_die_notifier = {
+	.notifier_call = xpc_system_die,
+};
+
+struct xpc_arch_operations xpc_arch_ops;
+
+/*
+ * Timer function to enforce the timelimit on the partition disengage.
+ */
+static void
+xpc_timeout_partition_disengage(struct timer_list *t)
+{
+	struct xpc_partition *part = from_timer(part, t, disengage_timer);
+
+	DBUG_ON(time_is_after_jiffies(part->disengage_timeout));
+
+	(void)xpc_partition_disengaged(part);
+
+	DBUG_ON(part->disengage_timeout != 0);
+	DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
+}
+
+/*
+ * Timer to produce the heartbeat.  The timer structures function is
+ * already set when this is initially called.  A tunable is used to
+ * specify when the next timeout should occur.
+ */
+static void
+xpc_hb_beater(struct timer_list *unused)
+{
+	xpc_arch_ops.increment_heartbeat();
+
+	if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
+		wake_up_interruptible(&xpc_activate_IRQ_wq);
+
+	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
+	add_timer(&xpc_hb_timer);
+}
+
+static void
+xpc_start_hb_beater(void)
+{
+	xpc_arch_ops.heartbeat_init();
+	timer_setup(&xpc_hb_timer, xpc_hb_beater, 0);
+	xpc_hb_beater(0);
+}
+
+static void
+xpc_stop_hb_beater(void)
+{
+	del_timer_sync(&xpc_hb_timer);
+	xpc_arch_ops.heartbeat_exit();
+}
+
+/*
+ * At periodic intervals, scan through all active partitions and ensure
+ * their heartbeat is still active.  If not, the partition is deactivated.
+ */
+static void
+xpc_check_remote_hb(void)
+{
+	struct xpc_partition *part;
+	short partid;
+	enum xp_retval ret;
+
+	for (partid = 0; partid < xp_max_npartitions; partid++) {
+
+		if (xpc_exiting)
+			break;
+
+		if (partid == xp_partition_id)
+			continue;
+
+		part = &xpc_partitions[partid];
+
+		if (part->act_state == XPC_P_AS_INACTIVE ||
+		    part->act_state == XPC_P_AS_DEACTIVATING) {
+			continue;
+		}
+
+		ret = xpc_arch_ops.get_remote_heartbeat(part);
+		if (ret != xpSuccess)
+			XPC_DEACTIVATE_PARTITION(part, ret);
+	}
+}
+
+/*
+ * This thread is responsible for nearly all of the partition
+ * activation/deactivation.
+ */
+static int
+xpc_hb_checker(void *ignore)
+{
+	int force_IRQ = 0;
+
+	/* this thread was marked active by xpc_hb_init() */
+
+	set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
+
+	/* set our heartbeating to other partitions into motion */
+	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
+	xpc_start_hb_beater();
+
+	while (!xpc_exiting) {
+
+		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
+			"been received\n",
+			(int)(xpc_hb_check_timeout - jiffies),
+			xpc_activate_IRQ_rcvd);
+
+		/* checking of remote heartbeats is skewed by IRQ handling */
+		if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
+			xpc_hb_check_timeout = jiffies +
+			    (xpc_hb_check_interval * HZ);
+
+			dev_dbg(xpc_part, "checking remote heartbeats\n");
+			xpc_check_remote_hb();
+
+			/*
+			 * On sn2 we need to periodically recheck to ensure no
+			 * IRQ/amo pairs have been missed.
+			 */
+			if (is_shub())
+				force_IRQ = 1;
+		}
+
+		/* check for outstanding IRQs */
+		if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
+			force_IRQ = 0;
+			dev_dbg(xpc_part, "processing activate IRQs "
+				"received\n");
+			xpc_arch_ops.process_activate_IRQ_rcvd();
+		}
+
+		/* wait for IRQ or timeout */
+		(void)wait_event_interruptible(xpc_activate_IRQ_wq,
+					       (time_is_before_eq_jiffies(
+						xpc_hb_check_timeout) ||
+						xpc_activate_IRQ_rcvd > 0 ||
+						xpc_exiting));
+	}
+
+	xpc_stop_hb_beater();
+
+	dev_dbg(xpc_part, "heartbeat checker is exiting\n");
+
+	/* mark this thread as having exited */
+	complete(&xpc_hb_checker_exited);
+	return 0;
+}
+
+/*
+ * This thread will attempt to discover other partitions to activate
+ * based on info provided by SAL. This new thread is short lived and
+ * will exit once discovery is complete.
+ */
+static int
+xpc_initiate_discovery(void *ignore)
+{
+	xpc_discovery();
+
+	dev_dbg(xpc_part, "discovery thread is exiting\n");
+
+	/* mark this thread as having exited */
+	complete(&xpc_discovery_exited);
+	return 0;
+}
+
+/*
+ * The first kthread assigned to a newly activated partition is the one
+ * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
+ * that kthread until the partition is brought down, at which time that kthread
+ * returns back to XPC HB. (The return of that kthread will signify to XPC HB
+ * that XPC has dismantled all communication infrastructure for the associated
+ * partition.) This kthread becomes the channel manager for that partition.
+ *
+ * Each active partition has a channel manager, who, besides connecting and
+ * disconnecting channels, will ensure that each of the partition's connected
+ * channels has the required number of assigned kthreads to get the work done.
+ */
+static void
+xpc_channel_mgr(struct xpc_partition *part)
+{
+	while (part->act_state != XPC_P_AS_DEACTIVATING ||
+	       atomic_read(&part->nchannels_active) > 0 ||
+	       !xpc_partition_disengaged(part)) {
+
+		xpc_process_sent_chctl_flags(part);
+
+		/*
+		 * Wait until we've been requested to activate kthreads or
+		 * all of the channel's message queues have been torn down or
+		 * a signal is pending.
+		 *
+		 * The channel_mgr_requests is set to 1 after being awakened,
+		 * This is done to prevent the channel mgr from making one pass
+		 * through the loop for each request, since he will
+		 * be servicing all the requests in one pass. The reason it's
+		 * set to 1 instead of 0 is so that other kthreads will know
+		 * that the channel mgr is running and won't bother trying to
+		 * wake him up.
+		 */
+		atomic_dec(&part->channel_mgr_requests);
+		(void)wait_event_interruptible(part->channel_mgr_wq,
+				(atomic_read(&part->channel_mgr_requests) > 0 ||
+				 part->chctl.all_flags != 0 ||
+				 (part->act_state == XPC_P_AS_DEACTIVATING &&
+				 atomic_read(&part->nchannels_active) == 0 &&
+				 xpc_partition_disengaged(part))));
+		atomic_set(&part->channel_mgr_requests, 1);
+	}
+}
+
+/*
+ * Guarantee that the kzalloc'd memory is cacheline aligned.
+ */
+void *
+xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+	/* see if kzalloc will give us cachline aligned memory by default */
+	*base = kzalloc(size, flags);
+	if (*base == NULL)
+		return NULL;
+
+	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
+		return *base;
+
+	kfree(*base);
+
+	/* nope, we'll have to do it ourselves */
+	*base = kzalloc(size + L1_CACHE_BYTES, flags);
+	if (*base == NULL)
+		return NULL;
+
+	return (void *)L1_CACHE_ALIGN((u64)*base);
+}
+
+/*
+ * Setup the channel structures necessary to support XPartition Communication
+ * between the specified remote partition and the local one.
+ */
+static enum xp_retval
+xpc_setup_ch_structures(struct xpc_partition *part)
+{
+	enum xp_retval ret;
+	int ch_number;
+	struct xpc_channel *ch;
+	short partid = XPC_PARTID(part);
+
+	/*
+	 * Allocate all of the channel structures as a contiguous chunk of
+	 * memory.
+	 */
+	DBUG_ON(part->channels != NULL);
+	part->channels = kcalloc(XPC_MAX_NCHANNELS,
+				 sizeof(struct xpc_channel),
+				 GFP_KERNEL);
+	if (part->channels == NULL) {
+		dev_err(xpc_chan, "can't get memory for channels\n");
+		return xpNoMemory;
+	}
+
+	/* allocate the remote open and close args */
+
+	part->remote_openclose_args =
+	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
+					  GFP_KERNEL, &part->
+					  remote_openclose_args_base);
+	if (part->remote_openclose_args == NULL) {
+		dev_err(xpc_chan, "can't get memory for remote connect args\n");
+		ret = xpNoMemory;
+		goto out_1;
+	}
+
+	part->chctl.all_flags = 0;
+	spin_lock_init(&part->chctl_lock);
+
+	atomic_set(&part->channel_mgr_requests, 1);
+	init_waitqueue_head(&part->channel_mgr_wq);
+
+	part->nchannels = XPC_MAX_NCHANNELS;
+
+	atomic_set(&part->nchannels_active, 0);
+	atomic_set(&part->nchannels_engaged, 0);
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch = &part->channels[ch_number];
+
+		ch->partid = partid;
+		ch->number = ch_number;
+		ch->flags = XPC_C_DISCONNECTED;
+
+		atomic_set(&ch->kthreads_assigned, 0);
+		atomic_set(&ch->kthreads_idle, 0);
+		atomic_set(&ch->kthreads_active, 0);
+
+		atomic_set(&ch->references, 0);
+		atomic_set(&ch->n_to_notify, 0);
+
+		spin_lock_init(&ch->lock);
+		init_completion(&ch->wdisconnect_wait);
+
+		atomic_set(&ch->n_on_msg_allocate_wq, 0);
+		init_waitqueue_head(&ch->msg_allocate_wq);
+		init_waitqueue_head(&ch->idle_wq);
+	}
+
+	ret = xpc_arch_ops.setup_ch_structures(part);
+	if (ret != xpSuccess)
+		goto out_2;
+
+	/*
+	 * With the setting of the partition setup_state to XPC_P_SS_SETUP,
+	 * we're declaring that this partition is ready to go.
+	 */
+	part->setup_state = XPC_P_SS_SETUP;
+
+	return xpSuccess;
+
+	/* setup of ch structures failed */
+out_2:
+	kfree(part->remote_openclose_args_base);
+	part->remote_openclose_args = NULL;
+out_1:
+	kfree(part->channels);
+	part->channels = NULL;
+	return ret;
+}
+
+/*
+ * Teardown the channel structures necessary to support XPartition Communication
+ * between the specified remote partition and the local one.
+ */
+static void
+xpc_teardown_ch_structures(struct xpc_partition *part)
+{
+	DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
+	DBUG_ON(atomic_read(&part->nchannels_active) != 0);
+
+	/*
+	 * Make this partition inaccessible to local processes by marking it
+	 * as no longer setup. Then wait before proceeding with the teardown
+	 * until all existing references cease.
+	 */
+	DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
+	part->setup_state = XPC_P_SS_WTEARDOWN;
+
+	wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
+
+	/* now we can begin tearing down the infrastructure */
+
+	xpc_arch_ops.teardown_ch_structures(part);
+
+	kfree(part->remote_openclose_args_base);
+	part->remote_openclose_args = NULL;
+	kfree(part->channels);
+	part->channels = NULL;
+
+	part->setup_state = XPC_P_SS_TORNDOWN;
+}
+
+/*
+ * When XPC HB determines that a partition has come up, it will create a new
+ * kthread and that kthread will call this function to attempt to set up the
+ * basic infrastructure used for Cross Partition Communication with the newly
+ * upped partition.
+ *
+ * The kthread that was created by XPC HB and which setup the XPC
+ * infrastructure will remain assigned to the partition becoming the channel
+ * manager for that partition until the partition is deactivating, at which
+ * time the kthread will teardown the XPC infrastructure and then exit.
+ */
+static int
+xpc_activating(void *__partid)
+{
+	short partid = (u64)__partid;
+	struct xpc_partition *part = &xpc_partitions[partid];
+	unsigned long irq_flags;
+
+	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+
+	if (part->act_state == XPC_P_AS_DEACTIVATING) {
+		part->act_state = XPC_P_AS_INACTIVE;
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		part->remote_rp_pa = 0;
+		return 0;
+	}
+
+	/* indicate the thread is activating */
+	DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
+	part->act_state = XPC_P_AS_ACTIVATING;
+
+	XPC_SET_REASON(part, 0, 0);
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	dev_dbg(xpc_part, "activating partition %d\n", partid);
+
+	xpc_arch_ops.allow_hb(partid);
+
+	if (xpc_setup_ch_structures(part) == xpSuccess) {
+		(void)xpc_part_ref(part);	/* this will always succeed */
+
+		if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
+			xpc_mark_partition_active(part);
+			xpc_channel_mgr(part);
+			/* won't return until partition is deactivating */
+		}
+
+		xpc_part_deref(part);
+		xpc_teardown_ch_structures(part);
+	}
+
+	xpc_arch_ops.disallow_hb(partid);
+	xpc_mark_partition_inactive(part);
+
+	if (part->reason == xpReactivating) {
+		/* interrupting ourselves results in activating partition */
+		xpc_arch_ops.request_partition_reactivation(part);
+	}
+
+	return 0;
+}
+
+void
+xpc_activate_partition(struct xpc_partition *part)
+{
+	short partid = XPC_PARTID(part);
+	unsigned long irq_flags;
+	struct task_struct *kthread;
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+
+	DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);
+
+	part->act_state = XPC_P_AS_ACTIVATION_REQ;
+	XPC_SET_REASON(part, xpCloneKThread, __LINE__);
+
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
+			      partid);
+	if (IS_ERR(kthread)) {
+		spin_lock_irqsave(&part->act_lock, irq_flags);
+		part->act_state = XPC_P_AS_INACTIVE;
+		XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+	}
+}
+
+void
+xpc_activate_kthreads(struct xpc_channel *ch, int needed)
+{
+	int idle = atomic_read(&ch->kthreads_idle);
+	int assigned = atomic_read(&ch->kthreads_assigned);
+	int wakeup;
+
+	DBUG_ON(needed <= 0);
+
+	if (idle > 0) {
+		wakeup = (needed > idle) ? idle : needed;
+		needed -= wakeup;
+
+		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
+			"channel=%d\n", wakeup, ch->partid, ch->number);
+
+		/* only wakeup the requested number of kthreads */
+		wake_up_nr(&ch->idle_wq, wakeup);
+	}
+
+	if (needed <= 0)
+		return;
+
+	if (needed + assigned > ch->kthreads_assigned_limit) {
+		needed = ch->kthreads_assigned_limit - assigned;
+		if (needed <= 0)
+			return;
+	}
+
+	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
+		needed, ch->partid, ch->number);
+
+	xpc_create_kthreads(ch, needed, 0);
+}
+
+/*
+ * This function is where XPC's kthreads wait for messages to deliver.
+ */
+static void
+xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
+{
+	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
+		xpc_arch_ops.n_of_deliverable_payloads;
+
+	do {
+		/* deliver messages to their intended recipients */
+
+		while (n_of_deliverable_payloads(ch) > 0 &&
+		       !(ch->flags & XPC_C_DISCONNECTING)) {
+			xpc_deliver_payload(ch);
+		}
+
+		if (atomic_inc_return(&ch->kthreads_idle) >
+		    ch->kthreads_idle_limit) {
+			/* too many idle kthreads on this channel */
+			atomic_dec(&ch->kthreads_idle);
+			break;
+		}
+
+		dev_dbg(xpc_chan, "idle kthread calling "
+			"wait_event_interruptible_exclusive()\n");
+
+		(void)wait_event_interruptible_exclusive(ch->idle_wq,
+				(n_of_deliverable_payloads(ch) > 0 ||
+				 (ch->flags & XPC_C_DISCONNECTING)));
+
+		atomic_dec(&ch->kthreads_idle);
+
+	} while (!(ch->flags & XPC_C_DISCONNECTING));
+}
+
+static int
+xpc_kthread_start(void *args)
+{
+	short partid = XPC_UNPACK_ARG1(args);
+	u16 ch_number = XPC_UNPACK_ARG2(args);
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_channel *ch;
+	int n_needed;
+	unsigned long irq_flags;
+	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
+		xpc_arch_ops.n_of_deliverable_payloads;
+
+	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
+		partid, ch_number);
+
+	ch = &part->channels[ch_number];
+
+	if (!(ch->flags & XPC_C_DISCONNECTING)) {
+
+		/* let registerer know that connection has been established */
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
+			ch->flags |= XPC_C_CONNECTEDCALLOUT;
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+			xpc_connected_callout(ch);
+
+			spin_lock_irqsave(&ch->lock, irq_flags);
+			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+			/*
+			 * It is possible that while the callout was being
+			 * made that the remote partition sent some messages.
+			 * If that is the case, we may need to activate
+			 * additional kthreads to help deliver them. We only
+			 * need one less than total #of messages to deliver.
+			 */
+			n_needed = n_of_deliverable_payloads(ch) - 1;
+			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
+				xpc_activate_kthreads(ch, n_needed);
+
+		} else {
+			spin_unlock_irqrestore(&ch->lock, irq_flags);
+		}
+
+		xpc_kthread_waitmsgs(part, ch);
+	}
+
+	/* let registerer know that connection is disconnecting */
+
+	spin_lock_irqsave(&ch->lock, irq_flags);
+	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
+		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+		xpc_disconnect_callout(ch, xpDisconnecting);
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
+	}
+	spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+	if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
+	    atomic_dec_return(&part->nchannels_engaged) == 0) {
+		xpc_arch_ops.indicate_partition_disengaged(part);
+	}
+
+	xpc_msgqueue_deref(ch);
+
+	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
+		partid, ch_number);
+
+	xpc_part_deref(part);
+	return 0;
+}
+
+/*
+ * For each partition that XPC has established communications with, there is
+ * a minimum of one kernel thread assigned to perform any operation that
+ * may potentially sleep or block (basically the callouts to the asynchronous
+ * functions registered via xpc_connect()).
+ *
+ * Additional kthreads are created and destroyed by XPC as the workload
+ * demands.
+ *
+ * A kthread is assigned to one of the active channels that exists for a given
+ * partition.
+ */
+void
+xpc_create_kthreads(struct xpc_channel *ch, int needed,
+		    int ignore_disconnecting)
+{
+	unsigned long irq_flags;
+	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	struct task_struct *kthread;
+	void (*indicate_partition_disengaged) (struct xpc_partition *) =
+		xpc_arch_ops.indicate_partition_disengaged;
+
+	while (needed-- > 0) {
+
+		/*
+		 * The following is done on behalf of the newly created
+		 * kthread. That kthread is responsible for doing the
+		 * counterpart to the following before it exits.
+		 */
+		if (ignore_disconnecting) {
+			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
+				/* kthreads assigned had gone to zero */
+				BUG_ON(!(ch->flags &
+					 XPC_C_DISCONNECTINGCALLOUT_MADE));
+				break;
+			}
+
+		} else if (ch->flags & XPC_C_DISCONNECTING) {
+			break;
+
+		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
+			   atomic_inc_return(&part->nchannels_engaged) == 1) {
+			xpc_arch_ops.indicate_partition_engaged(part);
+		}
+		(void)xpc_part_ref(part);
+		xpc_msgqueue_ref(ch);
+
+		kthread = kthread_run(xpc_kthread_start, (void *)args,
+				      "xpc%02dc%d", ch->partid, ch->number);
+		if (IS_ERR(kthread)) {
+			/* the fork failed */
+
+			/*
+			 * NOTE: if (ignore_disconnecting &&
+			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
+			 * then we'll deadlock if all other kthreads assigned
+			 * to this channel are blocked in the channel's
+			 * registerer, because the only thing that will unblock
+			 * them is the xpDisconnecting callout that this
+			 * failed kthread_run() would have made.
+			 */
+
+			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
+			    atomic_dec_return(&part->nchannels_engaged) == 0) {
+				indicate_partition_disengaged(part);
+			}
+			xpc_msgqueue_deref(ch);
+			xpc_part_deref(part);
+
+			if (atomic_read(&ch->kthreads_assigned) <
+			    ch->kthreads_idle_limit) {
+				/*
+				 * Flag this as an error only if we have an
+				 * insufficient #of kthreads for the channel
+				 * to function.
+				 */
+				spin_lock_irqsave(&ch->lock, irq_flags);
+				XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
+						       &irq_flags);
+				spin_unlock_irqrestore(&ch->lock, irq_flags);
+			}
+			break;
+		}
+	}
+}
+
+void
+xpc_disconnect_wait(int ch_number)
+{
+	unsigned long irq_flags;
+	short partid;
+	struct xpc_partition *part;
+	struct xpc_channel *ch;
+	int wakeup_channel_mgr;
+
+	/* now wait for all callouts to the caller's function to cease */
+	for (partid = 0; partid < xp_max_npartitions; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (!xpc_part_ref(part))
+			continue;
+
+		ch = &part->channels[ch_number];
+
+		if (!(ch->flags & XPC_C_WDISCONNECT)) {
+			xpc_part_deref(part);
+			continue;
+		}
+
+		wait_for_completion(&ch->wdisconnect_wait);
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+		wakeup_channel_mgr = 0;
+
+		if (ch->delayed_chctl_flags) {
+			if (part->act_state != XPC_P_AS_DEACTIVATING) {
+				spin_lock(&part->chctl_lock);
+				part->chctl.flags[ch->number] |=
+				    ch->delayed_chctl_flags;
+				spin_unlock(&part->chctl_lock);
+				wakeup_channel_mgr = 1;
+			}
+			ch->delayed_chctl_flags = 0;
+		}
+
+		ch->flags &= ~XPC_C_WDISCONNECT;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+		if (wakeup_channel_mgr)
+			xpc_wakeup_channel_mgr(part);
+
+		xpc_part_deref(part);
+	}
+}
+
+static int
+xpc_setup_partitions(void)
+{
+	short partid;
+	struct xpc_partition *part;
+
+	xpc_partitions = kcalloc(xp_max_npartitions,
+				 sizeof(struct xpc_partition),
+				 GFP_KERNEL);
+	if (xpc_partitions == NULL) {
+		dev_err(xpc_part, "can't get memory for partition structure\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * The first few fields of each entry of xpc_partitions[] need to
+	 * be initialized now so that calls to xpc_connect() and
+	 * xpc_disconnect() can be made prior to the activation of any remote
+	 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
+	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
+	 * PARTITION HAS BEEN ACTIVATED.
+	 */
+	for (partid = 0; partid < xp_max_npartitions; partid++) {
+		part = &xpc_partitions[partid];
+
+		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
+
+		part->activate_IRQ_rcvd = 0;
+		spin_lock_init(&part->act_lock);
+		part->act_state = XPC_P_AS_INACTIVE;
+		XPC_SET_REASON(part, 0, 0);
+
+		timer_setup(&part->disengage_timer,
+			    xpc_timeout_partition_disengage, 0);
+
+		part->setup_state = XPC_P_SS_UNSET;
+		init_waitqueue_head(&part->teardown_wq);
+		atomic_set(&part->references, 0);
+	}
+
+	return xpc_arch_ops.setup_partitions();
+}
+
+static void
+xpc_teardown_partitions(void)
+{
+	xpc_arch_ops.teardown_partitions();
+	kfree(xpc_partitions);
+}
+
+static void
+xpc_do_exit(enum xp_retval reason)
+{
+	short partid;
+	int active_part_count, printed_waiting_msg = 0;
+	struct xpc_partition *part;
+	unsigned long printmsg_time, disengage_timeout = 0;
+
+	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
+	DBUG_ON(xpc_exiting == 1);
+
+	/*
+	 * Let the heartbeat checker thread and the discovery thread
+	 * (if one is running) know that they should exit. Also wake up
+	 * the heartbeat checker thread in case it's sleeping.
+	 */
+	xpc_exiting = 1;
+	wake_up_interruptible(&xpc_activate_IRQ_wq);
+
+	/* wait for the discovery thread to exit */
+	wait_for_completion(&xpc_discovery_exited);
+
+	/* wait for the heartbeat checker thread to exit */
+	wait_for_completion(&xpc_hb_checker_exited);
+
+	/* sleep for a 1/3 of a second or so */
+	(void)msleep_interruptible(300);
+
+	/* wait for all partitions to become inactive */
+
+	printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
+	xpc_disengage_timedout = 0;
+
+	do {
+		active_part_count = 0;
+
+		for (partid = 0; partid < xp_max_npartitions; partid++) {
+			part = &xpc_partitions[partid];
+
+			if (xpc_partition_disengaged(part) &&
+			    part->act_state == XPC_P_AS_INACTIVE) {
+				continue;
+			}
+
+			active_part_count++;
+
+			XPC_DEACTIVATE_PARTITION(part, reason);
+
+			if (part->disengage_timeout > disengage_timeout)
+				disengage_timeout = part->disengage_timeout;
+		}
+
+		if (xpc_arch_ops.any_partition_engaged()) {
+			if (time_is_before_jiffies(printmsg_time)) {
+				dev_info(xpc_part, "waiting for remote "
+					 "partitions to deactivate, timeout in "
+					 "%ld seconds\n", (disengage_timeout -
+					 jiffies) / HZ);
+				printmsg_time = jiffies +
+				    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
+				printed_waiting_msg = 1;
+			}
+
+		} else if (active_part_count > 0) {
+			if (printed_waiting_msg) {
+				dev_info(xpc_part, "waiting for local partition"
+					 " to deactivate\n");
+				printed_waiting_msg = 0;
+			}
+
+		} else {
+			if (!xpc_disengage_timedout) {
+				dev_info(xpc_part, "all partitions have "
+					 "deactivated\n");
+			}
+			break;
+		}
+
+		/* sleep for a 1/3 of a second or so */
+		(void)msleep_interruptible(300);
+
+	} while (1);
+
+	DBUG_ON(xpc_arch_ops.any_partition_engaged());
+
+	xpc_teardown_rsvd_page();
+
+	if (reason == xpUnloading) {
+		(void)unregister_die_notifier(&xpc_die_notifier);
+		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
+	}
+
+	/* clear the interface to XPC's functions */
+	xpc_clear_interface();
+
+	if (xpc_sysctl)
+		unregister_sysctl_table(xpc_sysctl);
+
+	xpc_teardown_partitions();
+
+	if (is_shub())
+		xpc_exit_sn2();
+	else if (is_uv())
+		xpc_exit_uv();
+}
+
+/*
+ * This function is called when the system is being rebooted.
+ */
+static int
+xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
+{
+	enum xp_retval reason;
+
+	switch (event) {
+	case SYS_RESTART:
+		reason = xpSystemReboot;
+		break;
+	case SYS_HALT:
+		reason = xpSystemHalt;
+		break;
+	case SYS_POWER_OFF:
+		reason = xpSystemPoweroff;
+		break;
+	default:
+		reason = xpSystemGoingDown;
+	}
+
+	xpc_do_exit(reason);
+	return NOTIFY_DONE;
+}
+
+/* Used to only allow one cpu to complete disconnect */
+static unsigned int xpc_die_disconnecting;
+
+/*
+ * Notify other partitions to deactivate from us by first disengaging from all
+ * references to our memory.
+ */
+static void
+xpc_die_deactivate(void)
+{
+	struct xpc_partition *part;
+	short partid;
+	int any_engaged;
+	long keep_waiting;
+	long wait_to_print;
+
+	if (cmpxchg(&xpc_die_disconnecting, 0, 1))
+		return;
+
+	/* keep xpc_hb_checker thread from doing anything (just in case) */
+	xpc_exiting = 1;
+
+	xpc_arch_ops.disallow_all_hbs();   /*indicate we're deactivated */
+
+	for (partid = 0; partid < xp_max_npartitions; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (xpc_arch_ops.partition_engaged(partid) ||
+		    part->act_state != XPC_P_AS_INACTIVE) {
+			xpc_arch_ops.request_partition_deactivation(part);
+			xpc_arch_ops.indicate_partition_disengaged(part);
+		}
+	}
+
+	/*
+	 * Though we requested that all other partitions deactivate from us,
+	 * we only wait until they've all disengaged or we've reached the
+	 * defined timelimit.
+	 *
+	 * Given that one iteration through the following while-loop takes
+	 * approximately 200 microseconds, calculate the #of loops to take
+	 * before bailing and the #of loops before printing a waiting message.
+	 */
+	keep_waiting = xpc_disengage_timelimit * 1000 * 5;
+	wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
+
+	while (1) {
+		any_engaged = xpc_arch_ops.any_partition_engaged();
+		if (!any_engaged) {
+			dev_info(xpc_part, "all partitions have deactivated\n");
+			break;
+		}
+
+		if (!keep_waiting--) {
+			for (partid = 0; partid < xp_max_npartitions;
+			     partid++) {
+				if (xpc_arch_ops.partition_engaged(partid)) {
+					dev_info(xpc_part, "deactivate from "
+						 "remote partition %d timed "
+						 "out\n", partid);
+				}
+			}
+			break;
+		}
+
+		if (!wait_to_print--) {
+			dev_info(xpc_part, "waiting for remote partitions to "
+				 "deactivate, timeout in %ld seconds\n",
+				 keep_waiting / (1000 * 5));
+			wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
+			    1000 * 5;
+		}
+
+		udelay(200);
+	}
+}
+
+/*
+ * This function is called when the system is being restarted or halted due
+ * to some sort of system failure. If this is the case we need to notify the
+ * other partitions to disengage from all references to our memory.
+ * This function can also be called when our heartbeater could be offlined
+ * for a time. In this case we need to notify other partitions to not worry
+ * about the lack of a heartbeat.
+ */
+static int
+xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
+{
+#ifdef CONFIG_IA64		/* !!! temporary kludge */
+	switch (event) {
+	case DIE_MACHINE_RESTART:
+	case DIE_MACHINE_HALT:
+		xpc_die_deactivate();
+		break;
+
+	case DIE_KDEBUG_ENTER:
+		/* Should lack of heartbeat be ignored by other partitions? */
+		if (!xpc_kdebug_ignore)
+			break;
+
+		/* fall through */
+	case DIE_MCA_MONARCH_ENTER:
+	case DIE_INIT_MONARCH_ENTER:
+		xpc_arch_ops.offline_heartbeat();
+		break;
+
+	case DIE_KDEBUG_LEAVE:
+		/* Is lack of heartbeat being ignored by other partitions? */
+		if (!xpc_kdebug_ignore)
+			break;
+
+		/* fall through */
+	case DIE_MCA_MONARCH_LEAVE:
+	case DIE_INIT_MONARCH_LEAVE:
+		xpc_arch_ops.online_heartbeat();
+		break;
+	}
+#else
+	struct die_args *die_args = _die_args;
+
+	switch (event) {
+	case DIE_TRAP:
+		if (die_args->trapnr == X86_TRAP_DF)
+			xpc_die_deactivate();
+
+		if (((die_args->trapnr == X86_TRAP_MF) ||
+		     (die_args->trapnr == X86_TRAP_XF)) &&
+		    !user_mode(die_args->regs))
+			xpc_die_deactivate();
+
+		break;
+	case DIE_INT3:
+	case DIE_DEBUG:
+		break;
+	case DIE_OOPS:
+	case DIE_GPF:
+	default:
+		xpc_die_deactivate();
+	}
+#endif
+
+	return NOTIFY_DONE;
+}
+
+int __init
+xpc_init(void)
+{
+	int ret;
+	struct task_struct *kthread;
+
+	dev_set_name(xpc_part, "part");
+	dev_set_name(xpc_chan, "chan");
+
+	if (is_shub()) {
+		/*
+		 * The ia64-sn2 architecture supports at most 64 partitions.
+		 * And the inability to unregister remote amos restricts us
+		 * further to only support exactly 64 partitions on this
+		 * architecture, no less.
+		 */
+		if (xp_max_npartitions != 64) {
+			dev_err(xpc_part, "max #of partitions not set to 64\n");
+			ret = -EINVAL;
+		} else {
+			ret = xpc_init_sn2();
+		}
+
+	} else if (is_uv()) {
+		ret = xpc_init_uv();
+
+	} else {
+		ret = -ENODEV;
+	}
+
+	if (ret != 0)
+		return ret;
+
+	ret = xpc_setup_partitions();
+	if (ret != 0) {
+		dev_err(xpc_part, "can't get memory for partition structure\n");
+		goto out_1;
+	}
+
+	xpc_sysctl = register_sysctl_table(xpc_sys_dir);
+
+	/*
+	 * Fill the partition reserved page with the information needed by
+	 * other partitions to discover we are alive and establish initial
+	 * communications.
+	 */
+	ret = xpc_setup_rsvd_page();
+	if (ret != 0) {
+		dev_err(xpc_part, "can't setup our reserved page\n");
+		goto out_2;
+	}
+
+	/* add ourselves to the reboot_notifier_list */
+	ret = register_reboot_notifier(&xpc_reboot_notifier);
+	if (ret != 0)
+		dev_warn(xpc_part, "can't register reboot notifier\n");
+
+	/* add ourselves to the die_notifier list */
+	ret = register_die_notifier(&xpc_die_notifier);
+	if (ret != 0)
+		dev_warn(xpc_part, "can't register die notifier\n");
+
+	/*
+	 * The real work-horse behind xpc.  This processes incoming
+	 * interrupts and monitors remote heartbeats.
+	 */
+	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
+	if (IS_ERR(kthread)) {
+		dev_err(xpc_part, "failed while forking hb check thread\n");
+		ret = -EBUSY;
+		goto out_3;
+	}
+
+	/*
+	 * Startup a thread that will attempt to discover other partitions to
+	 * activate based on info provided by SAL. This new thread is short
+	 * lived and will exit once discovery is complete.
+	 */
+	kthread = kthread_run(xpc_initiate_discovery, NULL,
+			      XPC_DISCOVERY_THREAD_NAME);
+	if (IS_ERR(kthread)) {
+		dev_err(xpc_part, "failed while forking discovery thread\n");
+
+		/* mark this new thread as a non-starter */
+		complete(&xpc_discovery_exited);
+
+		xpc_do_exit(xpUnloading);
+		return -EBUSY;
+	}
+
+	/* set the interface to point at XPC's functions */
+	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
+			  xpc_initiate_send, xpc_initiate_send_notify,
+			  xpc_initiate_received, xpc_initiate_partid_to_nasids);
+
+	return 0;
+
+	/* initialization was not successful */
+out_3:
+	xpc_teardown_rsvd_page();
+
+	(void)unregister_die_notifier(&xpc_die_notifier);
+	(void)unregister_reboot_notifier(&xpc_reboot_notifier);
+out_2:
+	if (xpc_sysctl)
+		unregister_sysctl_table(xpc_sysctl);
+
+	xpc_teardown_partitions();
+out_1:
+	if (is_shub())
+		xpc_exit_sn2();
+	else if (is_uv())
+		xpc_exit_uv();
+	return ret;
+}
+
+module_init(xpc_init);
+
+void __exit
+xpc_exit(void)
+{
+	xpc_do_exit(xpUnloading);
+}
+
+module_exit(xpc_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
+MODULE_LICENSE("GPL");
+
+module_param(xpc_hb_interval, int, 0);
+MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
+		 "heartbeat increments.");
+
+module_param(xpc_hb_check_interval, int, 0);
+MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
+		 "heartbeat checks.");
+
+module_param(xpc_disengage_timelimit, int, 0);
+MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
+		 "for disengage to complete.");
+
+module_param(xpc_kdebug_ignore, int, 0);
+MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
+		 "other partitions when dropping into kdebug.");
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
new file mode 100644
index 000000000..519826ba1
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -0,0 +1,540 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) partition support.
+ *
+ *	This is the part of XPC that detects the presence/absence of
+ *	other partitions. It provides a heartbeat and monitors the
+ *	heartbeats of other partitions.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/hardirq.h>
+#include <linux/slab.h>
+#include "xpc.h"
+#include <asm/uv/uv_hub.h>
+
+/* XPC is exiting flag */
+int xpc_exiting;
+
+/* this partition's reserved page pointers */
+struct xpc_rsvd_page *xpc_rsvd_page;
+static unsigned long *xpc_part_nasids;
+unsigned long *xpc_mach_nasids;
+
+static int xpc_nasid_mask_nbytes;	/* #of bytes in nasid mask */
+int xpc_nasid_mask_nlongs;	/* #of longs in nasid mask */
+
+struct xpc_partition *xpc_partitions;
+
+/*
+ * Guarantee that the kmalloc'd memory is cacheline aligned.
+ */
+void *
+xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+	/* see if kmalloc will give us cachline aligned memory by default */
+	*base = kmalloc(size, flags);
+	if (*base == NULL)
+		return NULL;
+
+	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
+		return *base;
+
+	kfree(*base);
+
+	/* nope, we'll have to do it ourselves */
+	*base = kmalloc(size + L1_CACHE_BYTES, flags);
+	if (*base == NULL)
+		return NULL;
+
+	return (void *)L1_CACHE_ALIGN((u64)*base);
+}
+
+/*
+ * Given a nasid, get the physical address of the  partition's reserved page
+ * for that nasid. This function returns 0 on any error.
+ */
+static unsigned long
+xpc_get_rsvd_page_pa(int nasid)
+{
+	enum xp_retval ret;
+	u64 cookie = 0;
+	unsigned long rp_pa = nasid;	/* seed with nasid */
+	size_t len = 0;
+	size_t buf_len = 0;
+	void *buf = NULL;
+	void *buf_base = NULL;
+	enum xp_retval (*get_partition_rsvd_page_pa)
+		(void *, u64 *, unsigned long *, size_t *) =
+		xpc_arch_ops.get_partition_rsvd_page_pa;
+
+	while (1) {
+
+		/* !!! rp_pa will need to be _gpa on UV.
+		 * ??? So do we save it into the architecture specific parts
+		 * ??? of the xpc_partition structure? Do we rename this
+		 * ??? function or have two versions? Rename rp_pa for UV to
+		 * ??? rp_gpa?
+		 */
+		ret = get_partition_rsvd_page_pa(buf, &cookie, &rp_pa, &len);
+
+		dev_dbg(xpc_part, "SAL returned with ret=%d, cookie=0x%016lx, "
+			"address=0x%016lx, len=0x%016lx\n", ret,
+			(unsigned long)cookie, rp_pa, len);
+
+		if (ret != xpNeedMoreInfo)
+			break;
+
+		/* !!! L1_CACHE_ALIGN() is only a sn2-bte_copy requirement */
+		if (is_shub())
+			len = L1_CACHE_ALIGN(len);
+
+		if (len > buf_len) {
+			if (buf_base != NULL)
+				kfree(buf_base);
+			buf_len = L1_CACHE_ALIGN(len);
+			buf = xpc_kmalloc_cacheline_aligned(buf_len, GFP_KERNEL,
+							    &buf_base);
+			if (buf_base == NULL) {
+				dev_err(xpc_part, "unable to kmalloc "
+					"len=0x%016lx\n", buf_len);
+				ret = xpNoMemory;
+				break;
+			}
+		}
+
+		ret = xp_remote_memcpy(xp_pa(buf), rp_pa, len);
+		if (ret != xpSuccess) {
+			dev_dbg(xpc_part, "xp_remote_memcpy failed %d\n", ret);
+			break;
+		}
+	}
+
+	kfree(buf_base);
+
+	if (ret != xpSuccess)
+		rp_pa = 0;
+
+	dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
+	return rp_pa;
+}
+
+/*
+ * Fill the partition reserved page with the information needed by
+ * other partitions to discover we are alive and establish initial
+ * communications.
+ */
+int
+xpc_setup_rsvd_page(void)
+{
+	int ret;
+	struct xpc_rsvd_page *rp;
+	unsigned long rp_pa;
+	unsigned long new_ts_jiffies;
+
+	/* get the local reserved page's address */
+
+	preempt_disable();
+	rp_pa = xpc_get_rsvd_page_pa(xp_cpu_to_nasid(smp_processor_id()));
+	preempt_enable();
+	if (rp_pa == 0) {
+		dev_err(xpc_part, "SAL failed to locate the reserved page\n");
+		return -ESRCH;
+	}
+	rp = (struct xpc_rsvd_page *)__va(xp_socket_pa(rp_pa));
+
+	if (rp->SAL_version < 3) {
+		/* SAL_versions < 3 had a SAL_partid defined as a u8 */
+		rp->SAL_partid &= 0xff;
+	}
+	BUG_ON(rp->SAL_partid != xp_partition_id);
+
+	if (rp->SAL_partid < 0 || rp->SAL_partid >= xp_max_npartitions) {
+		dev_err(xpc_part, "the reserved page's partid of %d is outside "
+			"supported range (< 0 || >= %d)\n", rp->SAL_partid,
+			xp_max_npartitions);
+		return -EINVAL;
+	}
+
+	rp->version = XPC_RP_VERSION;
+	rp->max_npartitions = xp_max_npartitions;
+
+	/* establish the actual sizes of the nasid masks */
+	if (rp->SAL_version == 1) {
+		/* SAL_version 1 didn't set the nasids_size field */
+		rp->SAL_nasids_size = 128;
+	}
+	xpc_nasid_mask_nbytes = rp->SAL_nasids_size;
+	xpc_nasid_mask_nlongs = BITS_TO_LONGS(rp->SAL_nasids_size *
+					      BITS_PER_BYTE);
+
+	/* setup the pointers to the various items in the reserved page */
+	xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
+	xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
+
+	ret = xpc_arch_ops.setup_rsvd_page(rp);
+	if (ret != 0)
+		return ret;
+
+	/*
+	 * Set timestamp of when reserved page was setup by XPC.
+	 * This signifies to the remote partition that our reserved
+	 * page is initialized.
+	 */
+	new_ts_jiffies = jiffies;
+	if (new_ts_jiffies == 0 || new_ts_jiffies == rp->ts_jiffies)
+		new_ts_jiffies++;
+	rp->ts_jiffies = new_ts_jiffies;
+
+	xpc_rsvd_page = rp;
+	return 0;
+}
+
+void
+xpc_teardown_rsvd_page(void)
+{
+	/* a zero timestamp indicates our rsvd page is not initialized */
+	xpc_rsvd_page->ts_jiffies = 0;
+}
+
+/*
+ * Get a copy of a portion of the remote partition's rsvd page.
+ *
+ * remote_rp points to a buffer that is cacheline aligned for BTE copies and
+ * is large enough to contain a copy of their reserved page header and
+ * part_nasids mask.
+ */
+enum xp_retval
+xpc_get_remote_rp(int nasid, unsigned long *discovered_nasids,
+		  struct xpc_rsvd_page *remote_rp, unsigned long *remote_rp_pa)
+{
+	int l;
+	enum xp_retval ret;
+
+	/* get the reserved page's physical address */
+
+	*remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
+	if (*remote_rp_pa == 0)
+		return xpNoRsvdPageAddr;
+
+	/* pull over the reserved page header and part_nasids mask */
+	ret = xp_remote_memcpy(xp_pa(remote_rp), *remote_rp_pa,
+			       XPC_RP_HEADER_SIZE + xpc_nasid_mask_nbytes);
+	if (ret != xpSuccess)
+		return ret;
+
+	if (discovered_nasids != NULL) {
+		unsigned long *remote_part_nasids =
+		    XPC_RP_PART_NASIDS(remote_rp);
+
+		for (l = 0; l < xpc_nasid_mask_nlongs; l++)
+			discovered_nasids[l] |= remote_part_nasids[l];
+	}
+
+	/* zero timestamp indicates the reserved page has not been setup */
+	if (remote_rp->ts_jiffies == 0)
+		return xpRsvdPageNotSet;
+
+	if (XPC_VERSION_MAJOR(remote_rp->version) !=
+	    XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
+		return xpBadVersion;
+	}
+
+	/* check that both remote and local partids are valid for each side */
+	if (remote_rp->SAL_partid < 0 ||
+	    remote_rp->SAL_partid >= xp_max_npartitions ||
+	    remote_rp->max_npartitions <= xp_partition_id) {
+		return xpInvalidPartid;
+	}
+
+	if (remote_rp->SAL_partid == xp_partition_id)
+		return xpLocalPartid;
+
+	return xpSuccess;
+}
+
+/*
+ * See if the other side has responded to a partition deactivate request
+ * from us. Though we requested the remote partition to deactivate with regard
+ * to us, we really only need to wait for the other side to disengage from us.
+ */
+int
+xpc_partition_disengaged(struct xpc_partition *part)
+{
+	short partid = XPC_PARTID(part);
+	int disengaged;
+
+	disengaged = !xpc_arch_ops.partition_engaged(partid);
+	if (part->disengage_timeout) {
+		if (!disengaged) {
+			if (time_is_after_jiffies(part->disengage_timeout)) {
+				/* timelimit hasn't been reached yet */
+				return 0;
+			}
+
+			/*
+			 * Other side hasn't responded to our deactivate
+			 * request in a timely fashion, so assume it's dead.
+			 */
+
+			dev_info(xpc_part, "deactivate request to remote "
+				 "partition %d timed out\n", partid);
+			xpc_disengage_timedout = 1;
+			xpc_arch_ops.assume_partition_disengaged(partid);
+			disengaged = 1;
+		}
+		part->disengage_timeout = 0;
+
+		/* cancel the timer function, provided it's not us */
+		if (!in_interrupt())
+			del_singleshot_timer_sync(&part->disengage_timer);
+
+		DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING &&
+			part->act_state != XPC_P_AS_INACTIVE);
+		if (part->act_state != XPC_P_AS_INACTIVE)
+			xpc_wakeup_channel_mgr(part);
+
+		xpc_arch_ops.cancel_partition_deactivation_request(part);
+	}
+	return disengaged;
+}
+
+/*
+ * Mark specified partition as active.
+ */
+enum xp_retval
+xpc_mark_partition_active(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	enum xp_retval ret;
+
+	dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+	if (part->act_state == XPC_P_AS_ACTIVATING) {
+		part->act_state = XPC_P_AS_ACTIVE;
+		ret = xpSuccess;
+	} else {
+		DBUG_ON(part->reason == xpSuccess);
+		ret = part->reason;
+	}
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	return ret;
+}
+
+/*
+ * Start the process of deactivating the specified partition.
+ */
+void
+xpc_deactivate_partition(const int line, struct xpc_partition *part,
+			 enum xp_retval reason)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+
+	if (part->act_state == XPC_P_AS_INACTIVE) {
+		XPC_SET_REASON(part, reason, line);
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		if (reason == xpReactivating) {
+			/* we interrupt ourselves to reactivate partition */
+			xpc_arch_ops.request_partition_reactivation(part);
+		}
+		return;
+	}
+	if (part->act_state == XPC_P_AS_DEACTIVATING) {
+		if ((part->reason == xpUnloading && reason != xpUnloading) ||
+		    reason == xpReactivating) {
+			XPC_SET_REASON(part, reason, line);
+		}
+		spin_unlock_irqrestore(&part->act_lock, irq_flags);
+		return;
+	}
+
+	part->act_state = XPC_P_AS_DEACTIVATING;
+	XPC_SET_REASON(part, reason, line);
+
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+	/* ask remote partition to deactivate with regard to us */
+	xpc_arch_ops.request_partition_deactivation(part);
+
+	/* set a timelimit on the disengage phase of the deactivation request */
+	part->disengage_timeout = jiffies + (xpc_disengage_timelimit * HZ);
+	part->disengage_timer.expires = part->disengage_timeout;
+	add_timer(&part->disengage_timer);
+
+	dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
+		XPC_PARTID(part), reason);
+
+	xpc_partition_going_down(part, reason);
+}
+
+/*
+ * Mark specified partition as inactive.
+ */
+void
+xpc_mark_partition_inactive(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+
+	dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
+		XPC_PARTID(part));
+
+	spin_lock_irqsave(&part->act_lock, irq_flags);
+	part->act_state = XPC_P_AS_INACTIVE;
+	spin_unlock_irqrestore(&part->act_lock, irq_flags);
+	part->remote_rp_pa = 0;
+}
+
+/*
+ * SAL has provided a partition and machine mask.  The partition mask
+ * contains a bit for each even nasid in our partition.  The machine
+ * mask contains a bit for each even nasid in the entire machine.
+ *
+ * Using those two bit arrays, we can determine which nasids are
+ * known in the machine.  Each should also have a reserved page
+ * initialized if they are available for partitioning.
+ */
+void
+xpc_discovery(void)
+{
+	void *remote_rp_base;
+	struct xpc_rsvd_page *remote_rp;
+	unsigned long remote_rp_pa;
+	int region;
+	int region_size;
+	int max_regions;
+	int nasid;
+	unsigned long *discovered_nasids;
+	enum xp_retval ret;
+
+	remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
+						  xpc_nasid_mask_nbytes,
+						  GFP_KERNEL, &remote_rp_base);
+	if (remote_rp == NULL)
+		return;
+
+	discovered_nasids = kcalloc(xpc_nasid_mask_nlongs, sizeof(long),
+				    GFP_KERNEL);
+	if (discovered_nasids == NULL) {
+		kfree(remote_rp_base);
+		return;
+	}
+
+	/*
+	 * The term 'region' in this context refers to the minimum number of
+	 * nodes that can comprise an access protection grouping. The access
+	 * protection is in regards to memory, IOI and IPI.
+	 */
+	region_size = xp_region_size;
+
+	if (is_uv())
+		max_regions = 256;
+	else {
+		max_regions = 64;
+
+		switch (region_size) {
+		case 128:
+			max_regions *= 2;
+			/* fall through */
+		case 64:
+			max_regions *= 2;
+			/* fall through */
+		case 32:
+			max_regions *= 2;
+			region_size = 16;
+			DBUG_ON(!is_shub2());
+		}
+	}
+
+	for (region = 0; region < max_regions; region++) {
+
+		if (xpc_exiting)
+			break;
+
+		dev_dbg(xpc_part, "searching region %d\n", region);
+
+		for (nasid = (region * region_size * 2);
+		     nasid < ((region + 1) * region_size * 2); nasid += 2) {
+
+			if (xpc_exiting)
+				break;
+
+			dev_dbg(xpc_part, "checking nasid %d\n", nasid);
+
+			if (test_bit(nasid / 2, xpc_part_nasids)) {
+				dev_dbg(xpc_part, "PROM indicates Nasid %d is "
+					"part of the local partition; skipping "
+					"region\n", nasid);
+				break;
+			}
+
+			if (!(test_bit(nasid / 2, xpc_mach_nasids))) {
+				dev_dbg(xpc_part, "PROM indicates Nasid %d was "
+					"not on Numa-Link network at reset\n",
+					nasid);
+				continue;
+			}
+
+			if (test_bit(nasid / 2, discovered_nasids)) {
+				dev_dbg(xpc_part, "Nasid %d is part of a "
+					"partition which was previously "
+					"discovered\n", nasid);
+				continue;
+			}
+
+			/* pull over the rsvd page header & part_nasids mask */
+
+			ret = xpc_get_remote_rp(nasid, discovered_nasids,
+						remote_rp, &remote_rp_pa);
+			if (ret != xpSuccess) {
+				dev_dbg(xpc_part, "unable to get reserved page "
+					"from nasid %d, reason=%d\n", nasid,
+					ret);
+
+				if (ret == xpLocalPartid)
+					break;
+
+				continue;
+			}
+
+			xpc_arch_ops.request_partition_activation(remote_rp,
+							 remote_rp_pa, nasid);
+		}
+	}
+
+	kfree(discovered_nasids);
+	kfree(remote_rp_base);
+}
+
+/*
+ * Given a partid, get the nasids owned by that partition from the
+ * remote partition's reserved page.
+ */
+enum xp_retval
+xpc_initiate_partid_to_nasids(short partid, void *nasid_mask)
+{
+	struct xpc_partition *part;
+	unsigned long part_nasid_pa;
+
+	part = &xpc_partitions[partid];
+	if (part->remote_rp_pa == 0)
+		return xpPartitionDown;
+
+	memset(nasid_mask, 0, xpc_nasid_mask_nbytes);
+
+	part_nasid_pa = (unsigned long)XPC_RP_PART_NASIDS(part->remote_rp_pa);
+
+	return xp_remote_memcpy(xp_pa(nasid_mask), part_nasid_pa,
+				xpc_nasid_mask_nbytes);
+}
diff --git a/drivers/misc/sgi-xp/xpc_sn2.c b/drivers/misc/sgi-xp/xpc_sn2.c
new file mode 100644
index 000000000..5a12d2a54
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_sn2.c
@@ -0,0 +1,2459 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) sn2-based functions.
+ *
+ *     Architecture specific implementation of common functions.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <asm/uncached.h>
+#include <asm/sn/mspec.h>
+#include <asm/sn/sn_sal.h>
+#include "xpc.h"
+
+/*
+ * Define the number of u64s required to represent all the C-brick nasids
+ * as a bitmap.  The cross-partition kernel modules deal only with
+ * C-brick nasids, thus the need for bitmaps which don't account for
+ * odd-numbered (non C-brick) nasids.
+ */
+#define XPC_MAX_PHYSNODES_SN2	(MAX_NUMALINK_NODES / 2)
+#define XP_NASID_MASK_BYTES_SN2	((XPC_MAX_PHYSNODES_SN2 + 7) / 8)
+#define XP_NASID_MASK_WORDS_SN2	((XPC_MAX_PHYSNODES_SN2 + 63) / 64)
+
+/*
+ * Memory for XPC's amo variables is allocated by the MSPEC driver. These
+ * pages are located in the lowest granule. The lowest granule uses 4k pages
+ * for cached references and an alternate TLB handler to never provide a
+ * cacheable mapping for the entire region. This will prevent speculative
+ * reading of cached copies of our lines from being issued which will cause
+ * a PI FSB Protocol error to be generated by the SHUB. For XPC, we need 64
+ * amo variables (based on XP_MAX_NPARTITIONS_SN2) to identify the senders of
+ * NOTIFY IRQs, 128 amo variables (based on XP_NASID_MASK_WORDS_SN2) to identify
+ * the senders of ACTIVATE IRQs, 1 amo variable to identify which remote
+ * partitions (i.e., XPCs) consider themselves currently engaged with the
+ * local XPC and 1 amo variable to request partition deactivation.
+ */
+#define XPC_NOTIFY_IRQ_AMOS_SN2		0
+#define XPC_ACTIVATE_IRQ_AMOS_SN2	(XPC_NOTIFY_IRQ_AMOS_SN2 + \
+					 XP_MAX_NPARTITIONS_SN2)
+#define XPC_ENGAGED_PARTITIONS_AMO_SN2	(XPC_ACTIVATE_IRQ_AMOS_SN2 + \
+					 XP_NASID_MASK_WORDS_SN2)
+#define XPC_DEACTIVATE_REQUEST_AMO_SN2	(XPC_ENGAGED_PARTITIONS_AMO_SN2 + 1)
+
+/*
+ * Buffer used to store a local copy of portions of a remote partition's
+ * reserved page (either its header and part_nasids mask, or its vars).
+ */
+static void *xpc_remote_copy_buffer_base_sn2;
+static char *xpc_remote_copy_buffer_sn2;
+
+static struct xpc_vars_sn2 *xpc_vars_sn2;
+static struct xpc_vars_part_sn2 *xpc_vars_part_sn2;
+
+static int
+xpc_setup_partitions_sn2(void)
+{
+	/* nothing needs to be done */
+	return 0;
+}
+
+static void
+xpc_teardown_partitions_sn2(void)
+{
+	/* nothing needs to be done */
+}
+
+/* SH_IPI_ACCESS shub register value on startup */
+static u64 xpc_sh1_IPI_access_sn2;
+static u64 xpc_sh2_IPI_access0_sn2;
+static u64 xpc_sh2_IPI_access1_sn2;
+static u64 xpc_sh2_IPI_access2_sn2;
+static u64 xpc_sh2_IPI_access3_sn2;
+
+/*
+ * Change protections to allow IPI operations.
+ */
+static void
+xpc_allow_IPI_ops_sn2(void)
+{
+	int node;
+	int nasid;
+
+	/* !!! The following should get moved into SAL. */
+	if (is_shub2()) {
+		xpc_sh2_IPI_access0_sn2 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS0));
+		xpc_sh2_IPI_access1_sn2 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS1));
+		xpc_sh2_IPI_access2_sn2 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS2));
+		xpc_sh2_IPI_access3_sn2 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH2_IPI_ACCESS3));
+
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
+			      -1UL);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
+			      -1UL);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
+			      -1UL);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
+			      -1UL);
+		}
+	} else {
+		xpc_sh1_IPI_access_sn2 =
+		    (u64)HUB_L((u64 *)LOCAL_MMR_ADDR(SH1_IPI_ACCESS));
+
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
+			      -1UL);
+		}
+	}
+}
+
+/*
+ * Restrict protections to disallow IPI operations.
+ */
+static void
+xpc_disallow_IPI_ops_sn2(void)
+{
+	int node;
+	int nasid;
+
+	/* !!! The following should get moved into SAL. */
+	if (is_shub2()) {
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS0),
+			      xpc_sh2_IPI_access0_sn2);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS1),
+			      xpc_sh2_IPI_access1_sn2);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS2),
+			      xpc_sh2_IPI_access2_sn2);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH2_IPI_ACCESS3),
+			      xpc_sh2_IPI_access3_sn2);
+		}
+	} else {
+		for_each_online_node(node) {
+			nasid = cnodeid_to_nasid(node);
+			HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid, SH1_IPI_ACCESS),
+			      xpc_sh1_IPI_access_sn2);
+		}
+	}
+}
+
+/*
+ * The following set of functions are used for the sending and receiving of
+ * IRQs (also known as IPIs). There are two flavors of IRQs, one that is
+ * associated with partition activity (SGI_XPC_ACTIVATE) and the other that
+ * is associated with channel activity (SGI_XPC_NOTIFY).
+ */
+
+static u64
+xpc_receive_IRQ_amo_sn2(struct amo *amo)
+{
+	return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_CLEAR);
+}
+
+static enum xp_retval
+xpc_send_IRQ_sn2(struct amo *amo, u64 flag, int nasid, int phys_cpuid,
+		 int vector)
+{
+	int ret = 0;
+	unsigned long irq_flags;
+
+	local_irq_save(irq_flags);
+
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR, flag);
+	sn_send_IPI_phys(nasid, phys_cpuid, vector, 0);
+
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IRQs and amos to it until the heartbeat times out.
+	 */
+	ret = xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+
+	return (ret == 0) ? xpSuccess : xpPioReadError;
+}
+
+static struct amo *
+xpc_init_IRQ_amo_sn2(int index)
+{
+	struct amo *amo = xpc_vars_sn2->amos_page + index;
+
+	(void)xpc_receive_IRQ_amo_sn2(amo);	/* clear amo variable */
+	return amo;
+}
+
+/*
+ * Functions associated with SGI_XPC_ACTIVATE IRQ.
+ */
+
+/*
+ * Notify the heartbeat check thread that an activate IRQ has been received.
+ */
+static irqreturn_t
+xpc_handle_activate_IRQ_sn2(int irq, void *dev_id)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+	xpc_activate_IRQ_rcvd++;
+	spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+	wake_up_interruptible(&xpc_activate_IRQ_wq);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Flag the appropriate amo variable and send an IRQ to the specified node.
+ */
+static void
+xpc_send_activate_IRQ_sn2(unsigned long amos_page_pa, int from_nasid,
+			  int to_nasid, int to_phys_cpuid)
+{
+	struct amo *amos = (struct amo *)__va(amos_page_pa +
+					      (XPC_ACTIVATE_IRQ_AMOS_SN2 *
+					      sizeof(struct amo)));
+
+	(void)xpc_send_IRQ_sn2(&amos[BIT_WORD(from_nasid / 2)],
+			       BIT_MASK(from_nasid / 2), to_nasid,
+			       to_phys_cpuid, SGI_XPC_ACTIVATE);
+}
+
+static void
+xpc_send_local_activate_IRQ_sn2(int from_nasid)
+{
+	unsigned long irq_flags;
+	struct amo *amos = (struct amo *)__va(xpc_vars_sn2->amos_page_pa +
+					      (XPC_ACTIVATE_IRQ_AMOS_SN2 *
+					      sizeof(struct amo)));
+
+	/* fake the sending and receipt of an activate IRQ from remote nasid */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amos[BIT_WORD(from_nasid / 2)].variable),
+			 FETCHOP_OR, BIT_MASK(from_nasid / 2));
+
+	spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+	xpc_activate_IRQ_rcvd++;
+	spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+	wake_up_interruptible(&xpc_activate_IRQ_wq);
+}
+
+/*
+ * Functions associated with SGI_XPC_NOTIFY IRQ.
+ */
+
+/*
+ * Check to see if any chctl flags were sent from the specified partition.
+ */
+static void
+xpc_check_for_sent_chctl_flags_sn2(struct xpc_partition *part)
+{
+	union xpc_channel_ctl_flags chctl;
+	unsigned long irq_flags;
+
+	chctl.all_flags = xpc_receive_IRQ_amo_sn2(part->sn.sn2.
+						  local_chctl_amo_va);
+	if (chctl.all_flags == 0)
+		return;
+
+	spin_lock_irqsave(&part->chctl_lock, irq_flags);
+	part->chctl.all_flags |= chctl.all_flags;
+	spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+	dev_dbg(xpc_chan, "received notify IRQ from partid=%d, chctl.all_flags="
+		"0x%llx\n", XPC_PARTID(part), chctl.all_flags);
+
+	xpc_wakeup_channel_mgr(part);
+}
+
+/*
+ * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
+ * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
+ * than one partition, we use an amo structure per partition to indicate
+ * whether a partition has sent an IRQ or not.  If it has, then wake up the
+ * associated kthread to handle it.
+ *
+ * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IRQs sent by XPC
+ * running on other partitions.
+ *
+ * Noteworthy Arguments:
+ *
+ *	irq - Interrupt ReQuest number. NOT USED.
+ *
+ *	dev_id - partid of IRQ's potential sender.
+ */
+static irqreturn_t
+xpc_handle_notify_IRQ_sn2(int irq, void *dev_id)
+{
+	short partid = (short)(u64)dev_id;
+	struct xpc_partition *part = &xpc_partitions[partid];
+
+	DBUG_ON(partid < 0 || partid >= XP_MAX_NPARTITIONS_SN2);
+
+	if (xpc_part_ref(part)) {
+		xpc_check_for_sent_chctl_flags_sn2(part);
+
+		xpc_part_deref(part);
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * Check to see if xpc_handle_notify_IRQ_sn2() dropped any IRQs on the floor
+ * because the write to their associated amo variable completed after the IRQ
+ * was received.
+ */
+static void
+xpc_check_for_dropped_notify_IRQ_sn2(struct timer_list *t)
+{
+	struct xpc_partition *part =
+		from_timer(part, t, sn.sn2.dropped_notify_IRQ_timer);
+
+	if (xpc_part_ref(part)) {
+		xpc_check_for_sent_chctl_flags_sn2(part);
+
+		t->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL;
+		add_timer(t);
+		xpc_part_deref(part);
+	}
+}
+
+/*
+ * Send a notify IRQ to the remote partition that is associated with the
+ * specified channel.
+ */
+static void
+xpc_send_notify_IRQ_sn2(struct xpc_channel *ch, u8 chctl_flag,
+			char *chctl_flag_string, unsigned long *irq_flags)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	union xpc_channel_ctl_flags chctl = { 0 };
+	enum xp_retval ret;
+
+	if (likely(part->act_state != XPC_P_AS_DEACTIVATING)) {
+		chctl.flags[ch->number] = chctl_flag;
+		ret = xpc_send_IRQ_sn2(part_sn2->remote_chctl_amo_va,
+				       chctl.all_flags,
+				       part_sn2->notify_IRQ_nasid,
+				       part_sn2->notify_IRQ_phys_cpuid,
+				       SGI_XPC_NOTIFY);
+		dev_dbg(xpc_chan, "%s sent to partid=%d, channel=%d, ret=%d\n",
+			chctl_flag_string, ch->partid, ch->number, ret);
+		if (unlikely(ret != xpSuccess)) {
+			if (irq_flags != NULL)
+				spin_unlock_irqrestore(&ch->lock, *irq_flags);
+			XPC_DEACTIVATE_PARTITION(part, ret);
+			if (irq_flags != NULL)
+				spin_lock_irqsave(&ch->lock, *irq_flags);
+		}
+	}
+}
+
+#define XPC_SEND_NOTIFY_IRQ_SN2(_ch, _ipi_f, _irq_f) \
+		xpc_send_notify_IRQ_sn2(_ch, _ipi_f, #_ipi_f, _irq_f)
+
+/*
+ * Make it look like the remote partition, which is associated with the
+ * specified channel, sent us a notify IRQ. This faked IRQ will be handled
+ * by xpc_check_for_dropped_notify_IRQ_sn2().
+ */
+static void
+xpc_send_local_notify_IRQ_sn2(struct xpc_channel *ch, u8 chctl_flag,
+			      char *chctl_flag_string)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	union xpc_channel_ctl_flags chctl = { 0 };
+
+	chctl.flags[ch->number] = chctl_flag;
+	FETCHOP_STORE_OP(TO_AMO((u64)&part->sn.sn2.local_chctl_amo_va->
+				variable), FETCHOP_OR, chctl.all_flags);
+	dev_dbg(xpc_chan, "%s sent local from partid=%d, channel=%d\n",
+		chctl_flag_string, ch->partid, ch->number);
+}
+
+#define XPC_SEND_LOCAL_NOTIFY_IRQ_SN2(_ch, _ipi_f) \
+		xpc_send_local_notify_IRQ_sn2(_ch, _ipi_f, #_ipi_f)
+
+static void
+xpc_send_chctl_closerequest_sn2(struct xpc_channel *ch,
+				unsigned long *irq_flags)
+{
+	struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args;
+
+	args->reason = ch->reason;
+	XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_CLOSEREQUEST, irq_flags);
+}
+
+static void
+xpc_send_chctl_closereply_sn2(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_CLOSEREPLY, irq_flags);
+}
+
+static void
+xpc_send_chctl_openrequest_sn2(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args;
+
+	args->entry_size = ch->entry_size;
+	args->local_nentries = ch->local_nentries;
+	XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENREQUEST, irq_flags);
+}
+
+static void
+xpc_send_chctl_openreply_sn2(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_openclose_args *args = ch->sn.sn2.local_openclose_args;
+
+	args->remote_nentries = ch->remote_nentries;
+	args->local_nentries = ch->local_nentries;
+	args->local_msgqueue_pa = xp_pa(ch->sn.sn2.local_msgqueue);
+	XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENREPLY, irq_flags);
+}
+
+static void
+xpc_send_chctl_opencomplete_sn2(struct xpc_channel *ch,
+				unsigned long *irq_flags)
+{
+	XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_OPENCOMPLETE, irq_flags);
+}
+
+static void
+xpc_send_chctl_msgrequest_sn2(struct xpc_channel *ch)
+{
+	XPC_SEND_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_MSGREQUEST, NULL);
+}
+
+static void
+xpc_send_chctl_local_msgrequest_sn2(struct xpc_channel *ch)
+{
+	XPC_SEND_LOCAL_NOTIFY_IRQ_SN2(ch, XPC_CHCTL_MSGREQUEST);
+}
+
+static enum xp_retval
+xpc_save_remote_msgqueue_pa_sn2(struct xpc_channel *ch,
+				unsigned long msgqueue_pa)
+{
+	ch->sn.sn2.remote_msgqueue_pa = msgqueue_pa;
+	return xpSuccess;
+}
+
+/*
+ * This next set of functions are used to keep track of when a partition is
+ * potentially engaged in accessing memory belonging to another partition.
+ */
+
+static void
+xpc_indicate_partition_engaged_sn2(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	struct amo *amo = (struct amo *)__va(part->sn.sn2.remote_amos_page_pa +
+					     (XPC_ENGAGED_PARTITIONS_AMO_SN2 *
+					     sizeof(struct amo)));
+
+	local_irq_save(irq_flags);
+
+	/* set bit corresponding to our partid in remote partition's amo */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
+			 BIT(sn_partition_id));
+
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IRQs and amos to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+}
+
+static void
+xpc_indicate_partition_disengaged_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	unsigned long irq_flags;
+	struct amo *amo = (struct amo *)__va(part_sn2->remote_amos_page_pa +
+					     (XPC_ENGAGED_PARTITIONS_AMO_SN2 *
+					     sizeof(struct amo)));
+
+	local_irq_save(irq_flags);
+
+	/* clear bit corresponding to our partid in remote partition's amo */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~BIT(sn_partition_id));
+
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IRQs and amos to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+
+	/*
+	 * Send activate IRQ to get other side to see that we've cleared our
+	 * bit in their engaged partitions amo.
+	 */
+	xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa,
+				  cnodeid_to_nasid(0),
+				  part_sn2->activate_IRQ_nasid,
+				  part_sn2->activate_IRQ_phys_cpuid);
+}
+
+static void
+xpc_assume_partition_disengaged_sn2(short partid)
+{
+	struct amo *amo = xpc_vars_sn2->amos_page +
+			  XPC_ENGAGED_PARTITIONS_AMO_SN2;
+
+	/* clear bit(s) based on partid mask in our partition's amo */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~BIT(partid));
+}
+
+static int
+xpc_partition_engaged_sn2(short partid)
+{
+	struct amo *amo = xpc_vars_sn2->amos_page +
+			  XPC_ENGAGED_PARTITIONS_AMO_SN2;
+
+	/* our partition's amo variable ANDed with partid mask */
+	return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
+		BIT(partid)) != 0;
+}
+
+static int
+xpc_any_partition_engaged_sn2(void)
+{
+	struct amo *amo = xpc_vars_sn2->amos_page +
+			  XPC_ENGAGED_PARTITIONS_AMO_SN2;
+
+	/* our partition's amo variable */
+	return FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) != 0;
+}
+
+/* original protection values for each node */
+static u64 xpc_prot_vec_sn2[MAX_NUMNODES];
+
+/*
+ * Change protections to allow amo operations on non-Shub 1.1 systems.
+ */
+static enum xp_retval
+xpc_allow_amo_ops_sn2(struct amo *amos_page)
+{
+	enum xp_retval ret = xpSuccess;
+
+	/*
+	 * On SHUB 1.1, we cannot call sn_change_memprotect() since the BIST
+	 * collides with memory operations. On those systems we call
+	 * xpc_allow_amo_ops_shub_wars_1_1_sn2() instead.
+	 */
+	if (!enable_shub_wars_1_1())
+		ret = xp_expand_memprotect(ia64_tpa((u64)amos_page), PAGE_SIZE);
+
+	return ret;
+}
+
+/*
+ * Change protections to allow amo operations on Shub 1.1 systems.
+ */
+static void
+xpc_allow_amo_ops_shub_wars_1_1_sn2(void)
+{
+	int node;
+	int nasid;
+
+	if (!enable_shub_wars_1_1())
+		return;
+
+	for_each_online_node(node) {
+		nasid = cnodeid_to_nasid(node);
+		/* save current protection values */
+		xpc_prot_vec_sn2[node] =
+		    (u64)HUB_L((u64 *)GLOBAL_MMR_ADDR(nasid,
+						  SH1_MD_DQLP_MMR_DIR_PRIVEC0));
+		/* open up everything */
+		HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
+					     SH1_MD_DQLP_MMR_DIR_PRIVEC0),
+		      -1UL);
+		HUB_S((u64 *)GLOBAL_MMR_ADDR(nasid,
+					     SH1_MD_DQRP_MMR_DIR_PRIVEC0),
+		      -1UL);
+	}
+}
+
+static enum xp_retval
+xpc_get_partition_rsvd_page_pa_sn2(void *buf, u64 *cookie, unsigned long *rp_pa,
+				   size_t *len)
+{
+	s64 status;
+	enum xp_retval ret;
+
+	status = sn_partition_reserved_page_pa((u64)buf, cookie,
+			(u64 *)rp_pa, (u64 *)len);
+	if (status == SALRET_OK)
+		ret = xpSuccess;
+	else if (status == SALRET_MORE_PASSES)
+		ret = xpNeedMoreInfo;
+	else
+		ret = xpSalError;
+
+	return ret;
+}
+
+
+static int
+xpc_setup_rsvd_page_sn2(struct xpc_rsvd_page *rp)
+{
+	struct amo *amos_page;
+	int i;
+	int ret;
+
+	xpc_vars_sn2 = XPC_RP_VARS(rp);
+
+	rp->sn.sn2.vars_pa = xp_pa(xpc_vars_sn2);
+
+	/* vars_part array follows immediately after vars */
+	xpc_vars_part_sn2 = (struct xpc_vars_part_sn2 *)((u8 *)XPC_RP_VARS(rp) +
+							 XPC_RP_VARS_SIZE);
+
+	/*
+	 * Before clearing xpc_vars_sn2, see if a page of amos had been
+	 * previously allocated. If not we'll need to allocate one and set
+	 * permissions so that cross-partition amos are allowed.
+	 *
+	 * The allocated amo page needs MCA reporting to remain disabled after
+	 * XPC has unloaded.  To make this work, we keep a copy of the pointer
+	 * to this page (i.e., amos_page) in the struct xpc_vars_sn2 structure,
+	 * which is pointed to by the reserved page, and re-use that saved copy
+	 * on subsequent loads of XPC. This amo page is never freed, and its
+	 * memory protections are never restricted.
+	 */
+	amos_page = xpc_vars_sn2->amos_page;
+	if (amos_page == NULL) {
+		amos_page = (struct amo *)TO_AMO(uncached_alloc_page(0, 1));
+		if (amos_page == NULL) {
+			dev_err(xpc_part, "can't allocate page of amos\n");
+			return -ENOMEM;
+		}
+
+		/*
+		 * Open up amo-R/W to cpu.  This is done on Shub 1.1 systems
+		 * when xpc_allow_amo_ops_shub_wars_1_1_sn2() is called.
+		 */
+		ret = xpc_allow_amo_ops_sn2(amos_page);
+		if (ret != xpSuccess) {
+			dev_err(xpc_part, "can't allow amo operations\n");
+			uncached_free_page(__IA64_UNCACHED_OFFSET |
+					   TO_PHYS((u64)amos_page), 1);
+			return -EPERM;
+		}
+	}
+
+	/* clear xpc_vars_sn2 */
+	memset(xpc_vars_sn2, 0, sizeof(struct xpc_vars_sn2));
+
+	xpc_vars_sn2->version = XPC_V_VERSION;
+	xpc_vars_sn2->activate_IRQ_nasid = cpuid_to_nasid(0);
+	xpc_vars_sn2->activate_IRQ_phys_cpuid = cpu_physical_id(0);
+	xpc_vars_sn2->vars_part_pa = xp_pa(xpc_vars_part_sn2);
+	xpc_vars_sn2->amos_page_pa = ia64_tpa((u64)amos_page);
+	xpc_vars_sn2->amos_page = amos_page;	/* save for next load of XPC */
+
+	/* clear xpc_vars_part_sn2 */
+	memset((u64 *)xpc_vars_part_sn2, 0, sizeof(struct xpc_vars_part_sn2) *
+	       XP_MAX_NPARTITIONS_SN2);
+
+	/* initialize the activate IRQ related amo variables */
+	for (i = 0; i < xpc_nasid_mask_nlongs; i++)
+		(void)xpc_init_IRQ_amo_sn2(XPC_ACTIVATE_IRQ_AMOS_SN2 + i);
+
+	/* initialize the engaged remote partitions related amo variables */
+	(void)xpc_init_IRQ_amo_sn2(XPC_ENGAGED_PARTITIONS_AMO_SN2);
+	(void)xpc_init_IRQ_amo_sn2(XPC_DEACTIVATE_REQUEST_AMO_SN2);
+
+	return 0;
+}
+
+static int
+xpc_hb_allowed_sn2(short partid, void *heartbeating_to_mask)
+{
+	return test_bit(partid, heartbeating_to_mask);
+}
+
+static void
+xpc_allow_hb_sn2(short partid)
+{
+	DBUG_ON(xpc_vars_sn2 == NULL);
+	set_bit(partid, xpc_vars_sn2->heartbeating_to_mask);
+}
+
+static void
+xpc_disallow_hb_sn2(short partid)
+{
+	DBUG_ON(xpc_vars_sn2 == NULL);
+	clear_bit(partid, xpc_vars_sn2->heartbeating_to_mask);
+}
+
+static void
+xpc_disallow_all_hbs_sn2(void)
+{
+	DBUG_ON(xpc_vars_sn2 == NULL);
+	bitmap_zero(xpc_vars_sn2->heartbeating_to_mask, xp_max_npartitions);
+}
+
+static void
+xpc_increment_heartbeat_sn2(void)
+{
+	xpc_vars_sn2->heartbeat++;
+}
+
+static void
+xpc_offline_heartbeat_sn2(void)
+{
+	xpc_increment_heartbeat_sn2();
+	xpc_vars_sn2->heartbeat_offline = 1;
+}
+
+static void
+xpc_online_heartbeat_sn2(void)
+{
+	xpc_increment_heartbeat_sn2();
+	xpc_vars_sn2->heartbeat_offline = 0;
+}
+
+static void
+xpc_heartbeat_init_sn2(void)
+{
+	DBUG_ON(xpc_vars_sn2 == NULL);
+
+	bitmap_zero(xpc_vars_sn2->heartbeating_to_mask, XP_MAX_NPARTITIONS_SN2);
+	xpc_online_heartbeat_sn2();
+}
+
+static void
+xpc_heartbeat_exit_sn2(void)
+{
+	xpc_offline_heartbeat_sn2();
+}
+
+static enum xp_retval
+xpc_get_remote_heartbeat_sn2(struct xpc_partition *part)
+{
+	struct xpc_vars_sn2 *remote_vars;
+	enum xp_retval ret;
+
+	remote_vars = (struct xpc_vars_sn2 *)xpc_remote_copy_buffer_sn2;
+
+	/* pull the remote vars structure that contains the heartbeat */
+	ret = xp_remote_memcpy(xp_pa(remote_vars),
+			       part->sn.sn2.remote_vars_pa,
+			       XPC_RP_VARS_SIZE);
+	if (ret != xpSuccess)
+		return ret;
+
+	dev_dbg(xpc_part, "partid=%d, heartbeat=%lld, last_heartbeat=%lld, "
+		"heartbeat_offline=%lld, HB_mask[0]=0x%lx\n", XPC_PARTID(part),
+		remote_vars->heartbeat, part->last_heartbeat,
+		remote_vars->heartbeat_offline,
+		remote_vars->heartbeating_to_mask[0]);
+
+	if ((remote_vars->heartbeat == part->last_heartbeat &&
+	    !remote_vars->heartbeat_offline) ||
+	    !xpc_hb_allowed_sn2(sn_partition_id,
+				remote_vars->heartbeating_to_mask)) {
+		ret = xpNoHeartbeat;
+	} else {
+		part->last_heartbeat = remote_vars->heartbeat;
+	}
+
+	return ret;
+}
+
+/*
+ * Get a copy of the remote partition's XPC variables from the reserved page.
+ *
+ * remote_vars points to a buffer that is cacheline aligned for BTE copies and
+ * assumed to be of size XPC_RP_VARS_SIZE.
+ */
+static enum xp_retval
+xpc_get_remote_vars_sn2(unsigned long remote_vars_pa,
+			struct xpc_vars_sn2 *remote_vars)
+{
+	enum xp_retval ret;
+
+	if (remote_vars_pa == 0)
+		return xpVarsNotSet;
+
+	/* pull over the cross partition variables */
+	ret = xp_remote_memcpy(xp_pa(remote_vars), remote_vars_pa,
+			       XPC_RP_VARS_SIZE);
+	if (ret != xpSuccess)
+		return ret;
+
+	if (XPC_VERSION_MAJOR(remote_vars->version) !=
+	    XPC_VERSION_MAJOR(XPC_V_VERSION)) {
+		return xpBadVersion;
+	}
+
+	return xpSuccess;
+}
+
+static void
+xpc_request_partition_activation_sn2(struct xpc_rsvd_page *remote_rp,
+				     unsigned long remote_rp_pa, int nasid)
+{
+	xpc_send_local_activate_IRQ_sn2(nasid);
+}
+
+static void
+xpc_request_partition_reactivation_sn2(struct xpc_partition *part)
+{
+	xpc_send_local_activate_IRQ_sn2(part->sn.sn2.activate_IRQ_nasid);
+}
+
+static void
+xpc_request_partition_deactivation_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	unsigned long irq_flags;
+	struct amo *amo = (struct amo *)__va(part_sn2->remote_amos_page_pa +
+					     (XPC_DEACTIVATE_REQUEST_AMO_SN2 *
+					     sizeof(struct amo)));
+
+	local_irq_save(irq_flags);
+
+	/* set bit corresponding to our partid in remote partition's amo */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_OR,
+			 BIT(sn_partition_id));
+
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IRQs and amos to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+
+	/*
+	 * Send activate IRQ to get other side to see that we've set our
+	 * bit in their deactivate request amo.
+	 */
+	xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa,
+				  cnodeid_to_nasid(0),
+				  part_sn2->activate_IRQ_nasid,
+				  part_sn2->activate_IRQ_phys_cpuid);
+}
+
+static void
+xpc_cancel_partition_deactivation_request_sn2(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	struct amo *amo = (struct amo *)__va(part->sn.sn2.remote_amos_page_pa +
+					     (XPC_DEACTIVATE_REQUEST_AMO_SN2 *
+					     sizeof(struct amo)));
+
+	local_irq_save(irq_flags);
+
+	/* clear bit corresponding to our partid in remote partition's amo */
+	FETCHOP_STORE_OP(TO_AMO((u64)&amo->variable), FETCHOP_AND,
+			 ~BIT(sn_partition_id));
+
+	/*
+	 * We must always use the nofault function regardless of whether we
+	 * are on a Shub 1.1 system or a Shub 1.2 slice 0xc processor. If we
+	 * didn't, we'd never know that the other partition is down and would
+	 * keep sending IRQs and amos to it until the heartbeat times out.
+	 */
+	(void)xp_nofault_PIOR((u64 *)GLOBAL_MMR_ADDR(NASID_GET(&amo->
+							       variable),
+						     xp_nofault_PIOR_target));
+
+	local_irq_restore(irq_flags);
+}
+
+static int
+xpc_partition_deactivation_requested_sn2(short partid)
+{
+	struct amo *amo = xpc_vars_sn2->amos_page +
+			  XPC_DEACTIVATE_REQUEST_AMO_SN2;
+
+	/* our partition's amo variable ANDed with partid mask */
+	return (FETCHOP_LOAD_OP(TO_AMO((u64)&amo->variable), FETCHOP_LOAD) &
+		BIT(partid)) != 0;
+}
+
+/*
+ * Update the remote partition's info.
+ */
+static void
+xpc_update_partition_info_sn2(struct xpc_partition *part, u8 remote_rp_version,
+			      unsigned long *remote_rp_ts_jiffies,
+			      unsigned long remote_rp_pa,
+			      unsigned long remote_vars_pa,
+			      struct xpc_vars_sn2 *remote_vars)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+
+	part->remote_rp_version = remote_rp_version;
+	dev_dbg(xpc_part, "  remote_rp_version = 0x%016x\n",
+		part->remote_rp_version);
+
+	part->remote_rp_ts_jiffies = *remote_rp_ts_jiffies;
+	dev_dbg(xpc_part, "  remote_rp_ts_jiffies = 0x%016lx\n",
+		part->remote_rp_ts_jiffies);
+
+	part->remote_rp_pa = remote_rp_pa;
+	dev_dbg(xpc_part, "  remote_rp_pa = 0x%016lx\n", part->remote_rp_pa);
+
+	part_sn2->remote_vars_pa = remote_vars_pa;
+	dev_dbg(xpc_part, "  remote_vars_pa = 0x%016lx\n",
+		part_sn2->remote_vars_pa);
+
+	part->last_heartbeat = remote_vars->heartbeat - 1;
+	dev_dbg(xpc_part, "  last_heartbeat = 0x%016llx\n",
+		part->last_heartbeat);
+
+	part_sn2->remote_vars_part_pa = remote_vars->vars_part_pa;
+	dev_dbg(xpc_part, "  remote_vars_part_pa = 0x%016lx\n",
+		part_sn2->remote_vars_part_pa);
+
+	part_sn2->activate_IRQ_nasid = remote_vars->activate_IRQ_nasid;
+	dev_dbg(xpc_part, "  activate_IRQ_nasid = 0x%x\n",
+		part_sn2->activate_IRQ_nasid);
+
+	part_sn2->activate_IRQ_phys_cpuid =
+	    remote_vars->activate_IRQ_phys_cpuid;
+	dev_dbg(xpc_part, "  activate_IRQ_phys_cpuid = 0x%x\n",
+		part_sn2->activate_IRQ_phys_cpuid);
+
+	part_sn2->remote_amos_page_pa = remote_vars->amos_page_pa;
+	dev_dbg(xpc_part, "  remote_amos_page_pa = 0x%lx\n",
+		part_sn2->remote_amos_page_pa);
+
+	part_sn2->remote_vars_version = remote_vars->version;
+	dev_dbg(xpc_part, "  remote_vars_version = 0x%x\n",
+		part_sn2->remote_vars_version);
+}
+
+/*
+ * Prior code has determined the nasid which generated a activate IRQ.
+ * Inspect that nasid to determine if its partition needs to be activated
+ * or deactivated.
+ *
+ * A partition is considered "awaiting activation" if our partition
+ * flags indicate it is not active and it has a heartbeat.  A
+ * partition is considered "awaiting deactivation" if our partition
+ * flags indicate it is active but it has no heartbeat or it is not
+ * sending its heartbeat to us.
+ *
+ * To determine the heartbeat, the remote nasid must have a properly
+ * initialized reserved page.
+ */
+static void
+xpc_identify_activate_IRQ_req_sn2(int nasid)
+{
+	struct xpc_rsvd_page *remote_rp;
+	struct xpc_vars_sn2 *remote_vars;
+	unsigned long remote_rp_pa;
+	unsigned long remote_vars_pa;
+	int remote_rp_version;
+	int reactivate = 0;
+	unsigned long remote_rp_ts_jiffies = 0;
+	short partid;
+	struct xpc_partition *part;
+	struct xpc_partition_sn2 *part_sn2;
+	enum xp_retval ret;
+
+	/* pull over the reserved page structure */
+
+	remote_rp = (struct xpc_rsvd_page *)xpc_remote_copy_buffer_sn2;
+
+	ret = xpc_get_remote_rp(nasid, NULL, remote_rp, &remote_rp_pa);
+	if (ret != xpSuccess) {
+		dev_warn(xpc_part, "unable to get reserved page from nasid %d, "
+			 "which sent interrupt, reason=%d\n", nasid, ret);
+		return;
+	}
+
+	remote_vars_pa = remote_rp->sn.sn2.vars_pa;
+	remote_rp_version = remote_rp->version;
+	remote_rp_ts_jiffies = remote_rp->ts_jiffies;
+
+	partid = remote_rp->SAL_partid;
+	part = &xpc_partitions[partid];
+	part_sn2 = &part->sn.sn2;
+
+	/* pull over the cross partition variables */
+
+	remote_vars = (struct xpc_vars_sn2 *)xpc_remote_copy_buffer_sn2;
+
+	ret = xpc_get_remote_vars_sn2(remote_vars_pa, remote_vars);
+	if (ret != xpSuccess) {
+		dev_warn(xpc_part, "unable to get XPC variables from nasid %d, "
+			 "which sent interrupt, reason=%d\n", nasid, ret);
+
+		XPC_DEACTIVATE_PARTITION(part, ret);
+		return;
+	}
+
+	part->activate_IRQ_rcvd++;
+
+	dev_dbg(xpc_part, "partid for nasid %d is %d; IRQs = %d; HB = "
+		"%lld:0x%lx\n", (int)nasid, (int)partid,
+		part->activate_IRQ_rcvd,
+		remote_vars->heartbeat, remote_vars->heartbeating_to_mask[0]);
+
+	if (xpc_partition_disengaged(part) &&
+	    part->act_state == XPC_P_AS_INACTIVE) {
+
+		xpc_update_partition_info_sn2(part, remote_rp_version,
+					      &remote_rp_ts_jiffies,
+					      remote_rp_pa, remote_vars_pa,
+					      remote_vars);
+
+		if (xpc_partition_deactivation_requested_sn2(partid)) {
+			/*
+			 * Other side is waiting on us to deactivate even though
+			 * we already have.
+			 */
+			return;
+		}
+
+		xpc_activate_partition(part);
+		return;
+	}
+
+	DBUG_ON(part->remote_rp_version == 0);
+	DBUG_ON(part_sn2->remote_vars_version == 0);
+
+	if (remote_rp_ts_jiffies != part->remote_rp_ts_jiffies) {
+
+		/* the other side rebooted */
+
+		DBUG_ON(xpc_partition_engaged_sn2(partid));
+		DBUG_ON(xpc_partition_deactivation_requested_sn2(partid));
+
+		xpc_update_partition_info_sn2(part, remote_rp_version,
+					      &remote_rp_ts_jiffies,
+					      remote_rp_pa, remote_vars_pa,
+					      remote_vars);
+		reactivate = 1;
+	}
+
+	if (part->disengage_timeout > 0 && !xpc_partition_disengaged(part)) {
+		/* still waiting on other side to disengage from us */
+		return;
+	}
+
+	if (reactivate)
+		XPC_DEACTIVATE_PARTITION(part, xpReactivating);
+	else if (xpc_partition_deactivation_requested_sn2(partid))
+		XPC_DEACTIVATE_PARTITION(part, xpOtherGoingDown);
+}
+
+/*
+ * Loop through the activation amo variables and process any bits
+ * which are set.  Each bit indicates a nasid sending a partition
+ * activation or deactivation request.
+ *
+ * Return #of IRQs detected.
+ */
+int
+xpc_identify_activate_IRQ_sender_sn2(void)
+{
+	int l;
+	int b;
+	unsigned long nasid_mask_long;
+	u64 nasid;		/* remote nasid */
+	int n_IRQs_detected = 0;
+	struct amo *act_amos;
+
+	act_amos = xpc_vars_sn2->amos_page + XPC_ACTIVATE_IRQ_AMOS_SN2;
+
+	/* scan through activate amo variables looking for non-zero entries */
+	for (l = 0; l < xpc_nasid_mask_nlongs; l++) {
+
+		if (xpc_exiting)
+			break;
+
+		nasid_mask_long = xpc_receive_IRQ_amo_sn2(&act_amos[l]);
+
+		b = find_first_bit(&nasid_mask_long, BITS_PER_LONG);
+		if (b >= BITS_PER_LONG) {
+			/* no IRQs from nasids in this amo variable */
+			continue;
+		}
+
+		dev_dbg(xpc_part, "amo[%d] gave back 0x%lx\n", l,
+			nasid_mask_long);
+
+		/*
+		 * If this nasid has been added to the machine since
+		 * our partition was reset, this will retain the
+		 * remote nasid in our reserved pages machine mask.
+		 * This is used in the event of module reload.
+		 */
+		xpc_mach_nasids[l] |= nasid_mask_long;
+
+		/* locate the nasid(s) which sent interrupts */
+
+		do {
+			n_IRQs_detected++;
+			nasid = (l * BITS_PER_LONG + b) * 2;
+			dev_dbg(xpc_part, "interrupt from nasid %lld\n", nasid);
+			xpc_identify_activate_IRQ_req_sn2(nasid);
+
+			b = find_next_bit(&nasid_mask_long, BITS_PER_LONG,
+					  b + 1);
+		} while (b < BITS_PER_LONG);
+	}
+	return n_IRQs_detected;
+}
+
+static void
+xpc_process_activate_IRQ_rcvd_sn2(void)
+{
+	unsigned long irq_flags;
+	int n_IRQs_expected;
+	int n_IRQs_detected;
+
+	spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+	n_IRQs_expected = xpc_activate_IRQ_rcvd;
+	xpc_activate_IRQ_rcvd = 0;
+	spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+	n_IRQs_detected = xpc_identify_activate_IRQ_sender_sn2();
+	if (n_IRQs_detected < n_IRQs_expected) {
+		/* retry once to help avoid missing amo */
+		(void)xpc_identify_activate_IRQ_sender_sn2();
+	}
+}
+
+/*
+ * Setup the channel structures that are sn2 specific.
+ */
+static enum xp_retval
+xpc_setup_ch_structures_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	struct xpc_channel_sn2 *ch_sn2;
+	enum xp_retval retval;
+	int ret;
+	int cpuid;
+	int ch_number;
+	struct timer_list *timer;
+	short partid = XPC_PARTID(part);
+
+	/* allocate all the required GET/PUT values */
+
+	part_sn2->local_GPs =
+	    xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL,
+					  &part_sn2->local_GPs_base);
+	if (part_sn2->local_GPs == NULL) {
+		dev_err(xpc_chan, "can't get memory for local get/put "
+			"values\n");
+		return xpNoMemory;
+	}
+
+	part_sn2->remote_GPs =
+	    xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL,
+					  &part_sn2->remote_GPs_base);
+	if (part_sn2->remote_GPs == NULL) {
+		dev_err(xpc_chan, "can't get memory for remote get/put "
+			"values\n");
+		retval = xpNoMemory;
+		goto out_1;
+	}
+
+	part_sn2->remote_GPs_pa = 0;
+
+	/* allocate all the required open and close args */
+
+	part_sn2->local_openclose_args =
+	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
+					  GFP_KERNEL, &part_sn2->
+					  local_openclose_args_base);
+	if (part_sn2->local_openclose_args == NULL) {
+		dev_err(xpc_chan, "can't get memory for local connect args\n");
+		retval = xpNoMemory;
+		goto out_2;
+	}
+
+	part_sn2->remote_openclose_args_pa = 0;
+
+	part_sn2->local_chctl_amo_va = xpc_init_IRQ_amo_sn2(partid);
+
+	part_sn2->notify_IRQ_nasid = 0;
+	part_sn2->notify_IRQ_phys_cpuid = 0;
+	part_sn2->remote_chctl_amo_va = NULL;
+
+	sprintf(part_sn2->notify_IRQ_owner, "xpc%02d", partid);
+	ret = request_irq(SGI_XPC_NOTIFY, xpc_handle_notify_IRQ_sn2,
+			  IRQF_SHARED, part_sn2->notify_IRQ_owner,
+			  (void *)(u64)partid);
+	if (ret != 0) {
+		dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
+			"errno=%d\n", -ret);
+		retval = xpLackOfResources;
+		goto out_3;
+	}
+
+	/* Setup a timer to check for dropped notify IRQs */
+	timer = &part_sn2->dropped_notify_IRQ_timer;
+	timer_setup(timer, xpc_check_for_dropped_notify_IRQ_sn2, 0);
+	timer->expires = jiffies + XPC_DROPPED_NOTIFY_IRQ_WAIT_INTERVAL;
+	add_timer(timer);
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch_sn2 = &part->channels[ch_number].sn.sn2;
+
+		ch_sn2->local_GP = &part_sn2->local_GPs[ch_number];
+		ch_sn2->local_openclose_args =
+		    &part_sn2->local_openclose_args[ch_number];
+
+		mutex_init(&ch_sn2->msg_to_pull_mutex);
+	}
+
+	/*
+	 * Setup the per partition specific variables required by the
+	 * remote partition to establish channel connections with us.
+	 *
+	 * The setting of the magic # indicates that these per partition
+	 * specific variables are ready to be used.
+	 */
+	xpc_vars_part_sn2[partid].GPs_pa = xp_pa(part_sn2->local_GPs);
+	xpc_vars_part_sn2[partid].openclose_args_pa =
+	    xp_pa(part_sn2->local_openclose_args);
+	xpc_vars_part_sn2[partid].chctl_amo_pa =
+	    xp_pa(part_sn2->local_chctl_amo_va);
+	cpuid = raw_smp_processor_id();	/* any CPU in this partition will do */
+	xpc_vars_part_sn2[partid].notify_IRQ_nasid = cpuid_to_nasid(cpuid);
+	xpc_vars_part_sn2[partid].notify_IRQ_phys_cpuid =
+	    cpu_physical_id(cpuid);
+	xpc_vars_part_sn2[partid].nchannels = part->nchannels;
+	xpc_vars_part_sn2[partid].magic = XPC_VP_MAGIC1_SN2;
+
+	return xpSuccess;
+
+	/* setup of ch structures failed */
+out_3:
+	kfree(part_sn2->local_openclose_args_base);
+	part_sn2->local_openclose_args = NULL;
+out_2:
+	kfree(part_sn2->remote_GPs_base);
+	part_sn2->remote_GPs = NULL;
+out_1:
+	kfree(part_sn2->local_GPs_base);
+	part_sn2->local_GPs = NULL;
+	return retval;
+}
+
+/*
+ * Teardown the channel structures that are sn2 specific.
+ */
+static void
+xpc_teardown_ch_structures_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	short partid = XPC_PARTID(part);
+
+	/*
+	 * Indicate that the variables specific to the remote partition are no
+	 * longer available for its use.
+	 */
+	xpc_vars_part_sn2[partid].magic = 0;
+
+	/* in case we've still got outstanding timers registered... */
+	del_timer_sync(&part_sn2->dropped_notify_IRQ_timer);
+	free_irq(SGI_XPC_NOTIFY, (void *)(u64)partid);
+
+	kfree(part_sn2->local_openclose_args_base);
+	part_sn2->local_openclose_args = NULL;
+	kfree(part_sn2->remote_GPs_base);
+	part_sn2->remote_GPs = NULL;
+	kfree(part_sn2->local_GPs_base);
+	part_sn2->local_GPs = NULL;
+	part_sn2->local_chctl_amo_va = NULL;
+}
+
+/*
+ * Create a wrapper that hides the underlying mechanism for pulling a cacheline
+ * (or multiple cachelines) from a remote partition.
+ *
+ * src_pa must be a cacheline aligned physical address on the remote partition.
+ * dst must be a cacheline aligned virtual address on this partition.
+ * cnt must be cacheline sized
+ */
+/* ??? Replace this function by call to xp_remote_memcpy() or bte_copy()? */
+static enum xp_retval
+xpc_pull_remote_cachelines_sn2(struct xpc_partition *part, void *dst,
+			       const unsigned long src_pa, size_t cnt)
+{
+	enum xp_retval ret;
+
+	DBUG_ON(src_pa != L1_CACHE_ALIGN(src_pa));
+	DBUG_ON((unsigned long)dst != L1_CACHE_ALIGN((unsigned long)dst));
+	DBUG_ON(cnt != L1_CACHE_ALIGN(cnt));
+
+	if (part->act_state == XPC_P_AS_DEACTIVATING)
+		return part->reason;
+
+	ret = xp_remote_memcpy(xp_pa(dst), src_pa, cnt);
+	if (ret != xpSuccess) {
+		dev_dbg(xpc_chan, "xp_remote_memcpy() from partition %d failed,"
+			" ret=%d\n", XPC_PARTID(part), ret);
+	}
+	return ret;
+}
+
+/*
+ * Pull the remote per partition specific variables from the specified
+ * partition.
+ */
+static enum xp_retval
+xpc_pull_remote_vars_part_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	u8 buffer[L1_CACHE_BYTES * 2];
+	struct xpc_vars_part_sn2 *pulled_entry_cacheline =
+	    (struct xpc_vars_part_sn2 *)L1_CACHE_ALIGN((u64)buffer);
+	struct xpc_vars_part_sn2 *pulled_entry;
+	unsigned long remote_entry_cacheline_pa;
+	unsigned long remote_entry_pa;
+	short partid = XPC_PARTID(part);
+	enum xp_retval ret;
+
+	/* pull the cacheline that contains the variables we're interested in */
+
+	DBUG_ON(part_sn2->remote_vars_part_pa !=
+		L1_CACHE_ALIGN(part_sn2->remote_vars_part_pa));
+	DBUG_ON(sizeof(struct xpc_vars_part_sn2) != L1_CACHE_BYTES / 2);
+
+	remote_entry_pa = part_sn2->remote_vars_part_pa +
+	    sn_partition_id * sizeof(struct xpc_vars_part_sn2);
+
+	remote_entry_cacheline_pa = (remote_entry_pa & ~(L1_CACHE_BYTES - 1));
+
+	pulled_entry = (struct xpc_vars_part_sn2 *)((u64)pulled_entry_cacheline
+						    + (remote_entry_pa &
+						    (L1_CACHE_BYTES - 1)));
+
+	ret = xpc_pull_remote_cachelines_sn2(part, pulled_entry_cacheline,
+					     remote_entry_cacheline_pa,
+					     L1_CACHE_BYTES);
+	if (ret != xpSuccess) {
+		dev_dbg(xpc_chan, "failed to pull XPC vars_part from "
+			"partition %d, ret=%d\n", partid, ret);
+		return ret;
+	}
+
+	/* see if they've been set up yet */
+
+	if (pulled_entry->magic != XPC_VP_MAGIC1_SN2 &&
+	    pulled_entry->magic != XPC_VP_MAGIC2_SN2) {
+
+		if (pulled_entry->magic != 0) {
+			dev_dbg(xpc_chan, "partition %d's XPC vars_part for "
+				"partition %d has bad magic value (=0x%llx)\n",
+				partid, sn_partition_id, pulled_entry->magic);
+			return xpBadMagic;
+		}
+
+		/* they've not been initialized yet */
+		return xpRetry;
+	}
+
+	if (xpc_vars_part_sn2[partid].magic == XPC_VP_MAGIC1_SN2) {
+
+		/* validate the variables */
+
+		if (pulled_entry->GPs_pa == 0 ||
+		    pulled_entry->openclose_args_pa == 0 ||
+		    pulled_entry->chctl_amo_pa == 0) {
+
+			dev_err(xpc_chan, "partition %d's XPC vars_part for "
+				"partition %d are not valid\n", partid,
+				sn_partition_id);
+			return xpInvalidAddress;
+		}
+
+		/* the variables we imported look to be valid */
+
+		part_sn2->remote_GPs_pa = pulled_entry->GPs_pa;
+		part_sn2->remote_openclose_args_pa =
+		    pulled_entry->openclose_args_pa;
+		part_sn2->remote_chctl_amo_va =
+		    (struct amo *)__va(pulled_entry->chctl_amo_pa);
+		part_sn2->notify_IRQ_nasid = pulled_entry->notify_IRQ_nasid;
+		part_sn2->notify_IRQ_phys_cpuid =
+		    pulled_entry->notify_IRQ_phys_cpuid;
+
+		if (part->nchannels > pulled_entry->nchannels)
+			part->nchannels = pulled_entry->nchannels;
+
+		/* let the other side know that we've pulled their variables */
+
+		xpc_vars_part_sn2[partid].magic = XPC_VP_MAGIC2_SN2;
+	}
+
+	if (pulled_entry->magic == XPC_VP_MAGIC1_SN2)
+		return xpRetry;
+
+	return xpSuccess;
+}
+
+/*
+ * Establish first contact with the remote partititon. This involves pulling
+ * the XPC per partition variables from the remote partition and waiting for
+ * the remote partition to pull ours.
+ */
+static enum xp_retval
+xpc_make_first_contact_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	enum xp_retval ret;
+
+	/*
+	 * Register the remote partition's amos with SAL so it can handle
+	 * and cleanup errors within that address range should the remote
+	 * partition go down. We don't unregister this range because it is
+	 * difficult to tell when outstanding writes to the remote partition
+	 * are finished and thus when it is safe to unregister. This should
+	 * not result in wasted space in the SAL xp_addr_region table because
+	 * we should get the same page for remote_amos_page_pa after module
+	 * reloads and system reboots.
+	 */
+	if (sn_register_xp_addr_region(part_sn2->remote_amos_page_pa,
+				       PAGE_SIZE, 1) < 0) {
+		dev_warn(xpc_part, "xpc_activating(%d) failed to register "
+			 "xp_addr region\n", XPC_PARTID(part));
+
+		ret = xpPhysAddrRegFailed;
+		XPC_DEACTIVATE_PARTITION(part, ret);
+		return ret;
+	}
+
+	/*
+	 * Send activate IRQ to get other side to activate if they've not
+	 * already begun to do so.
+	 */
+	xpc_send_activate_IRQ_sn2(part_sn2->remote_amos_page_pa,
+				  cnodeid_to_nasid(0),
+				  part_sn2->activate_IRQ_nasid,
+				  part_sn2->activate_IRQ_phys_cpuid);
+
+	while ((ret = xpc_pull_remote_vars_part_sn2(part)) != xpSuccess) {
+		if (ret != xpRetry) {
+			XPC_DEACTIVATE_PARTITION(part, ret);
+			return ret;
+		}
+
+		dev_dbg(xpc_part, "waiting to make first contact with "
+			"partition %d\n", XPC_PARTID(part));
+
+		/* wait a 1/4 of a second or so */
+		(void)msleep_interruptible(250);
+
+		if (part->act_state == XPC_P_AS_DEACTIVATING)
+			return part->reason;
+	}
+
+	return xpSuccess;
+}
+
+/*
+ * Get the chctl flags and pull the openclose args and/or remote GPs as needed.
+ */
+static u64
+xpc_get_chctl_all_flags_sn2(struct xpc_partition *part)
+{
+	struct xpc_partition_sn2 *part_sn2 = &part->sn.sn2;
+	unsigned long irq_flags;
+	union xpc_channel_ctl_flags chctl;
+	enum xp_retval ret;
+
+	/*
+	 * See if there are any chctl flags to be handled.
+	 */
+
+	spin_lock_irqsave(&part->chctl_lock, irq_flags);
+	chctl = part->chctl;
+	if (chctl.all_flags != 0)
+		part->chctl.all_flags = 0;
+
+	spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+	if (xpc_any_openclose_chctl_flags_set(&chctl)) {
+		ret = xpc_pull_remote_cachelines_sn2(part, part->
+						     remote_openclose_args,
+						     part_sn2->
+						     remote_openclose_args_pa,
+						     XPC_OPENCLOSE_ARGS_SIZE);
+		if (ret != xpSuccess) {
+			XPC_DEACTIVATE_PARTITION(part, ret);
+
+			dev_dbg(xpc_chan, "failed to pull openclose args from "
+				"partition %d, ret=%d\n", XPC_PARTID(part),
+				ret);
+
+			/* don't bother processing chctl flags anymore */
+			chctl.all_flags = 0;
+		}
+	}
+
+	if (xpc_any_msg_chctl_flags_set(&chctl)) {
+		ret = xpc_pull_remote_cachelines_sn2(part, part_sn2->remote_GPs,
+						     part_sn2->remote_GPs_pa,
+						     XPC_GP_SIZE);
+		if (ret != xpSuccess) {
+			XPC_DEACTIVATE_PARTITION(part, ret);
+
+			dev_dbg(xpc_chan, "failed to pull GPs from partition "
+				"%d, ret=%d\n", XPC_PARTID(part), ret);
+
+			/* don't bother processing chctl flags anymore */
+			chctl.all_flags = 0;
+		}
+	}
+
+	return chctl.all_flags;
+}
+
+/*
+ * Allocate the local message queue and the notify queue.
+ */
+static enum xp_retval
+xpc_allocate_local_msgqueue_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	unsigned long irq_flags;
+	int nentries;
+	size_t nbytes;
+
+	for (nentries = ch->local_nentries; nentries > 0; nentries--) {
+
+		nbytes = nentries * ch->entry_size;
+		ch_sn2->local_msgqueue =
+		    xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL,
+						  &ch_sn2->local_msgqueue_base);
+		if (ch_sn2->local_msgqueue == NULL)
+			continue;
+
+		nbytes = nentries * sizeof(struct xpc_notify_sn2);
+		ch_sn2->notify_queue = kzalloc(nbytes, GFP_KERNEL);
+		if (ch_sn2->notify_queue == NULL) {
+			kfree(ch_sn2->local_msgqueue_base);
+			ch_sn2->local_msgqueue = NULL;
+			continue;
+		}
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (nentries < ch->local_nentries) {
+			dev_dbg(xpc_chan, "nentries=%d local_nentries=%d, "
+				"partid=%d, channel=%d\n", nentries,
+				ch->local_nentries, ch->partid, ch->number);
+
+			ch->local_nentries = nentries;
+		}
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return xpSuccess;
+	}
+
+	dev_dbg(xpc_chan, "can't get memory for local message queue and notify "
+		"queue, partid=%d, channel=%d\n", ch->partid, ch->number);
+	return xpNoMemory;
+}
+
+/*
+ * Allocate the cached remote message queue.
+ */
+static enum xp_retval
+xpc_allocate_remote_msgqueue_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	unsigned long irq_flags;
+	int nentries;
+	size_t nbytes;
+
+	DBUG_ON(ch->remote_nentries <= 0);
+
+	for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
+
+		nbytes = nentries * ch->entry_size;
+		ch_sn2->remote_msgqueue =
+		    xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch_sn2->
+						  remote_msgqueue_base);
+		if (ch_sn2->remote_msgqueue == NULL)
+			continue;
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (nentries < ch->remote_nentries) {
+			dev_dbg(xpc_chan, "nentries=%d remote_nentries=%d, "
+				"partid=%d, channel=%d\n", nentries,
+				ch->remote_nentries, ch->partid, ch->number);
+
+			ch->remote_nentries = nentries;
+		}
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return xpSuccess;
+	}
+
+	dev_dbg(xpc_chan, "can't get memory for cached remote message queue, "
+		"partid=%d, channel=%d\n", ch->partid, ch->number);
+	return xpNoMemory;
+}
+
+/*
+ * Allocate message queues and other stuff associated with a channel.
+ *
+ * Note: Assumes all of the channel sizes are filled in.
+ */
+static enum xp_retval
+xpc_setup_msg_structures_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	enum xp_retval ret;
+
+	DBUG_ON(ch->flags & XPC_C_SETUP);
+
+	ret = xpc_allocate_local_msgqueue_sn2(ch);
+	if (ret == xpSuccess) {
+
+		ret = xpc_allocate_remote_msgqueue_sn2(ch);
+		if (ret != xpSuccess) {
+			kfree(ch_sn2->local_msgqueue_base);
+			ch_sn2->local_msgqueue = NULL;
+			kfree(ch_sn2->notify_queue);
+			ch_sn2->notify_queue = NULL;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Free up message queues and other stuff that were allocated for the specified
+ * channel.
+ */
+static void
+xpc_teardown_msg_structures_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	ch_sn2->remote_msgqueue_pa = 0;
+
+	ch_sn2->local_GP->get = 0;
+	ch_sn2->local_GP->put = 0;
+	ch_sn2->remote_GP.get = 0;
+	ch_sn2->remote_GP.put = 0;
+	ch_sn2->w_local_GP.get = 0;
+	ch_sn2->w_local_GP.put = 0;
+	ch_sn2->w_remote_GP.get = 0;
+	ch_sn2->w_remote_GP.put = 0;
+	ch_sn2->next_msg_to_pull = 0;
+
+	if (ch->flags & XPC_C_SETUP) {
+		dev_dbg(xpc_chan, "ch->flags=0x%x, partid=%d, channel=%d\n",
+			ch->flags, ch->partid, ch->number);
+
+		kfree(ch_sn2->local_msgqueue_base);
+		ch_sn2->local_msgqueue = NULL;
+		kfree(ch_sn2->remote_msgqueue_base);
+		ch_sn2->remote_msgqueue = NULL;
+		kfree(ch_sn2->notify_queue);
+		ch_sn2->notify_queue = NULL;
+	}
+}
+
+/*
+ * Notify those who wanted to be notified upon delivery of their message.
+ */
+static void
+xpc_notify_senders_sn2(struct xpc_channel *ch, enum xp_retval reason, s64 put)
+{
+	struct xpc_notify_sn2 *notify;
+	u8 notify_type;
+	s64 get = ch->sn.sn2.w_remote_GP.get - 1;
+
+	while (++get < put && atomic_read(&ch->n_to_notify) > 0) {
+
+		notify = &ch->sn.sn2.notify_queue[get % ch->local_nentries];
+
+		/*
+		 * See if the notify entry indicates it was associated with
+		 * a message who's sender wants to be notified. It is possible
+		 * that it is, but someone else is doing or has done the
+		 * notification.
+		 */
+		notify_type = notify->type;
+		if (notify_type == 0 ||
+		    cmpxchg(&notify->type, notify_type, 0) != notify_type) {
+			continue;
+		}
+
+		DBUG_ON(notify_type != XPC_N_CALL);
+
+		atomic_dec(&ch->n_to_notify);
+
+		if (notify->func != NULL) {
+			dev_dbg(xpc_chan, "notify->func() called, notify=0x%p "
+				"msg_number=%lld partid=%d channel=%d\n",
+				(void *)notify, get, ch->partid, ch->number);
+
+			notify->func(reason, ch->partid, ch->number,
+				     notify->key);
+
+			dev_dbg(xpc_chan, "notify->func() returned, notify=0x%p"
+				" msg_number=%lld partid=%d channel=%d\n",
+				(void *)notify, get, ch->partid, ch->number);
+		}
+	}
+}
+
+static void
+xpc_notify_senders_of_disconnect_sn2(struct xpc_channel *ch)
+{
+	xpc_notify_senders_sn2(ch, ch->reason, ch->sn.sn2.w_local_GP.put);
+}
+
+/*
+ * Clear some of the msg flags in the local message queue.
+ */
+static inline void
+xpc_clear_local_msgqueue_flags_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg;
+	s64 get;
+
+	get = ch_sn2->w_remote_GP.get;
+	do {
+		msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->local_msgqueue +
+					     (get % ch->local_nentries) *
+					     ch->entry_size);
+		DBUG_ON(!(msg->flags & XPC_M_SN2_READY));
+		msg->flags = 0;
+	} while (++get < ch_sn2->remote_GP.get);
+}
+
+/*
+ * Clear some of the msg flags in the remote message queue.
+ */
+static inline void
+xpc_clear_remote_msgqueue_flags_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg;
+	s64 put, remote_nentries = ch->remote_nentries;
+
+	/* flags are zeroed when the buffer is allocated */
+	if (ch_sn2->remote_GP.put < remote_nentries)
+		return;
+
+	put = max(ch_sn2->w_remote_GP.put, remote_nentries);
+	do {
+		msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue +
+					     (put % remote_nentries) *
+					     ch->entry_size);
+		DBUG_ON(!(msg->flags & XPC_M_SN2_READY));
+		DBUG_ON(!(msg->flags & XPC_M_SN2_DONE));
+		DBUG_ON(msg->number != put - remote_nentries);
+		msg->flags = 0;
+	} while (++put < ch_sn2->remote_GP.put);
+}
+
+static int
+xpc_n_of_deliverable_payloads_sn2(struct xpc_channel *ch)
+{
+	return ch->sn.sn2.w_remote_GP.put - ch->sn.sn2.w_local_GP.get;
+}
+
+static void
+xpc_process_msg_chctl_flags_sn2(struct xpc_partition *part, int ch_number)
+{
+	struct xpc_channel *ch = &part->channels[ch_number];
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	int npayloads_sent;
+
+	ch_sn2->remote_GP = part->sn.sn2.remote_GPs[ch_number];
+
+	/* See what, if anything, has changed for each connected channel */
+
+	xpc_msgqueue_ref(ch);
+
+	if (ch_sn2->w_remote_GP.get == ch_sn2->remote_GP.get &&
+	    ch_sn2->w_remote_GP.put == ch_sn2->remote_GP.put) {
+		/* nothing changed since GPs were last pulled */
+		xpc_msgqueue_deref(ch);
+		return;
+	}
+
+	if (!(ch->flags & XPC_C_CONNECTED)) {
+		xpc_msgqueue_deref(ch);
+		return;
+	}
+
+	/*
+	 * First check to see if messages recently sent by us have been
+	 * received by the other side. (The remote GET value will have
+	 * changed since we last looked at it.)
+	 */
+
+	if (ch_sn2->w_remote_GP.get != ch_sn2->remote_GP.get) {
+
+		/*
+		 * We need to notify any senders that want to be notified
+		 * that their sent messages have been received by their
+		 * intended recipients. We need to do this before updating
+		 * w_remote_GP.get so that we don't allocate the same message
+		 * queue entries prematurely (see xpc_allocate_msg()).
+		 */
+		if (atomic_read(&ch->n_to_notify) > 0) {
+			/*
+			 * Notify senders that messages sent have been
+			 * received and delivered by the other side.
+			 */
+			xpc_notify_senders_sn2(ch, xpMsgDelivered,
+					       ch_sn2->remote_GP.get);
+		}
+
+		/*
+		 * Clear msg->flags in previously sent messages, so that
+		 * they're ready for xpc_allocate_msg().
+		 */
+		xpc_clear_local_msgqueue_flags_sn2(ch);
+
+		ch_sn2->w_remote_GP.get = ch_sn2->remote_GP.get;
+
+		dev_dbg(xpc_chan, "w_remote_GP.get changed to %lld, partid=%d, "
+			"channel=%d\n", ch_sn2->w_remote_GP.get, ch->partid,
+			ch->number);
+
+		/*
+		 * If anyone was waiting for message queue entries to become
+		 * available, wake them up.
+		 */
+		if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+			wake_up(&ch->msg_allocate_wq);
+	}
+
+	/*
+	 * Now check for newly sent messages by the other side. (The remote
+	 * PUT value will have changed since we last looked at it.)
+	 */
+
+	if (ch_sn2->w_remote_GP.put != ch_sn2->remote_GP.put) {
+		/*
+		 * Clear msg->flags in previously received messages, so that
+		 * they're ready for xpc_get_deliverable_payload_sn2().
+		 */
+		xpc_clear_remote_msgqueue_flags_sn2(ch);
+
+		smp_wmb(); /* ensure flags have been cleared before bte_copy */
+		ch_sn2->w_remote_GP.put = ch_sn2->remote_GP.put;
+
+		dev_dbg(xpc_chan, "w_remote_GP.put changed to %lld, partid=%d, "
+			"channel=%d\n", ch_sn2->w_remote_GP.put, ch->partid,
+			ch->number);
+
+		npayloads_sent = xpc_n_of_deliverable_payloads_sn2(ch);
+		if (npayloads_sent > 0) {
+			dev_dbg(xpc_chan, "msgs waiting to be copied and "
+				"delivered=%d, partid=%d, channel=%d\n",
+				npayloads_sent, ch->partid, ch->number);
+
+			if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)
+				xpc_activate_kthreads(ch, npayloads_sent);
+		}
+	}
+
+	xpc_msgqueue_deref(ch);
+}
+
+static struct xpc_msg_sn2 *
+xpc_pull_remote_msg_sn2(struct xpc_channel *ch, s64 get)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	unsigned long remote_msg_pa;
+	struct xpc_msg_sn2 *msg;
+	u32 msg_index;
+	u32 nmsgs;
+	u64 msg_offset;
+	enum xp_retval ret;
+
+	if (mutex_lock_interruptible(&ch_sn2->msg_to_pull_mutex) != 0) {
+		/* we were interrupted by a signal */
+		return NULL;
+	}
+
+	while (get >= ch_sn2->next_msg_to_pull) {
+
+		/* pull as many messages as are ready and able to be pulled */
+
+		msg_index = ch_sn2->next_msg_to_pull % ch->remote_nentries;
+
+		DBUG_ON(ch_sn2->next_msg_to_pull >= ch_sn2->w_remote_GP.put);
+		nmsgs = ch_sn2->w_remote_GP.put - ch_sn2->next_msg_to_pull;
+		if (msg_index + nmsgs > ch->remote_nentries) {
+			/* ignore the ones that wrap the msg queue for now */
+			nmsgs = ch->remote_nentries - msg_index;
+		}
+
+		msg_offset = msg_index * ch->entry_size;
+		msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue +
+		    msg_offset);
+		remote_msg_pa = ch_sn2->remote_msgqueue_pa + msg_offset;
+
+		ret = xpc_pull_remote_cachelines_sn2(part, msg, remote_msg_pa,
+						     nmsgs * ch->entry_size);
+		if (ret != xpSuccess) {
+
+			dev_dbg(xpc_chan, "failed to pull %d msgs starting with"
+				" msg %lld from partition %d, channel=%d, "
+				"ret=%d\n", nmsgs, ch_sn2->next_msg_to_pull,
+				ch->partid, ch->number, ret);
+
+			XPC_DEACTIVATE_PARTITION(part, ret);
+
+			mutex_unlock(&ch_sn2->msg_to_pull_mutex);
+			return NULL;
+		}
+
+		ch_sn2->next_msg_to_pull += nmsgs;
+	}
+
+	mutex_unlock(&ch_sn2->msg_to_pull_mutex);
+
+	/* return the message we were looking for */
+	msg_offset = (get % ch->remote_nentries) * ch->entry_size;
+	msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->remote_msgqueue + msg_offset);
+
+	return msg;
+}
+
+/*
+ * Get the next deliverable message's payload.
+ */
+static void *
+xpc_get_deliverable_payload_sn2(struct xpc_channel *ch)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg;
+	void *payload = NULL;
+	s64 get;
+
+	do {
+		if (ch->flags & XPC_C_DISCONNECTING)
+			break;
+
+		get = ch_sn2->w_local_GP.get;
+		smp_rmb();	/* guarantee that .get loads before .put */
+		if (get == ch_sn2->w_remote_GP.put)
+			break;
+
+		/* There are messages waiting to be pulled and delivered.
+		 * We need to try to secure one for ourselves. We'll do this
+		 * by trying to increment w_local_GP.get and hope that no one
+		 * else beats us to it. If they do, we'll we'll simply have
+		 * to try again for the next one.
+		 */
+
+		if (cmpxchg(&ch_sn2->w_local_GP.get, get, get + 1) == get) {
+			/* we got the entry referenced by get */
+
+			dev_dbg(xpc_chan, "w_local_GP.get changed to %lld, "
+				"partid=%d, channel=%d\n", get + 1,
+				ch->partid, ch->number);
+
+			/* pull the message from the remote partition */
+
+			msg = xpc_pull_remote_msg_sn2(ch, get);
+
+			if (msg != NULL) {
+				DBUG_ON(msg->number != get);
+				DBUG_ON(msg->flags & XPC_M_SN2_DONE);
+				DBUG_ON(!(msg->flags & XPC_M_SN2_READY));
+
+				payload = &msg->payload;
+			}
+			break;
+		}
+
+	} while (1);
+
+	return payload;
+}
+
+/*
+ * Now we actually send the messages that are ready to be sent by advancing
+ * the local message queue's Put value and then send a chctl msgrequest to the
+ * recipient partition.
+ */
+static void
+xpc_send_msgs_sn2(struct xpc_channel *ch, s64 initial_put)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg;
+	s64 put = initial_put + 1;
+	int send_msgrequest = 0;
+
+	while (1) {
+
+		while (1) {
+			if (put == ch_sn2->w_local_GP.put)
+				break;
+
+			msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->
+						     local_msgqueue + (put %
+						     ch->local_nentries) *
+						     ch->entry_size);
+
+			if (!(msg->flags & XPC_M_SN2_READY))
+				break;
+
+			put++;
+		}
+
+		if (put == initial_put) {
+			/* nothing's changed */
+			break;
+		}
+
+		if (cmpxchg_rel(&ch_sn2->local_GP->put, initial_put, put) !=
+		    initial_put) {
+			/* someone else beat us to it */
+			DBUG_ON(ch_sn2->local_GP->put < initial_put);
+			break;
+		}
+
+		/* we just set the new value of local_GP->put */
+
+		dev_dbg(xpc_chan, "local_GP->put changed to %lld, partid=%d, "
+			"channel=%d\n", put, ch->partid, ch->number);
+
+		send_msgrequest = 1;
+
+		/*
+		 * We need to ensure that the message referenced by
+		 * local_GP->put is not XPC_M_SN2_READY or that local_GP->put
+		 * equals w_local_GP.put, so we'll go have a look.
+		 */
+		initial_put = put;
+	}
+
+	if (send_msgrequest)
+		xpc_send_chctl_msgrequest_sn2(ch);
+}
+
+/*
+ * Allocate an entry for a message from the message queue associated with the
+ * specified channel.
+ */
+static enum xp_retval
+xpc_allocate_msg_sn2(struct xpc_channel *ch, u32 flags,
+		     struct xpc_msg_sn2 **address_of_msg)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg;
+	enum xp_retval ret;
+	s64 put;
+
+	/*
+	 * Get the next available message entry from the local message queue.
+	 * If none are available, we'll make sure that we grab the latest
+	 * GP values.
+	 */
+	ret = xpTimeout;
+
+	while (1) {
+
+		put = ch_sn2->w_local_GP.put;
+		smp_rmb();	/* guarantee that .put loads before .get */
+		if (put - ch_sn2->w_remote_GP.get < ch->local_nentries) {
+
+			/* There are available message entries. We need to try
+			 * to secure one for ourselves. We'll do this by trying
+			 * to increment w_local_GP.put as long as someone else
+			 * doesn't beat us to it. If they do, we'll have to
+			 * try again.
+			 */
+			if (cmpxchg(&ch_sn2->w_local_GP.put, put, put + 1) ==
+			    put) {
+				/* we got the entry referenced by put */
+				break;
+			}
+			continue;	/* try again */
+		}
+
+		/*
+		 * There aren't any available msg entries at this time.
+		 *
+		 * In waiting for a message entry to become available,
+		 * we set a timeout in case the other side is not sending
+		 * completion interrupts. This lets us fake a notify IRQ
+		 * that will cause the notify IRQ handler to fetch the latest
+		 * GP values as if an interrupt was sent by the other side.
+		 */
+		if (ret == xpTimeout)
+			xpc_send_chctl_local_msgrequest_sn2(ch);
+
+		if (flags & XPC_NOWAIT)
+			return xpNoWait;
+
+		ret = xpc_allocate_msg_wait(ch);
+		if (ret != xpInterrupted && ret != xpTimeout)
+			return ret;
+	}
+
+	/* get the message's address and initialize it */
+	msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->local_msgqueue +
+				     (put % ch->local_nentries) *
+				     ch->entry_size);
+
+	DBUG_ON(msg->flags != 0);
+	msg->number = put;
+
+	dev_dbg(xpc_chan, "w_local_GP.put changed to %lld; msg=0x%p, "
+		"msg_number=%lld, partid=%d, channel=%d\n", put + 1,
+		(void *)msg, msg->number, ch->partid, ch->number);
+
+	*address_of_msg = msg;
+	return xpSuccess;
+}
+
+/*
+ * Common code that does the actual sending of the message by advancing the
+ * local message queue's Put value and sends a chctl msgrequest to the
+ * partition the message is being sent to.
+ */
+static enum xp_retval
+xpc_send_payload_sn2(struct xpc_channel *ch, u32 flags, void *payload,
+		     u16 payload_size, u8 notify_type, xpc_notify_func func,
+		     void *key)
+{
+	enum xp_retval ret = xpSuccess;
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg = msg;
+	struct xpc_notify_sn2 *notify = notify;
+	s64 msg_number;
+	s64 put;
+
+	DBUG_ON(notify_type == XPC_N_CALL && func == NULL);
+
+	if (XPC_MSG_SIZE(payload_size) > ch->entry_size)
+		return xpPayloadTooBig;
+
+	xpc_msgqueue_ref(ch);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		ret = ch->reason;
+		goto out_1;
+	}
+	if (!(ch->flags & XPC_C_CONNECTED)) {
+		ret = xpNotConnected;
+		goto out_1;
+	}
+
+	ret = xpc_allocate_msg_sn2(ch, flags, &msg);
+	if (ret != xpSuccess)
+		goto out_1;
+
+	msg_number = msg->number;
+
+	if (notify_type != 0) {
+		/*
+		 * Tell the remote side to send an ACK interrupt when the
+		 * message has been delivered.
+		 */
+		msg->flags |= XPC_M_SN2_INTERRUPT;
+
+		atomic_inc(&ch->n_to_notify);
+
+		notify = &ch_sn2->notify_queue[msg_number % ch->local_nentries];
+		notify->func = func;
+		notify->key = key;
+		notify->type = notify_type;
+
+		/* ??? Is a mb() needed here? */
+
+		if (ch->flags & XPC_C_DISCONNECTING) {
+			/*
+			 * An error occurred between our last error check and
+			 * this one. We will try to clear the type field from
+			 * the notify entry. If we succeed then
+			 * xpc_disconnect_channel() didn't already process
+			 * the notify entry.
+			 */
+			if (cmpxchg(&notify->type, notify_type, 0) ==
+			    notify_type) {
+				atomic_dec(&ch->n_to_notify);
+				ret = ch->reason;
+			}
+			goto out_1;
+		}
+	}
+
+	memcpy(&msg->payload, payload, payload_size);
+
+	msg->flags |= XPC_M_SN2_READY;
+
+	/*
+	 * The preceding store of msg->flags must occur before the following
+	 * load of local_GP->put.
+	 */
+	smp_mb();
+
+	/* see if the message is next in line to be sent, if so send it */
+
+	put = ch_sn2->local_GP->put;
+	if (put == msg_number)
+		xpc_send_msgs_sn2(ch, put);
+
+out_1:
+	xpc_msgqueue_deref(ch);
+	return ret;
+}
+
+/*
+ * Now we actually acknowledge the messages that have been delivered and ack'd
+ * by advancing the cached remote message queue's Get value and if requested
+ * send a chctl msgrequest to the message sender's partition.
+ *
+ * If a message has XPC_M_SN2_INTERRUPT set, send an interrupt to the partition
+ * that sent the message.
+ */
+static void
+xpc_acknowledge_msgs_sn2(struct xpc_channel *ch, s64 initial_get, u8 msg_flags)
+{
+	struct xpc_channel_sn2 *ch_sn2 = &ch->sn.sn2;
+	struct xpc_msg_sn2 *msg;
+	s64 get = initial_get + 1;
+	int send_msgrequest = 0;
+
+	while (1) {
+
+		while (1) {
+			if (get == ch_sn2->w_local_GP.get)
+				break;
+
+			msg = (struct xpc_msg_sn2 *)((u64)ch_sn2->
+						     remote_msgqueue + (get %
+						     ch->remote_nentries) *
+						     ch->entry_size);
+
+			if (!(msg->flags & XPC_M_SN2_DONE))
+				break;
+
+			msg_flags |= msg->flags;
+			get++;
+		}
+
+		if (get == initial_get) {
+			/* nothing's changed */
+			break;
+		}
+
+		if (cmpxchg_rel(&ch_sn2->local_GP->get, initial_get, get) !=
+		    initial_get) {
+			/* someone else beat us to it */
+			DBUG_ON(ch_sn2->local_GP->get <= initial_get);
+			break;
+		}
+
+		/* we just set the new value of local_GP->get */
+
+		dev_dbg(xpc_chan, "local_GP->get changed to %lld, partid=%d, "
+			"channel=%d\n", get, ch->partid, ch->number);
+
+		send_msgrequest = (msg_flags & XPC_M_SN2_INTERRUPT);
+
+		/*
+		 * We need to ensure that the message referenced by
+		 * local_GP->get is not XPC_M_SN2_DONE or that local_GP->get
+		 * equals w_local_GP.get, so we'll go have a look.
+		 */
+		initial_get = get;
+	}
+
+	if (send_msgrequest)
+		xpc_send_chctl_msgrequest_sn2(ch);
+}
+
+static void
+xpc_received_payload_sn2(struct xpc_channel *ch, void *payload)
+{
+	struct xpc_msg_sn2 *msg;
+	s64 msg_number;
+	s64 get;
+
+	msg = container_of(payload, struct xpc_msg_sn2, payload);
+	msg_number = msg->number;
+
+	dev_dbg(xpc_chan, "msg=0x%p, msg_number=%lld, partid=%d, channel=%d\n",
+		(void *)msg, msg_number, ch->partid, ch->number);
+
+	DBUG_ON((((u64)msg - (u64)ch->sn.sn2.remote_msgqueue) / ch->entry_size) !=
+		msg_number % ch->remote_nentries);
+	DBUG_ON(!(msg->flags & XPC_M_SN2_READY));
+	DBUG_ON(msg->flags & XPC_M_SN2_DONE);
+
+	msg->flags |= XPC_M_SN2_DONE;
+
+	/*
+	 * The preceding store of msg->flags must occur before the following
+	 * load of local_GP->get.
+	 */
+	smp_mb();
+
+	/*
+	 * See if this message is next in line to be acknowledged as having
+	 * been delivered.
+	 */
+	get = ch->sn.sn2.local_GP->get;
+	if (get == msg_number)
+		xpc_acknowledge_msgs_sn2(ch, get, msg->flags);
+}
+
+static struct xpc_arch_operations xpc_arch_ops_sn2 = {
+	.setup_partitions = xpc_setup_partitions_sn2,
+	.teardown_partitions = xpc_teardown_partitions_sn2,
+	.process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_sn2,
+	.get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_sn2,
+	.setup_rsvd_page = xpc_setup_rsvd_page_sn2,
+
+	.allow_hb = xpc_allow_hb_sn2,
+	.disallow_hb = xpc_disallow_hb_sn2,
+	.disallow_all_hbs = xpc_disallow_all_hbs_sn2,
+	.increment_heartbeat = xpc_increment_heartbeat_sn2,
+	.offline_heartbeat = xpc_offline_heartbeat_sn2,
+	.online_heartbeat = xpc_online_heartbeat_sn2,
+	.heartbeat_init = xpc_heartbeat_init_sn2,
+	.heartbeat_exit = xpc_heartbeat_exit_sn2,
+	.get_remote_heartbeat = xpc_get_remote_heartbeat_sn2,
+
+	.request_partition_activation =
+		xpc_request_partition_activation_sn2,
+	.request_partition_reactivation =
+		xpc_request_partition_reactivation_sn2,
+	.request_partition_deactivation =
+		xpc_request_partition_deactivation_sn2,
+	.cancel_partition_deactivation_request =
+		xpc_cancel_partition_deactivation_request_sn2,
+
+	.setup_ch_structures = xpc_setup_ch_structures_sn2,
+	.teardown_ch_structures = xpc_teardown_ch_structures_sn2,
+
+	.make_first_contact = xpc_make_first_contact_sn2,
+
+	.get_chctl_all_flags = xpc_get_chctl_all_flags_sn2,
+	.send_chctl_closerequest = xpc_send_chctl_closerequest_sn2,
+	.send_chctl_closereply = xpc_send_chctl_closereply_sn2,
+	.send_chctl_openrequest = xpc_send_chctl_openrequest_sn2,
+	.send_chctl_openreply = xpc_send_chctl_openreply_sn2,
+	.send_chctl_opencomplete = xpc_send_chctl_opencomplete_sn2,
+	.process_msg_chctl_flags = xpc_process_msg_chctl_flags_sn2,
+
+	.save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_sn2,
+
+	.setup_msg_structures = xpc_setup_msg_structures_sn2,
+	.teardown_msg_structures = xpc_teardown_msg_structures_sn2,
+
+	.indicate_partition_engaged = xpc_indicate_partition_engaged_sn2,
+	.indicate_partition_disengaged = xpc_indicate_partition_disengaged_sn2,
+	.partition_engaged = xpc_partition_engaged_sn2,
+	.any_partition_engaged = xpc_any_partition_engaged_sn2,
+	.assume_partition_disengaged = xpc_assume_partition_disengaged_sn2,
+
+	.n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_sn2,
+	.send_payload = xpc_send_payload_sn2,
+	.get_deliverable_payload = xpc_get_deliverable_payload_sn2,
+	.received_payload = xpc_received_payload_sn2,
+	.notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_sn2,
+};
+
+int
+xpc_init_sn2(void)
+{
+	int ret;
+	size_t buf_size;
+
+	xpc_arch_ops = xpc_arch_ops_sn2;
+
+	if (offsetof(struct xpc_msg_sn2, payload) > XPC_MSG_HDR_MAX_SIZE) {
+		dev_err(xpc_part, "header portion of struct xpc_msg_sn2 is "
+			"larger than %d\n", XPC_MSG_HDR_MAX_SIZE);
+		return -E2BIG;
+	}
+
+	buf_size = max(XPC_RP_VARS_SIZE,
+		       XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES_SN2);
+	xpc_remote_copy_buffer_sn2 = xpc_kmalloc_cacheline_aligned(buf_size,
+								   GFP_KERNEL,
+					      &xpc_remote_copy_buffer_base_sn2);
+	if (xpc_remote_copy_buffer_sn2 == NULL) {
+		dev_err(xpc_part, "can't get memory for remote copy buffer\n");
+		return -ENOMEM;
+	}
+
+	/* open up protections for IPI and [potentially] amo operations */
+	xpc_allow_IPI_ops_sn2();
+	xpc_allow_amo_ops_shub_wars_1_1_sn2();
+
+	/*
+	 * This is safe to do before the xpc_hb_checker thread has started
+	 * because the handler releases a wait queue.  If an interrupt is
+	 * received before the thread is waiting, it will not go to sleep,
+	 * but rather immediately process the interrupt.
+	 */
+	ret = request_irq(SGI_XPC_ACTIVATE, xpc_handle_activate_IRQ_sn2, 0,
+			  "xpc hb", NULL);
+	if (ret != 0) {
+		dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
+			"errno=%d\n", -ret);
+		xpc_disallow_IPI_ops_sn2();
+		kfree(xpc_remote_copy_buffer_base_sn2);
+	}
+	return ret;
+}
+
+void
+xpc_exit_sn2(void)
+{
+	free_irq(SGI_XPC_ACTIVATE, NULL);
+	xpc_disallow_IPI_ops_sn2();
+	kfree(xpc_remote_copy_buffer_base_sn2);
+}
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
new file mode 100644
index 000000000..340b44d9e
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -0,0 +1,1813 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) uv-based functions.
+ *
+ *     Architecture specific implementation of common functions.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <asm/uv/uv_hub.h>
+#if defined CONFIG_X86_64
+#include <asm/uv/bios.h>
+#include <asm/uv/uv_irq.h>
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+#include <asm/sn/intr.h>
+#include <asm/sn/sn_sal.h>
+#endif
+#include "../sgi-gru/gru.h"
+#include "../sgi-gru/grukservices.h"
+#include "xpc.h"
+
+#if defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+struct uv_IO_APIC_route_entry {
+	__u64	vector		:  8,
+		delivery_mode	:  3,
+		dest_mode	:  1,
+		delivery_status	:  1,
+		polarity	:  1,
+		__reserved_1	:  1,
+		trigger		:  1,
+		mask		:  1,
+		__reserved_2	: 15,
+		dest		: 32;
+};
+#endif
+
+static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
+
+#define XPC_ACTIVATE_MSG_SIZE_UV	(1 * GRU_CACHE_LINE_BYTES)
+#define XPC_ACTIVATE_MQ_SIZE_UV		(4 * XP_MAX_NPARTITIONS_UV * \
+					 XPC_ACTIVATE_MSG_SIZE_UV)
+#define XPC_ACTIVATE_IRQ_NAME		"xpc_activate"
+
+#define XPC_NOTIFY_MSG_SIZE_UV		(2 * GRU_CACHE_LINE_BYTES)
+#define XPC_NOTIFY_MQ_SIZE_UV		(4 * XP_MAX_NPARTITIONS_UV * \
+					 XPC_NOTIFY_MSG_SIZE_UV)
+#define XPC_NOTIFY_IRQ_NAME		"xpc_notify"
+
+static int xpc_mq_node = -1;
+
+static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
+static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
+
+static int
+xpc_setup_partitions_uv(void)
+{
+	short partid;
+	struct xpc_partition_uv *part_uv;
+
+	for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+		part_uv = &xpc_partitions[partid].sn.uv;
+
+		mutex_init(&part_uv->cached_activate_gru_mq_desc_mutex);
+		spin_lock_init(&part_uv->flags_lock);
+		part_uv->remote_act_state = XPC_P_AS_INACTIVE;
+	}
+	return 0;
+}
+
+static void
+xpc_teardown_partitions_uv(void)
+{
+	short partid;
+	struct xpc_partition_uv *part_uv;
+	unsigned long irq_flags;
+
+	for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+		part_uv = &xpc_partitions[partid].sn.uv;
+
+		if (part_uv->cached_activate_gru_mq_desc != NULL) {
+			mutex_lock(&part_uv->cached_activate_gru_mq_desc_mutex);
+			spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+			part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV;
+			spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+			kfree(part_uv->cached_activate_gru_mq_desc);
+			part_uv->cached_activate_gru_mq_desc = NULL;
+			mutex_unlock(&part_uv->
+				     cached_activate_gru_mq_desc_mutex);
+		}
+	}
+}
+
+static int
+xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
+{
+	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+
+#if defined CONFIG_X86_64
+	mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset,
+			UV_AFFINITY_CPU);
+	if (mq->irq < 0)
+		return mq->irq;
+
+	mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset);
+
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	if (strcmp(irq_name, XPC_ACTIVATE_IRQ_NAME) == 0)
+		mq->irq = SGI_XPC_ACTIVATE;
+	else if (strcmp(irq_name, XPC_NOTIFY_IRQ_NAME) == 0)
+		mq->irq = SGI_XPC_NOTIFY;
+	else
+		return -EINVAL;
+
+	mq->mmr_value = (unsigned long)cpu_physical_id(cpu) << 32 | mq->irq;
+	uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mq->mmr_value);
+#else
+	#error not a supported configuration
+#endif
+
+	return 0;
+}
+
+static void
+xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq)
+{
+#if defined CONFIG_X86_64
+	uv_teardown_irq(mq->irq);
+
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	int mmr_pnode;
+	unsigned long mmr_value;
+
+	mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+	mmr_value = 1UL << 16;
+
+	uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mmr_value);
+#else
+	#error not a supported configuration
+#endif
+}
+
+static int
+xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq)
+{
+	int ret;
+
+#if defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+
+	ret = sn_mq_watchlist_alloc(mmr_pnode, (void *)uv_gpa(mq->address),
+				    mq->order, &mq->mmr_offset);
+	if (ret < 0) {
+		dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n",
+			ret);
+		return -EBUSY;
+	}
+#elif defined CONFIG_X86_64
+	ret = uv_bios_mq_watchlist_alloc(uv_gpa(mq->address),
+					 mq->order, &mq->mmr_offset);
+	if (ret < 0) {
+		dev_err(xpc_part, "uv_bios_mq_watchlist_alloc() failed, "
+			"ret=%d\n", ret);
+		return ret;
+	}
+#else
+	#error not a supported configuration
+#endif
+
+	mq->watchlist_num = ret;
+	return 0;
+}
+
+static void
+xpc_gru_mq_watchlist_free_uv(struct xpc_gru_mq_uv *mq)
+{
+	int ret;
+	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+
+#if defined CONFIG_X86_64
+	ret = uv_bios_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
+	BUG_ON(ret != BIOS_STATUS_SUCCESS);
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	ret = sn_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
+	BUG_ON(ret != SALRET_OK);
+#else
+	#error not a supported configuration
+#endif
+}
+
+static struct xpc_gru_mq_uv *
+xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
+		     irq_handler_t irq_handler)
+{
+	enum xp_retval xp_ret;
+	int ret;
+	int nid;
+	int nasid;
+	int pg_order;
+	struct page *page;
+	struct xpc_gru_mq_uv *mq;
+	struct uv_IO_APIC_route_entry *mmr_value;
+
+	mq = kmalloc(sizeof(struct xpc_gru_mq_uv), GFP_KERNEL);
+	if (mq == NULL) {
+		dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to kmalloc() "
+			"a xpc_gru_mq_uv structure\n");
+		ret = -ENOMEM;
+		goto out_0;
+	}
+
+	mq->gru_mq_desc = kzalloc(sizeof(struct gru_message_queue_desc),
+				  GFP_KERNEL);
+	if (mq->gru_mq_desc == NULL) {
+		dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to kmalloc() "
+			"a gru_message_queue_desc structure\n");
+		ret = -ENOMEM;
+		goto out_1;
+	}
+
+	pg_order = get_order(mq_size);
+	mq->order = pg_order + PAGE_SHIFT;
+	mq_size = 1UL << mq->order;
+
+	mq->mmr_blade = uv_cpu_to_blade_id(cpu);
+
+	nid = cpu_to_node(cpu);
+	page = __alloc_pages_node(nid,
+				      GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+				      pg_order);
+	if (page == NULL) {
+		dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d "
+			"bytes of memory on nid=%d for GRU mq\n", mq_size, nid);
+		ret = -ENOMEM;
+		goto out_2;
+	}
+	mq->address = page_address(page);
+
+	/* enable generation of irq when GRU mq operation occurs to this mq */
+	ret = xpc_gru_mq_watchlist_alloc_uv(mq);
+	if (ret != 0)
+		goto out_3;
+
+	ret = xpc_get_gru_mq_irq_uv(mq, cpu, irq_name);
+	if (ret != 0)
+		goto out_4;
+
+	ret = request_irq(mq->irq, irq_handler, 0, irq_name, NULL);
+	if (ret != 0) {
+		dev_err(xpc_part, "request_irq(irq=%d) returned error=%d\n",
+			mq->irq, -ret);
+		goto out_5;
+	}
+
+	nasid = UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpu));
+
+	mmr_value = (struct uv_IO_APIC_route_entry *)&mq->mmr_value;
+	ret = gru_create_message_queue(mq->gru_mq_desc, mq->address, mq_size,
+				     nasid, mmr_value->vector, mmr_value->dest);
+	if (ret != 0) {
+		dev_err(xpc_part, "gru_create_message_queue() returned "
+			"error=%d\n", ret);
+		ret = -EINVAL;
+		goto out_6;
+	}
+
+	/* allow other partitions to access this GRU mq */
+	xp_ret = xp_expand_memprotect(xp_pa(mq->address), mq_size);
+	if (xp_ret != xpSuccess) {
+		ret = -EACCES;
+		goto out_6;
+	}
+
+	return mq;
+
+	/* something went wrong */
+out_6:
+	free_irq(mq->irq, NULL);
+out_5:
+	xpc_release_gru_mq_irq_uv(mq);
+out_4:
+	xpc_gru_mq_watchlist_free_uv(mq);
+out_3:
+	free_pages((unsigned long)mq->address, pg_order);
+out_2:
+	kfree(mq->gru_mq_desc);
+out_1:
+	kfree(mq);
+out_0:
+	return ERR_PTR(ret);
+}
+
+static void
+xpc_destroy_gru_mq_uv(struct xpc_gru_mq_uv *mq)
+{
+	unsigned int mq_size;
+	int pg_order;
+	int ret;
+
+	/* disallow other partitions to access GRU mq */
+	mq_size = 1UL << mq->order;
+	ret = xp_restrict_memprotect(xp_pa(mq->address), mq_size);
+	BUG_ON(ret != xpSuccess);
+
+	/* unregister irq handler and release mq irq/vector mapping */
+	free_irq(mq->irq, NULL);
+	xpc_release_gru_mq_irq_uv(mq);
+
+	/* disable generation of irq when GRU mq op occurs to this mq */
+	xpc_gru_mq_watchlist_free_uv(mq);
+
+	pg_order = mq->order - PAGE_SHIFT;
+	free_pages((unsigned long)mq->address, pg_order);
+
+	kfree(mq);
+}
+
+static enum xp_retval
+xpc_send_gru_msg(struct gru_message_queue_desc *gru_mq_desc, void *msg,
+		 size_t msg_size)
+{
+	enum xp_retval xp_ret;
+	int ret;
+
+	while (1) {
+		ret = gru_send_message_gpa(gru_mq_desc, msg, msg_size);
+		if (ret == MQE_OK) {
+			xp_ret = xpSuccess;
+			break;
+		}
+
+		if (ret == MQE_QUEUE_FULL) {
+			dev_dbg(xpc_chan, "gru_send_message_gpa() returned "
+				"error=MQE_QUEUE_FULL\n");
+			/* !!! handle QLimit reached; delay & try again */
+			/* ??? Do we add a limit to the number of retries? */
+			(void)msleep_interruptible(10);
+		} else if (ret == MQE_CONGESTION) {
+			dev_dbg(xpc_chan, "gru_send_message_gpa() returned "
+				"error=MQE_CONGESTION\n");
+			/* !!! handle LB Overflow; simply try again */
+			/* ??? Do we add a limit to the number of retries? */
+		} else {
+			/* !!! Currently this is MQE_UNEXPECTED_CB_ERR */
+			dev_err(xpc_chan, "gru_send_message_gpa() returned "
+				"error=%d\n", ret);
+			xp_ret = xpGruSendMqError;
+			break;
+		}
+	}
+	return xp_ret;
+}
+
+static void
+xpc_process_activate_IRQ_rcvd_uv(void)
+{
+	unsigned long irq_flags;
+	short partid;
+	struct xpc_partition *part;
+	u8 act_state_req;
+
+	DBUG_ON(xpc_activate_IRQ_rcvd == 0);
+
+	spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+	for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+		part = &xpc_partitions[partid];
+
+		if (part->sn.uv.act_state_req == 0)
+			continue;
+
+		xpc_activate_IRQ_rcvd--;
+		BUG_ON(xpc_activate_IRQ_rcvd < 0);
+
+		act_state_req = part->sn.uv.act_state_req;
+		part->sn.uv.act_state_req = 0;
+		spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+		if (act_state_req == XPC_P_ASR_ACTIVATE_UV) {
+			if (part->act_state == XPC_P_AS_INACTIVE)
+				xpc_activate_partition(part);
+			else if (part->act_state == XPC_P_AS_DEACTIVATING)
+				XPC_DEACTIVATE_PARTITION(part, xpReactivating);
+
+		} else if (act_state_req == XPC_P_ASR_REACTIVATE_UV) {
+			if (part->act_state == XPC_P_AS_INACTIVE)
+				xpc_activate_partition(part);
+			else
+				XPC_DEACTIVATE_PARTITION(part, xpReactivating);
+
+		} else if (act_state_req == XPC_P_ASR_DEACTIVATE_UV) {
+			XPC_DEACTIVATE_PARTITION(part, part->sn.uv.reason);
+
+		} else {
+			BUG();
+		}
+
+		spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+		if (xpc_activate_IRQ_rcvd == 0)
+			break;
+	}
+	spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+}
+
+static void
+xpc_handle_activate_mq_msg_uv(struct xpc_partition *part,
+			      struct xpc_activate_mq_msghdr_uv *msg_hdr,
+			      int part_setup,
+			      int *wakeup_hb_checker)
+{
+	unsigned long irq_flags;
+	struct xpc_partition_uv *part_uv = &part->sn.uv;
+	struct xpc_openclose_args *args;
+
+	part_uv->remote_act_state = msg_hdr->act_state;
+
+	switch (msg_hdr->type) {
+	case XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV:
+		/* syncing of remote_act_state was just done above */
+		break;
+
+	case XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV: {
+		struct xpc_activate_mq_msg_activate_req_uv *msg;
+
+		/*
+		 * ??? Do we deal here with ts_jiffies being different
+		 * ??? if act_state != XPC_P_AS_INACTIVE instead of
+		 * ??? below?
+		 */
+		msg = container_of(msg_hdr, struct
+				   xpc_activate_mq_msg_activate_req_uv, hdr);
+
+		spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+		if (part_uv->act_state_req == 0)
+			xpc_activate_IRQ_rcvd++;
+		part_uv->act_state_req = XPC_P_ASR_ACTIVATE_UV;
+		part->remote_rp_pa = msg->rp_gpa; /* !!! _pa is _gpa */
+		part->remote_rp_ts_jiffies = msg_hdr->rp_ts_jiffies;
+		part_uv->heartbeat_gpa = msg->heartbeat_gpa;
+
+		if (msg->activate_gru_mq_desc_gpa !=
+		    part_uv->activate_gru_mq_desc_gpa) {
+			spin_lock(&part_uv->flags_lock);
+			part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV;
+			spin_unlock(&part_uv->flags_lock);
+			part_uv->activate_gru_mq_desc_gpa =
+			    msg->activate_gru_mq_desc_gpa;
+		}
+		spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+		(*wakeup_hb_checker)++;
+		break;
+	}
+	case XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV: {
+		struct xpc_activate_mq_msg_deactivate_req_uv *msg;
+
+		msg = container_of(msg_hdr, struct
+				   xpc_activate_mq_msg_deactivate_req_uv, hdr);
+
+		spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+		if (part_uv->act_state_req == 0)
+			xpc_activate_IRQ_rcvd++;
+		part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
+		part_uv->reason = msg->reason;
+		spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+		(*wakeup_hb_checker)++;
+		return;
+	}
+	case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV: {
+		struct xpc_activate_mq_msg_chctl_closerequest_uv *msg;
+
+		if (!part_setup)
+			break;
+
+		msg = container_of(msg_hdr, struct
+				   xpc_activate_mq_msg_chctl_closerequest_uv,
+				   hdr);
+		args = &part->remote_openclose_args[msg->ch_number];
+		args->reason = msg->reason;
+
+		spin_lock_irqsave(&part->chctl_lock, irq_flags);
+		part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREQUEST;
+		spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+		xpc_wakeup_channel_mgr(part);
+		break;
+	}
+	case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV: {
+		struct xpc_activate_mq_msg_chctl_closereply_uv *msg;
+
+		if (!part_setup)
+			break;
+
+		msg = container_of(msg_hdr, struct
+				   xpc_activate_mq_msg_chctl_closereply_uv,
+				   hdr);
+
+		spin_lock_irqsave(&part->chctl_lock, irq_flags);
+		part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREPLY;
+		spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+		xpc_wakeup_channel_mgr(part);
+		break;
+	}
+	case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV: {
+		struct xpc_activate_mq_msg_chctl_openrequest_uv *msg;
+
+		if (!part_setup)
+			break;
+
+		msg = container_of(msg_hdr, struct
+				   xpc_activate_mq_msg_chctl_openrequest_uv,
+				   hdr);
+		args = &part->remote_openclose_args[msg->ch_number];
+		args->entry_size = msg->entry_size;
+		args->local_nentries = msg->local_nentries;
+
+		spin_lock_irqsave(&part->chctl_lock, irq_flags);
+		part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREQUEST;
+		spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+		xpc_wakeup_channel_mgr(part);
+		break;
+	}
+	case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV: {
+		struct xpc_activate_mq_msg_chctl_openreply_uv *msg;
+
+		if (!part_setup)
+			break;
+
+		msg = container_of(msg_hdr, struct
+				   xpc_activate_mq_msg_chctl_openreply_uv, hdr);
+		args = &part->remote_openclose_args[msg->ch_number];
+		args->remote_nentries = msg->remote_nentries;
+		args->local_nentries = msg->local_nentries;
+		args->local_msgqueue_pa = msg->notify_gru_mq_desc_gpa;
+
+		spin_lock_irqsave(&part->chctl_lock, irq_flags);
+		part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREPLY;
+		spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+		xpc_wakeup_channel_mgr(part);
+		break;
+	}
+	case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV: {
+		struct xpc_activate_mq_msg_chctl_opencomplete_uv *msg;
+
+		if (!part_setup)
+			break;
+
+		msg = container_of(msg_hdr, struct
+				xpc_activate_mq_msg_chctl_opencomplete_uv, hdr);
+		spin_lock_irqsave(&part->chctl_lock, irq_flags);
+		part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENCOMPLETE;
+		spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+		xpc_wakeup_channel_mgr(part);
+	}
+	case XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV:
+		spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+		part_uv->flags |= XPC_P_ENGAGED_UV;
+		spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+		break;
+
+	case XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV:
+		spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+		part_uv->flags &= ~XPC_P_ENGAGED_UV;
+		spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+		break;
+
+	default:
+		dev_err(xpc_part, "received unknown activate_mq msg type=%d "
+			"from partition=%d\n", msg_hdr->type, XPC_PARTID(part));
+
+		/* get hb checker to deactivate from the remote partition */
+		spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+		if (part_uv->act_state_req == 0)
+			xpc_activate_IRQ_rcvd++;
+		part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
+		part_uv->reason = xpBadMsgType;
+		spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+		(*wakeup_hb_checker)++;
+		return;
+	}
+
+	if (msg_hdr->rp_ts_jiffies != part->remote_rp_ts_jiffies &&
+	    part->remote_rp_ts_jiffies != 0) {
+		/*
+		 * ??? Does what we do here need to be sensitive to
+		 * ??? act_state or remote_act_state?
+		 */
+		spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+		if (part_uv->act_state_req == 0)
+			xpc_activate_IRQ_rcvd++;
+		part_uv->act_state_req = XPC_P_ASR_REACTIVATE_UV;
+		spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+		(*wakeup_hb_checker)++;
+	}
+}
+
+static irqreturn_t
+xpc_handle_activate_IRQ_uv(int irq, void *dev_id)
+{
+	struct xpc_activate_mq_msghdr_uv *msg_hdr;
+	short partid;
+	struct xpc_partition *part;
+	int wakeup_hb_checker = 0;
+	int part_referenced;
+
+	while (1) {
+		msg_hdr = gru_get_next_message(xpc_activate_mq_uv->gru_mq_desc);
+		if (msg_hdr == NULL)
+			break;
+
+		partid = msg_hdr->partid;
+		if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) {
+			dev_err(xpc_part, "xpc_handle_activate_IRQ_uv() "
+				"received invalid partid=0x%x in message\n",
+				partid);
+		} else {
+			part = &xpc_partitions[partid];
+
+			part_referenced = xpc_part_ref(part);
+			xpc_handle_activate_mq_msg_uv(part, msg_hdr,
+						      part_referenced,
+						      &wakeup_hb_checker);
+			if (part_referenced)
+				xpc_part_deref(part);
+		}
+
+		gru_free_message(xpc_activate_mq_uv->gru_mq_desc, msg_hdr);
+	}
+
+	if (wakeup_hb_checker)
+		wake_up_interruptible(&xpc_activate_IRQ_wq);
+
+	return IRQ_HANDLED;
+}
+
+static enum xp_retval
+xpc_cache_remote_gru_mq_desc_uv(struct gru_message_queue_desc *gru_mq_desc,
+				unsigned long gru_mq_desc_gpa)
+{
+	enum xp_retval ret;
+
+	ret = xp_remote_memcpy(uv_gpa(gru_mq_desc), gru_mq_desc_gpa,
+			       sizeof(struct gru_message_queue_desc));
+	if (ret == xpSuccess)
+		gru_mq_desc->mq = NULL;
+
+	return ret;
+}
+
+static enum xp_retval
+xpc_send_activate_IRQ_uv(struct xpc_partition *part, void *msg, size_t msg_size,
+			 int msg_type)
+{
+	struct xpc_activate_mq_msghdr_uv *msg_hdr = msg;
+	struct xpc_partition_uv *part_uv = &part->sn.uv;
+	struct gru_message_queue_desc *gru_mq_desc;
+	unsigned long irq_flags;
+	enum xp_retval ret;
+
+	DBUG_ON(msg_size > XPC_ACTIVATE_MSG_SIZE_UV);
+
+	msg_hdr->type = msg_type;
+	msg_hdr->partid = xp_partition_id;
+	msg_hdr->act_state = part->act_state;
+	msg_hdr->rp_ts_jiffies = xpc_rsvd_page->ts_jiffies;
+
+	mutex_lock(&part_uv->cached_activate_gru_mq_desc_mutex);
+again:
+	if (!(part_uv->flags & XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV)) {
+		gru_mq_desc = part_uv->cached_activate_gru_mq_desc;
+		if (gru_mq_desc == NULL) {
+			gru_mq_desc = kmalloc(sizeof(struct
+					      gru_message_queue_desc),
+					      GFP_KERNEL);
+			if (gru_mq_desc == NULL) {
+				ret = xpNoMemory;
+				goto done;
+			}
+			part_uv->cached_activate_gru_mq_desc = gru_mq_desc;
+		}
+
+		ret = xpc_cache_remote_gru_mq_desc_uv(gru_mq_desc,
+						      part_uv->
+						      activate_gru_mq_desc_gpa);
+		if (ret != xpSuccess)
+			goto done;
+
+		spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+		part_uv->flags |= XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV;
+		spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+	}
+
+	/* ??? Is holding a spin_lock (ch->lock) during this call a bad idea? */
+	ret = xpc_send_gru_msg(part_uv->cached_activate_gru_mq_desc, msg,
+			       msg_size);
+	if (ret != xpSuccess) {
+		smp_rmb();	/* ensure a fresh copy of part_uv->flags */
+		if (!(part_uv->flags & XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV))
+			goto again;
+	}
+done:
+	mutex_unlock(&part_uv->cached_activate_gru_mq_desc_mutex);
+	return ret;
+}
+
+static void
+xpc_send_activate_IRQ_part_uv(struct xpc_partition *part, void *msg,
+			      size_t msg_size, int msg_type)
+{
+	enum xp_retval ret;
+
+	ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type);
+	if (unlikely(ret != xpSuccess))
+		XPC_DEACTIVATE_PARTITION(part, ret);
+}
+
+static void
+xpc_send_activate_IRQ_ch_uv(struct xpc_channel *ch, unsigned long *irq_flags,
+			 void *msg, size_t msg_size, int msg_type)
+{
+	struct xpc_partition *part = &xpc_partitions[ch->partid];
+	enum xp_retval ret;
+
+	ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type);
+	if (unlikely(ret != xpSuccess)) {
+		if (irq_flags != NULL)
+			spin_unlock_irqrestore(&ch->lock, *irq_flags);
+
+		XPC_DEACTIVATE_PARTITION(part, ret);
+
+		if (irq_flags != NULL)
+			spin_lock_irqsave(&ch->lock, *irq_flags);
+	}
+}
+
+static void
+xpc_send_local_activate_IRQ_uv(struct xpc_partition *part, int act_state_req)
+{
+	unsigned long irq_flags;
+	struct xpc_partition_uv *part_uv = &part->sn.uv;
+
+	/*
+	 * !!! Make our side think that the remote partition sent an activate
+	 * !!! mq message our way by doing what the activate IRQ handler would
+	 * !!! do had one really been sent.
+	 */
+
+	spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+	if (part_uv->act_state_req == 0)
+		xpc_activate_IRQ_rcvd++;
+	part_uv->act_state_req = act_state_req;
+	spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+	wake_up_interruptible(&xpc_activate_IRQ_wq);
+}
+
+static enum xp_retval
+xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
+				  size_t *len)
+{
+	s64 status;
+	enum xp_retval ret;
+
+#if defined CONFIG_X86_64
+	status = uv_bios_reserved_page_pa((u64)buf, cookie, (u64 *)rp_pa,
+					  (u64 *)len);
+	if (status == BIOS_STATUS_SUCCESS)
+		ret = xpSuccess;
+	else if (status == BIOS_STATUS_MORE_PASSES)
+		ret = xpNeedMoreInfo;
+	else
+		ret = xpBiosError;
+
+#elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV
+	status = sn_partition_reserved_page_pa((u64)buf, cookie, rp_pa, len);
+	if (status == SALRET_OK)
+		ret = xpSuccess;
+	else if (status == SALRET_MORE_PASSES)
+		ret = xpNeedMoreInfo;
+	else
+		ret = xpSalError;
+
+#else
+	#error not a supported configuration
+#endif
+
+	return ret;
+}
+
+static int
+xpc_setup_rsvd_page_uv(struct xpc_rsvd_page *rp)
+{
+	xpc_heartbeat_uv =
+	    &xpc_partitions[sn_partition_id].sn.uv.cached_heartbeat;
+	rp->sn.uv.heartbeat_gpa = uv_gpa(xpc_heartbeat_uv);
+	rp->sn.uv.activate_gru_mq_desc_gpa =
+	    uv_gpa(xpc_activate_mq_uv->gru_mq_desc);
+	return 0;
+}
+
+static void
+xpc_allow_hb_uv(short partid)
+{
+}
+
+static void
+xpc_disallow_hb_uv(short partid)
+{
+}
+
+static void
+xpc_disallow_all_hbs_uv(void)
+{
+}
+
+static void
+xpc_increment_heartbeat_uv(void)
+{
+	xpc_heartbeat_uv->value++;
+}
+
+static void
+xpc_offline_heartbeat_uv(void)
+{
+	xpc_increment_heartbeat_uv();
+	xpc_heartbeat_uv->offline = 1;
+}
+
+static void
+xpc_online_heartbeat_uv(void)
+{
+	xpc_increment_heartbeat_uv();
+	xpc_heartbeat_uv->offline = 0;
+}
+
+static void
+xpc_heartbeat_init_uv(void)
+{
+	xpc_heartbeat_uv->value = 1;
+	xpc_heartbeat_uv->offline = 0;
+}
+
+static void
+xpc_heartbeat_exit_uv(void)
+{
+	xpc_offline_heartbeat_uv();
+}
+
+static enum xp_retval
+xpc_get_remote_heartbeat_uv(struct xpc_partition *part)
+{
+	struct xpc_partition_uv *part_uv = &part->sn.uv;
+	enum xp_retval ret;
+
+	ret = xp_remote_memcpy(uv_gpa(&part_uv->cached_heartbeat),
+			       part_uv->heartbeat_gpa,
+			       sizeof(struct xpc_heartbeat_uv));
+	if (ret != xpSuccess)
+		return ret;
+
+	if (part_uv->cached_heartbeat.value == part->last_heartbeat &&
+	    !part_uv->cached_heartbeat.offline) {
+
+		ret = xpNoHeartbeat;
+	} else {
+		part->last_heartbeat = part_uv->cached_heartbeat.value;
+	}
+	return ret;
+}
+
+static void
+xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp,
+				    unsigned long remote_rp_gpa, int nasid)
+{
+	short partid = remote_rp->SAL_partid;
+	struct xpc_partition *part = &xpc_partitions[partid];
+	struct xpc_activate_mq_msg_activate_req_uv msg;
+
+	part->remote_rp_pa = remote_rp_gpa; /* !!! _pa here is really _gpa */
+	part->remote_rp_ts_jiffies = remote_rp->ts_jiffies;
+	part->sn.uv.heartbeat_gpa = remote_rp->sn.uv.heartbeat_gpa;
+	part->sn.uv.activate_gru_mq_desc_gpa =
+	    remote_rp->sn.uv.activate_gru_mq_desc_gpa;
+
+	/*
+	 * ??? Is it a good idea to make this conditional on what is
+	 * ??? potentially stale state information?
+	 */
+	if (part->sn.uv.remote_act_state == XPC_P_AS_INACTIVE) {
+		msg.rp_gpa = uv_gpa(xpc_rsvd_page);
+		msg.heartbeat_gpa = xpc_rsvd_page->sn.uv.heartbeat_gpa;
+		msg.activate_gru_mq_desc_gpa =
+		    xpc_rsvd_page->sn.uv.activate_gru_mq_desc_gpa;
+		xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+					   XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV);
+	}
+
+	if (part->act_state == XPC_P_AS_INACTIVE)
+		xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV);
+}
+
+static void
+xpc_request_partition_reactivation_uv(struct xpc_partition *part)
+{
+	xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV);
+}
+
+static void
+xpc_request_partition_deactivation_uv(struct xpc_partition *part)
+{
+	struct xpc_activate_mq_msg_deactivate_req_uv msg;
+
+	/*
+	 * ??? Is it a good idea to make this conditional on what is
+	 * ??? potentially stale state information?
+	 */
+	if (part->sn.uv.remote_act_state != XPC_P_AS_DEACTIVATING &&
+	    part->sn.uv.remote_act_state != XPC_P_AS_INACTIVE) {
+
+		msg.reason = part->reason;
+		xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+					 XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV);
+	}
+}
+
+static void
+xpc_cancel_partition_deactivation_request_uv(struct xpc_partition *part)
+{
+	/* nothing needs to be done */
+	return;
+}
+
+static void
+xpc_init_fifo_uv(struct xpc_fifo_head_uv *head)
+{
+	head->first = NULL;
+	head->last = NULL;
+	spin_lock_init(&head->lock);
+	head->n_entries = 0;
+}
+
+static void *
+xpc_get_fifo_entry_uv(struct xpc_fifo_head_uv *head)
+{
+	unsigned long irq_flags;
+	struct xpc_fifo_entry_uv *first;
+
+	spin_lock_irqsave(&head->lock, irq_flags);
+	first = head->first;
+	if (head->first != NULL) {
+		head->first = first->next;
+		if (head->first == NULL)
+			head->last = NULL;
+
+		head->n_entries--;
+		BUG_ON(head->n_entries < 0);
+
+		first->next = NULL;
+	}
+	spin_unlock_irqrestore(&head->lock, irq_flags);
+	return first;
+}
+
+static void
+xpc_put_fifo_entry_uv(struct xpc_fifo_head_uv *head,
+		      struct xpc_fifo_entry_uv *last)
+{
+	unsigned long irq_flags;
+
+	last->next = NULL;
+	spin_lock_irqsave(&head->lock, irq_flags);
+	if (head->last != NULL)
+		head->last->next = last;
+	else
+		head->first = last;
+	head->last = last;
+	head->n_entries++;
+	spin_unlock_irqrestore(&head->lock, irq_flags);
+}
+
+static int
+xpc_n_of_fifo_entries_uv(struct xpc_fifo_head_uv *head)
+{
+	return head->n_entries;
+}
+
+/*
+ * Setup the channel structures that are uv specific.
+ */
+static enum xp_retval
+xpc_setup_ch_structures_uv(struct xpc_partition *part)
+{
+	struct xpc_channel_uv *ch_uv;
+	int ch_number;
+
+	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+		ch_uv = &part->channels[ch_number].sn.uv;
+
+		xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
+		xpc_init_fifo_uv(&ch_uv->recv_msg_list);
+	}
+
+	return xpSuccess;
+}
+
+/*
+ * Teardown the channel structures that are uv specific.
+ */
+static void
+xpc_teardown_ch_structures_uv(struct xpc_partition *part)
+{
+	/* nothing needs to be done */
+	return;
+}
+
+static enum xp_retval
+xpc_make_first_contact_uv(struct xpc_partition *part)
+{
+	struct xpc_activate_mq_msg_uv msg;
+
+	/*
+	 * We send a sync msg to get the remote partition's remote_act_state
+	 * updated to our current act_state which at this point should
+	 * be XPC_P_AS_ACTIVATING.
+	 */
+	xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+				      XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV);
+
+	while (!((part->sn.uv.remote_act_state == XPC_P_AS_ACTIVATING) ||
+		 (part->sn.uv.remote_act_state == XPC_P_AS_ACTIVE))) {
+
+		dev_dbg(xpc_part, "waiting to make first contact with "
+			"partition %d\n", XPC_PARTID(part));
+
+		/* wait a 1/4 of a second or so */
+		(void)msleep_interruptible(250);
+
+		if (part->act_state == XPC_P_AS_DEACTIVATING)
+			return part->reason;
+	}
+
+	return xpSuccess;
+}
+
+static u64
+xpc_get_chctl_all_flags_uv(struct xpc_partition *part)
+{
+	unsigned long irq_flags;
+	union xpc_channel_ctl_flags chctl;
+
+	spin_lock_irqsave(&part->chctl_lock, irq_flags);
+	chctl = part->chctl;
+	if (chctl.all_flags != 0)
+		part->chctl.all_flags = 0;
+
+	spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+	return chctl.all_flags;
+}
+
+static enum xp_retval
+xpc_allocate_send_msg_slot_uv(struct xpc_channel *ch)
+{
+	struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+	struct xpc_send_msg_slot_uv *msg_slot;
+	unsigned long irq_flags;
+	int nentries;
+	int entry;
+	size_t nbytes;
+
+	for (nentries = ch->local_nentries; nentries > 0; nentries--) {
+		nbytes = nentries * sizeof(struct xpc_send_msg_slot_uv);
+		ch_uv->send_msg_slots = kzalloc(nbytes, GFP_KERNEL);
+		if (ch_uv->send_msg_slots == NULL)
+			continue;
+
+		for (entry = 0; entry < nentries; entry++) {
+			msg_slot = &ch_uv->send_msg_slots[entry];
+
+			msg_slot->msg_slot_number = entry;
+			xpc_put_fifo_entry_uv(&ch_uv->msg_slot_free_list,
+					      &msg_slot->next);
+		}
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (nentries < ch->local_nentries)
+			ch->local_nentries = nentries;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return xpSuccess;
+	}
+
+	return xpNoMemory;
+}
+
+static enum xp_retval
+xpc_allocate_recv_msg_slot_uv(struct xpc_channel *ch)
+{
+	struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+	struct xpc_notify_mq_msg_uv *msg_slot;
+	unsigned long irq_flags;
+	int nentries;
+	int entry;
+	size_t nbytes;
+
+	for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
+		nbytes = nentries * ch->entry_size;
+		ch_uv->recv_msg_slots = kzalloc(nbytes, GFP_KERNEL);
+		if (ch_uv->recv_msg_slots == NULL)
+			continue;
+
+		for (entry = 0; entry < nentries; entry++) {
+			msg_slot = ch_uv->recv_msg_slots +
+			    entry * ch->entry_size;
+
+			msg_slot->hdr.msg_slot_number = entry;
+		}
+
+		spin_lock_irqsave(&ch->lock, irq_flags);
+		if (nentries < ch->remote_nentries)
+			ch->remote_nentries = nentries;
+		spin_unlock_irqrestore(&ch->lock, irq_flags);
+		return xpSuccess;
+	}
+
+	return xpNoMemory;
+}
+
+/*
+ * Allocate msg_slots associated with the channel.
+ */
+static enum xp_retval
+xpc_setup_msg_structures_uv(struct xpc_channel *ch)
+{
+	static enum xp_retval ret;
+	struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+
+	DBUG_ON(ch->flags & XPC_C_SETUP);
+
+	ch_uv->cached_notify_gru_mq_desc = kmalloc(sizeof(struct
+						   gru_message_queue_desc),
+						   GFP_KERNEL);
+	if (ch_uv->cached_notify_gru_mq_desc == NULL)
+		return xpNoMemory;
+
+	ret = xpc_allocate_send_msg_slot_uv(ch);
+	if (ret == xpSuccess) {
+
+		ret = xpc_allocate_recv_msg_slot_uv(ch);
+		if (ret != xpSuccess) {
+			kfree(ch_uv->send_msg_slots);
+			xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
+		}
+	}
+	return ret;
+}
+
+/*
+ * Free up msg_slots and clear other stuff that were setup for the specified
+ * channel.
+ */
+static void
+xpc_teardown_msg_structures_uv(struct xpc_channel *ch)
+{
+	struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+
+	DBUG_ON(!spin_is_locked(&ch->lock));
+
+	kfree(ch_uv->cached_notify_gru_mq_desc);
+	ch_uv->cached_notify_gru_mq_desc = NULL;
+
+	if (ch->flags & XPC_C_SETUP) {
+		xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
+		kfree(ch_uv->send_msg_slots);
+		xpc_init_fifo_uv(&ch_uv->recv_msg_list);
+		kfree(ch_uv->recv_msg_slots);
+	}
+}
+
+static void
+xpc_send_chctl_closerequest_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_activate_mq_msg_chctl_closerequest_uv msg;
+
+	msg.ch_number = ch->number;
+	msg.reason = ch->reason;
+	xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+				    XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV);
+}
+
+static void
+xpc_send_chctl_closereply_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_activate_mq_msg_chctl_closereply_uv msg;
+
+	msg.ch_number = ch->number;
+	xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+				    XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV);
+}
+
+static void
+xpc_send_chctl_openrequest_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_activate_mq_msg_chctl_openrequest_uv msg;
+
+	msg.ch_number = ch->number;
+	msg.entry_size = ch->entry_size;
+	msg.local_nentries = ch->local_nentries;
+	xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+				    XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV);
+}
+
+static void
+xpc_send_chctl_openreply_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_activate_mq_msg_chctl_openreply_uv msg;
+
+	msg.ch_number = ch->number;
+	msg.local_nentries = ch->local_nentries;
+	msg.remote_nentries = ch->remote_nentries;
+	msg.notify_gru_mq_desc_gpa = uv_gpa(xpc_notify_mq_uv->gru_mq_desc);
+	xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+				    XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV);
+}
+
+static void
+xpc_send_chctl_opencomplete_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+	struct xpc_activate_mq_msg_chctl_opencomplete_uv msg;
+
+	msg.ch_number = ch->number;
+	xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+				    XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV);
+}
+
+static void
+xpc_send_chctl_local_msgrequest_uv(struct xpc_partition *part, int ch_number)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&part->chctl_lock, irq_flags);
+	part->chctl.flags[ch_number] |= XPC_CHCTL_MSGREQUEST;
+	spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+	xpc_wakeup_channel_mgr(part);
+}
+
+static enum xp_retval
+xpc_save_remote_msgqueue_pa_uv(struct xpc_channel *ch,
+			       unsigned long gru_mq_desc_gpa)
+{
+	struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+
+	DBUG_ON(ch_uv->cached_notify_gru_mq_desc == NULL);
+	return xpc_cache_remote_gru_mq_desc_uv(ch_uv->cached_notify_gru_mq_desc,
+					       gru_mq_desc_gpa);
+}
+
+static void
+xpc_indicate_partition_engaged_uv(struct xpc_partition *part)
+{
+	struct xpc_activate_mq_msg_uv msg;
+
+	xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+				      XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV);
+}
+
+static void
+xpc_indicate_partition_disengaged_uv(struct xpc_partition *part)
+{
+	struct xpc_activate_mq_msg_uv msg;
+
+	xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+				      XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV);
+}
+
+static void
+xpc_assume_partition_disengaged_uv(short partid)
+{
+	struct xpc_partition_uv *part_uv = &xpc_partitions[partid].sn.uv;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+	part_uv->flags &= ~XPC_P_ENGAGED_UV;
+	spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+}
+
+static int
+xpc_partition_engaged_uv(short partid)
+{
+	return (xpc_partitions[partid].sn.uv.flags & XPC_P_ENGAGED_UV) != 0;
+}
+
+static int
+xpc_any_partition_engaged_uv(void)
+{
+	struct xpc_partition_uv *part_uv;
+	short partid;
+
+	for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+		part_uv = &xpc_partitions[partid].sn.uv;
+		if ((part_uv->flags & XPC_P_ENGAGED_UV) != 0)
+			return 1;
+	}
+	return 0;
+}
+
+static enum xp_retval
+xpc_allocate_msg_slot_uv(struct xpc_channel *ch, u32 flags,
+			 struct xpc_send_msg_slot_uv **address_of_msg_slot)
+{
+	enum xp_retval ret;
+	struct xpc_send_msg_slot_uv *msg_slot;
+	struct xpc_fifo_entry_uv *entry;
+
+	while (1) {
+		entry = xpc_get_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list);
+		if (entry != NULL)
+			break;
+
+		if (flags & XPC_NOWAIT)
+			return xpNoWait;
+
+		ret = xpc_allocate_msg_wait(ch);
+		if (ret != xpInterrupted && ret != xpTimeout)
+			return ret;
+	}
+
+	msg_slot = container_of(entry, struct xpc_send_msg_slot_uv, next);
+	*address_of_msg_slot = msg_slot;
+	return xpSuccess;
+}
+
+static void
+xpc_free_msg_slot_uv(struct xpc_channel *ch,
+		     struct xpc_send_msg_slot_uv *msg_slot)
+{
+	xpc_put_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list, &msg_slot->next);
+
+	/* wakeup anyone waiting for a free msg slot */
+	if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+		wake_up(&ch->msg_allocate_wq);
+}
+
+static void
+xpc_notify_sender_uv(struct xpc_channel *ch,
+		     struct xpc_send_msg_slot_uv *msg_slot,
+		     enum xp_retval reason)
+{
+	xpc_notify_func func = msg_slot->func;
+
+	if (func != NULL && cmpxchg(&msg_slot->func, func, NULL) == func) {
+
+		atomic_dec(&ch->n_to_notify);
+
+		dev_dbg(xpc_chan, "msg_slot->func() called, msg_slot=0x%p "
+			"msg_slot_number=%d partid=%d channel=%d\n", msg_slot,
+			msg_slot->msg_slot_number, ch->partid, ch->number);
+
+		func(reason, ch->partid, ch->number, msg_slot->key);
+
+		dev_dbg(xpc_chan, "msg_slot->func() returned, msg_slot=0x%p "
+			"msg_slot_number=%d partid=%d channel=%d\n", msg_slot,
+			msg_slot->msg_slot_number, ch->partid, ch->number);
+	}
+}
+
+static void
+xpc_handle_notify_mq_ack_uv(struct xpc_channel *ch,
+			    struct xpc_notify_mq_msg_uv *msg)
+{
+	struct xpc_send_msg_slot_uv *msg_slot;
+	int entry = msg->hdr.msg_slot_number % ch->local_nentries;
+
+	msg_slot = &ch->sn.uv.send_msg_slots[entry];
+
+	BUG_ON(msg_slot->msg_slot_number != msg->hdr.msg_slot_number);
+	msg_slot->msg_slot_number += ch->local_nentries;
+
+	if (msg_slot->func != NULL)
+		xpc_notify_sender_uv(ch, msg_slot, xpMsgDelivered);
+
+	xpc_free_msg_slot_uv(ch, msg_slot);
+}
+
+static void
+xpc_handle_notify_mq_msg_uv(struct xpc_partition *part,
+			    struct xpc_notify_mq_msg_uv *msg)
+{
+	struct xpc_partition_uv *part_uv = &part->sn.uv;
+	struct xpc_channel *ch;
+	struct xpc_channel_uv *ch_uv;
+	struct xpc_notify_mq_msg_uv *msg_slot;
+	unsigned long irq_flags;
+	int ch_number = msg->hdr.ch_number;
+
+	if (unlikely(ch_number >= part->nchannels)) {
+		dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received invalid "
+			"channel number=0x%x in message from partid=%d\n",
+			ch_number, XPC_PARTID(part));
+
+		/* get hb checker to deactivate from the remote partition */
+		spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+		if (part_uv->act_state_req == 0)
+			xpc_activate_IRQ_rcvd++;
+		part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
+		part_uv->reason = xpBadChannelNumber;
+		spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+		wake_up_interruptible(&xpc_activate_IRQ_wq);
+		return;
+	}
+
+	ch = &part->channels[ch_number];
+	xpc_msgqueue_ref(ch);
+
+	if (!(ch->flags & XPC_C_CONNECTED)) {
+		xpc_msgqueue_deref(ch);
+		return;
+	}
+
+	/* see if we're really dealing with an ACK for a previously sent msg */
+	if (msg->hdr.size == 0) {
+		xpc_handle_notify_mq_ack_uv(ch, msg);
+		xpc_msgqueue_deref(ch);
+		return;
+	}
+
+	/* we're dealing with a normal message sent via the notify_mq */
+	ch_uv = &ch->sn.uv;
+
+	msg_slot = ch_uv->recv_msg_slots +
+	    (msg->hdr.msg_slot_number % ch->remote_nentries) * ch->entry_size;
+
+	BUG_ON(msg_slot->hdr.size != 0);
+
+	memcpy(msg_slot, msg, msg->hdr.size);
+
+	xpc_put_fifo_entry_uv(&ch_uv->recv_msg_list, &msg_slot->hdr.u.next);
+
+	if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) {
+		/*
+		 * If there is an existing idle kthread get it to deliver
+		 * the payload, otherwise we'll have to get the channel mgr
+		 * for this partition to create a kthread to do the delivery.
+		 */
+		if (atomic_read(&ch->kthreads_idle) > 0)
+			wake_up_nr(&ch->idle_wq, 1);
+		else
+			xpc_send_chctl_local_msgrequest_uv(part, ch->number);
+	}
+	xpc_msgqueue_deref(ch);
+}
+
+static irqreturn_t
+xpc_handle_notify_IRQ_uv(int irq, void *dev_id)
+{
+	struct xpc_notify_mq_msg_uv *msg;
+	short partid;
+	struct xpc_partition *part;
+
+	while ((msg = gru_get_next_message(xpc_notify_mq_uv->gru_mq_desc)) !=
+	       NULL) {
+
+		partid = msg->hdr.partid;
+		if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) {
+			dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received "
+				"invalid partid=0x%x in message\n", partid);
+		} else {
+			part = &xpc_partitions[partid];
+
+			if (xpc_part_ref(part)) {
+				xpc_handle_notify_mq_msg_uv(part, msg);
+				xpc_part_deref(part);
+			}
+		}
+
+		gru_free_message(xpc_notify_mq_uv->gru_mq_desc, msg);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int
+xpc_n_of_deliverable_payloads_uv(struct xpc_channel *ch)
+{
+	return xpc_n_of_fifo_entries_uv(&ch->sn.uv.recv_msg_list);
+}
+
+static void
+xpc_process_msg_chctl_flags_uv(struct xpc_partition *part, int ch_number)
+{
+	struct xpc_channel *ch = &part->channels[ch_number];
+	int ndeliverable_payloads;
+
+	xpc_msgqueue_ref(ch);
+
+	ndeliverable_payloads = xpc_n_of_deliverable_payloads_uv(ch);
+
+	if (ndeliverable_payloads > 0 &&
+	    (ch->flags & XPC_C_CONNECTED) &&
+	    (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)) {
+
+		xpc_activate_kthreads(ch, ndeliverable_payloads);
+	}
+
+	xpc_msgqueue_deref(ch);
+}
+
+static enum xp_retval
+xpc_send_payload_uv(struct xpc_channel *ch, u32 flags, void *payload,
+		    u16 payload_size, u8 notify_type, xpc_notify_func func,
+		    void *key)
+{
+	enum xp_retval ret = xpSuccess;
+	struct xpc_send_msg_slot_uv *msg_slot = NULL;
+	struct xpc_notify_mq_msg_uv *msg;
+	u8 msg_buffer[XPC_NOTIFY_MSG_SIZE_UV];
+	size_t msg_size;
+
+	DBUG_ON(notify_type != XPC_N_CALL);
+
+	msg_size = sizeof(struct xpc_notify_mq_msghdr_uv) + payload_size;
+	if (msg_size > ch->entry_size)
+		return xpPayloadTooBig;
+
+	xpc_msgqueue_ref(ch);
+
+	if (ch->flags & XPC_C_DISCONNECTING) {
+		ret = ch->reason;
+		goto out_1;
+	}
+	if (!(ch->flags & XPC_C_CONNECTED)) {
+		ret = xpNotConnected;
+		goto out_1;
+	}
+
+	ret = xpc_allocate_msg_slot_uv(ch, flags, &msg_slot);
+	if (ret != xpSuccess)
+		goto out_1;
+
+	if (func != NULL) {
+		atomic_inc(&ch->n_to_notify);
+
+		msg_slot->key = key;
+		smp_wmb(); /* a non-NULL func must hit memory after the key */
+		msg_slot->func = func;
+
+		if (ch->flags & XPC_C_DISCONNECTING) {
+			ret = ch->reason;
+			goto out_2;
+		}
+	}
+
+	msg = (struct xpc_notify_mq_msg_uv *)&msg_buffer;
+	msg->hdr.partid = xp_partition_id;
+	msg->hdr.ch_number = ch->number;
+	msg->hdr.size = msg_size;
+	msg->hdr.msg_slot_number = msg_slot->msg_slot_number;
+	memcpy(&msg->payload, payload, payload_size);
+
+	ret = xpc_send_gru_msg(ch->sn.uv.cached_notify_gru_mq_desc, msg,
+			       msg_size);
+	if (ret == xpSuccess)
+		goto out_1;
+
+	XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret);
+out_2:
+	if (func != NULL) {
+		/*
+		 * Try to NULL the msg_slot's func field. If we fail, then
+		 * xpc_notify_senders_of_disconnect_uv() beat us to it, in which
+		 * case we need to pretend we succeeded to send the message
+		 * since the user will get a callout for the disconnect error
+		 * by xpc_notify_senders_of_disconnect_uv(), and to also get an
+		 * error returned here will confuse them. Additionally, since
+		 * in this case the channel is being disconnected we don't need
+		 * to put the the msg_slot back on the free list.
+		 */
+		if (cmpxchg(&msg_slot->func, func, NULL) != func) {
+			ret = xpSuccess;
+			goto out_1;
+		}
+
+		msg_slot->key = NULL;
+		atomic_dec(&ch->n_to_notify);
+	}
+	xpc_free_msg_slot_uv(ch, msg_slot);
+out_1:
+	xpc_msgqueue_deref(ch);
+	return ret;
+}
+
+/*
+ * Tell the callers of xpc_send_notify() that the status of their payloads
+ * is unknown because the channel is now disconnecting.
+ *
+ * We don't worry about putting these msg_slots on the free list since the
+ * msg_slots themselves are about to be kfree'd.
+ */
+static void
+xpc_notify_senders_of_disconnect_uv(struct xpc_channel *ch)
+{
+	struct xpc_send_msg_slot_uv *msg_slot;
+	int entry;
+
+	DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
+
+	for (entry = 0; entry < ch->local_nentries; entry++) {
+
+		if (atomic_read(&ch->n_to_notify) == 0)
+			break;
+
+		msg_slot = &ch->sn.uv.send_msg_slots[entry];
+		if (msg_slot->func != NULL)
+			xpc_notify_sender_uv(ch, msg_slot, ch->reason);
+	}
+}
+
+/*
+ * Get the next deliverable message's payload.
+ */
+static void *
+xpc_get_deliverable_payload_uv(struct xpc_channel *ch)
+{
+	struct xpc_fifo_entry_uv *entry;
+	struct xpc_notify_mq_msg_uv *msg;
+	void *payload = NULL;
+
+	if (!(ch->flags & XPC_C_DISCONNECTING)) {
+		entry = xpc_get_fifo_entry_uv(&ch->sn.uv.recv_msg_list);
+		if (entry != NULL) {
+			msg = container_of(entry, struct xpc_notify_mq_msg_uv,
+					   hdr.u.next);
+			payload = &msg->payload;
+		}
+	}
+	return payload;
+}
+
+static void
+xpc_received_payload_uv(struct xpc_channel *ch, void *payload)
+{
+	struct xpc_notify_mq_msg_uv *msg;
+	enum xp_retval ret;
+
+	msg = container_of(payload, struct xpc_notify_mq_msg_uv, payload);
+
+	/* return an ACK to the sender of this message */
+
+	msg->hdr.partid = xp_partition_id;
+	msg->hdr.size = 0;	/* size of zero indicates this is an ACK */
+
+	ret = xpc_send_gru_msg(ch->sn.uv.cached_notify_gru_mq_desc, msg,
+			       sizeof(struct xpc_notify_mq_msghdr_uv));
+	if (ret != xpSuccess)
+		XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret);
+}
+
+static struct xpc_arch_operations xpc_arch_ops_uv = {
+	.setup_partitions = xpc_setup_partitions_uv,
+	.teardown_partitions = xpc_teardown_partitions_uv,
+	.process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_uv,
+	.get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_uv,
+	.setup_rsvd_page = xpc_setup_rsvd_page_uv,
+
+	.allow_hb = xpc_allow_hb_uv,
+	.disallow_hb = xpc_disallow_hb_uv,
+	.disallow_all_hbs = xpc_disallow_all_hbs_uv,
+	.increment_heartbeat = xpc_increment_heartbeat_uv,
+	.offline_heartbeat = xpc_offline_heartbeat_uv,
+	.online_heartbeat = xpc_online_heartbeat_uv,
+	.heartbeat_init = xpc_heartbeat_init_uv,
+	.heartbeat_exit = xpc_heartbeat_exit_uv,
+	.get_remote_heartbeat = xpc_get_remote_heartbeat_uv,
+
+	.request_partition_activation =
+		xpc_request_partition_activation_uv,
+	.request_partition_reactivation =
+		xpc_request_partition_reactivation_uv,
+	.request_partition_deactivation =
+		xpc_request_partition_deactivation_uv,
+	.cancel_partition_deactivation_request =
+		xpc_cancel_partition_deactivation_request_uv,
+
+	.setup_ch_structures = xpc_setup_ch_structures_uv,
+	.teardown_ch_structures = xpc_teardown_ch_structures_uv,
+
+	.make_first_contact = xpc_make_first_contact_uv,
+
+	.get_chctl_all_flags = xpc_get_chctl_all_flags_uv,
+	.send_chctl_closerequest = xpc_send_chctl_closerequest_uv,
+	.send_chctl_closereply = xpc_send_chctl_closereply_uv,
+	.send_chctl_openrequest = xpc_send_chctl_openrequest_uv,
+	.send_chctl_openreply = xpc_send_chctl_openreply_uv,
+	.send_chctl_opencomplete = xpc_send_chctl_opencomplete_uv,
+	.process_msg_chctl_flags = xpc_process_msg_chctl_flags_uv,
+
+	.save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_uv,
+
+	.setup_msg_structures = xpc_setup_msg_structures_uv,
+	.teardown_msg_structures = xpc_teardown_msg_structures_uv,
+
+	.indicate_partition_engaged = xpc_indicate_partition_engaged_uv,
+	.indicate_partition_disengaged = xpc_indicate_partition_disengaged_uv,
+	.assume_partition_disengaged = xpc_assume_partition_disengaged_uv,
+	.partition_engaged = xpc_partition_engaged_uv,
+	.any_partition_engaged = xpc_any_partition_engaged_uv,
+
+	.n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_uv,
+	.send_payload = xpc_send_payload_uv,
+	.get_deliverable_payload = xpc_get_deliverable_payload_uv,
+	.received_payload = xpc_received_payload_uv,
+	.notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv,
+};
+
+static int
+xpc_init_mq_node(int nid)
+{
+	int cpu;
+
+	get_online_cpus();
+
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		xpc_activate_mq_uv =
+			xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, nid,
+					     XPC_ACTIVATE_IRQ_NAME,
+					     xpc_handle_activate_IRQ_uv);
+		if (!IS_ERR(xpc_activate_mq_uv))
+			break;
+	}
+	if (IS_ERR(xpc_activate_mq_uv)) {
+		put_online_cpus();
+		return PTR_ERR(xpc_activate_mq_uv);
+	}
+
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		xpc_notify_mq_uv =
+			xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, nid,
+					     XPC_NOTIFY_IRQ_NAME,
+					     xpc_handle_notify_IRQ_uv);
+		if (!IS_ERR(xpc_notify_mq_uv))
+			break;
+	}
+	if (IS_ERR(xpc_notify_mq_uv)) {
+		xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
+		put_online_cpus();
+		return PTR_ERR(xpc_notify_mq_uv);
+	}
+
+	put_online_cpus();
+	return 0;
+}
+
+int
+xpc_init_uv(void)
+{
+	int nid;
+	int ret = 0;
+
+	xpc_arch_ops = xpc_arch_ops_uv;
+
+	if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) {
+		dev_err(xpc_part, "xpc_notify_mq_msghdr_uv is larger than %d\n",
+			XPC_MSG_HDR_MAX_SIZE);
+		return -E2BIG;
+	}
+
+	if (xpc_mq_node < 0)
+		for_each_online_node(nid) {
+			ret = xpc_init_mq_node(nid);
+
+			if (!ret)
+				break;
+		}
+	else
+		ret = xpc_init_mq_node(xpc_mq_node);
+
+	if (ret < 0)
+		dev_err(xpc_part, "xpc_init_mq_node() returned error=%d\n",
+			-ret);
+
+	return ret;
+}
+
+void
+xpc_exit_uv(void)
+{
+	xpc_destroy_gru_mq_uv(xpc_notify_mq_uv);
+	xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
+}
+
+module_param(xpc_mq_node, int, 0);
+MODULE_PARM_DESC(xpc_mq_node, "Node number on which to allocate message queues.");
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
new file mode 100644
index 000000000..44d750d98
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -0,0 +1,596 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2009 Silicon Graphics, Inc. All rights reserved.
+ */
+
+/*
+ * Cross Partition Network Interface (XPNET) support
+ *
+ *	XPNET provides a virtual network layered on top of the Cross
+ *	Partition communication layer.
+ *
+ *	XPNET provides direct point-to-point and broadcast-like support
+ *	for an ethernet-like device.  The ethernet broadcast medium is
+ *	replaced with a point-to-point message structure which passes
+ *	pointers to a DMA-capable block that a remote partition should
+ *	retrieve and pass to the upper level networking layer.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include "xp.h"
+
+/*
+ * The message payload transferred by XPC.
+ *
+ * buf_pa is the physical address where the DMA should pull from.
+ *
+ * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
+ * cacheline boundary.  To accomplish this, we record the number of
+ * bytes from the beginning of the first cacheline to the first useful
+ * byte of the skb (leadin_ignore) and the number of bytes from the
+ * last useful byte of the skb to the end of the last cacheline
+ * (tailout_ignore).
+ *
+ * size is the number of bytes to transfer which includes the skb->len
+ * (useful bytes of the senders skb) plus the leadin and tailout
+ */
+struct xpnet_message {
+	u16 version;		/* Version for this message */
+	u16 embedded_bytes;	/* #of bytes embedded in XPC message */
+	u32 magic;		/* Special number indicating this is xpnet */
+	unsigned long buf_pa;	/* phys address of buffer to retrieve */
+	u32 size;		/* #of bytes in buffer */
+	u8 leadin_ignore;	/* #of bytes to ignore at the beginning */
+	u8 tailout_ignore;	/* #of bytes to ignore at the end */
+	unsigned char data;	/* body of small packets */
+};
+
+/*
+ * Determine the size of our message, the cacheline aligned size,
+ * and then the number of message will request from XPC.
+ *
+ * XPC expects each message to exist in an individual cacheline.
+ */
+#define XPNET_MSG_SIZE		XPC_MSG_PAYLOAD_MAX_SIZE
+#define XPNET_MSG_DATA_MAX	\
+		(XPNET_MSG_SIZE - offsetof(struct xpnet_message, data))
+#define XPNET_MSG_NENTRIES	(PAGE_SIZE / XPC_MSG_MAX_SIZE)
+
+#define XPNET_MAX_KTHREADS	(XPNET_MSG_NENTRIES + 1)
+#define XPNET_MAX_IDLE_KTHREADS	(XPNET_MSG_NENTRIES + 1)
+
+/*
+ * Version number of XPNET implementation. XPNET can always talk to versions
+ * with same major #, and never talk to versions with a different version.
+ */
+#define _XPNET_VERSION(_major, _minor)	(((_major) << 4) | (_minor))
+#define XPNET_VERSION_MAJOR(_v)		((_v) >> 4)
+#define XPNET_VERSION_MINOR(_v)		((_v) & 0xf)
+
+#define	XPNET_VERSION _XPNET_VERSION(1, 0)	/* version 1.0 */
+#define	XPNET_VERSION_EMBED _XPNET_VERSION(1, 1)	/* version 1.1 */
+#define XPNET_MAGIC	0x88786984	/* "XNET" */
+
+#define XPNET_VALID_MSG(_m)						     \
+   ((XPNET_VERSION_MAJOR(_m->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \
+    && (msg->magic == XPNET_MAGIC))
+
+#define XPNET_DEVICE_NAME		"xp0"
+
+/*
+ * When messages are queued with xpc_send_notify, a kmalloc'd buffer
+ * of the following type is passed as a notification cookie.  When the
+ * notification function is called, we use the cookie to decide
+ * whether all outstanding message sends have completed.  The skb can
+ * then be released.
+ */
+struct xpnet_pending_msg {
+	struct sk_buff *skb;
+	atomic_t use_count;
+};
+
+struct net_device *xpnet_device;
+
+/*
+ * When we are notified of other partitions activating, we add them to
+ * our bitmask of partitions to which we broadcast.
+ */
+static unsigned long *xpnet_broadcast_partitions;
+/* protect above */
+static DEFINE_SPINLOCK(xpnet_broadcast_lock);
+
+/*
+ * Since the Block Transfer Engine (BTE) is being used for the transfer
+ * and it relies upon cache-line size transfers, we need to reserve at
+ * least one cache-line for head and tail alignment.  The BTE is
+ * limited to 8MB transfers.
+ *
+ * Testing has shown that changing MTU to greater than 64KB has no effect
+ * on TCP as the two sides negotiate a Max Segment Size that is limited
+ * to 64K.  Other protocols May use packets greater than this, but for
+ * now, the default is 64KB.
+ */
+#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
+/* 68 comes from min TCP+IP+MAC header */
+#define XPNET_MIN_MTU 68
+/* 32KB has been determined to be the ideal */
+#define XPNET_DEF_MTU (0x8000UL)
+
+/*
+ * The partid is encapsulated in the MAC address beginning in the following
+ * octet and it consists of two octets.
+ */
+#define XPNET_PARTID_OCTET	2
+
+/* Define the XPNET debug device structures to be used with dev_dbg() et al */
+
+struct device_driver xpnet_dbg_name = {
+	.name = "xpnet"
+};
+
+struct device xpnet_dbg_subname = {
+	.init_name = "",	/* set to "" */
+	.driver = &xpnet_dbg_name
+};
+
+struct device *xpnet = &xpnet_dbg_subname;
+
+/*
+ * Packet was recevied by XPC and forwarded to us.
+ */
+static void
+xpnet_receive(short partid, int channel, struct xpnet_message *msg)
+{
+	struct sk_buff *skb;
+	void *dst;
+	enum xp_retval ret;
+
+	if (!XPNET_VALID_MSG(msg)) {
+		/*
+		 * Packet with a different XPC version.  Ignore.
+		 */
+		xpc_received(partid, channel, (void *)msg);
+
+		xpnet_device->stats.rx_errors++;
+
+		return;
+	}
+	dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
+		msg->leadin_ignore, msg->tailout_ignore);
+
+	/* reserve an extra cache line */
+	skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
+	if (!skb) {
+		dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
+			msg->size + L1_CACHE_BYTES);
+
+		xpc_received(partid, channel, (void *)msg);
+
+		xpnet_device->stats.rx_errors++;
+
+		return;
+	}
+
+	/*
+	 * The allocated skb has some reserved space.
+	 * In order to use xp_remote_memcpy(), we need to get the
+	 * skb->data pointer moved forward.
+	 */
+	skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
+					    (L1_CACHE_BYTES - 1)) +
+			  msg->leadin_ignore));
+
+	/*
+	 * Update the tail pointer to indicate data actually
+	 * transferred.
+	 */
+	skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
+
+	/*
+	 * Move the data over from the other side.
+	 */
+	if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
+	    (msg->embedded_bytes != 0)) {
+		dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
+			"%lu)\n", skb->data, &msg->data,
+			(size_t)msg->embedded_bytes);
+
+		skb_copy_to_linear_data(skb, &msg->data,
+					(size_t)msg->embedded_bytes);
+	} else {
+		dst = (void *)((u64)skb->data & ~(L1_CACHE_BYTES - 1));
+		dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
+			"xp_remote_memcpy(0x%p, 0x%p, %hu)\n", dst,
+					  (void *)msg->buf_pa, msg->size);
+
+		ret = xp_remote_memcpy(xp_pa(dst), msg->buf_pa, msg->size);
+		if (ret != xpSuccess) {
+			/*
+			 * !!! Need better way of cleaning skb.  Currently skb
+			 * !!! appears in_use and we can't just call
+			 * !!! dev_kfree_skb.
+			 */
+			dev_err(xpnet, "xp_remote_memcpy(0x%p, 0x%p, 0x%hx) "
+				"returned error=0x%x\n", dst,
+				(void *)msg->buf_pa, msg->size, ret);
+
+			xpc_received(partid, channel, (void *)msg);
+
+			xpnet_device->stats.rx_errors++;
+
+			return;
+		}
+	}
+
+	dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+		"skb->end=0x%p skb->len=%d\n", (void *)skb->head,
+		(void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
+		skb->len);
+
+	skb->protocol = eth_type_trans(skb, xpnet_device);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	dev_dbg(xpnet, "passing skb to network layer\n"
+		"\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+		"skb->end=0x%p skb->len=%d\n",
+		(void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
+		skb_end_pointer(skb), skb->len);
+
+	xpnet_device->stats.rx_packets++;
+	xpnet_device->stats.rx_bytes += skb->len + ETH_HLEN;
+
+	netif_rx_ni(skb);
+	xpc_received(partid, channel, (void *)msg);
+}
+
+/*
+ * This is the handler which XPC calls during any sort of change in
+ * state or message reception on a connection.
+ */
+static void
+xpnet_connection_activity(enum xp_retval reason, short partid, int channel,
+			  void *data, void *key)
+{
+	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+	DBUG_ON(channel != XPC_NET_CHANNEL);
+
+	switch (reason) {
+	case xpMsgReceived:	/* message received */
+		DBUG_ON(data == NULL);
+
+		xpnet_receive(partid, channel, (struct xpnet_message *)data);
+		break;
+
+	case xpConnected:	/* connection completed to a partition */
+		spin_lock_bh(&xpnet_broadcast_lock);
+		__set_bit(partid, xpnet_broadcast_partitions);
+		spin_unlock_bh(&xpnet_broadcast_lock);
+
+		netif_carrier_on(xpnet_device);
+
+		dev_dbg(xpnet, "%s connected to partition %d\n",
+			xpnet_device->name, partid);
+		break;
+
+	default:
+		spin_lock_bh(&xpnet_broadcast_lock);
+		__clear_bit(partid, xpnet_broadcast_partitions);
+		spin_unlock_bh(&xpnet_broadcast_lock);
+
+		if (bitmap_empty((unsigned long *)xpnet_broadcast_partitions,
+				 xp_max_npartitions)) {
+			netif_carrier_off(xpnet_device);
+		}
+
+		dev_dbg(xpnet, "%s disconnected from partition %d\n",
+			xpnet_device->name, partid);
+		break;
+	}
+}
+
+static int
+xpnet_dev_open(struct net_device *dev)
+{
+	enum xp_retval ret;
+
+	dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
+		"%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
+		(unsigned long)XPNET_MSG_SIZE,
+		(unsigned long)XPNET_MSG_NENTRIES,
+		(unsigned long)XPNET_MAX_KTHREADS,
+		(unsigned long)XPNET_MAX_IDLE_KTHREADS);
+
+	ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
+			  XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
+			  XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
+	if (ret != xpSuccess) {
+		dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
+			"ret=%d\n", dev->name, ret);
+
+		return -ENOMEM;
+	}
+
+	dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
+
+	return 0;
+}
+
+static int
+xpnet_dev_stop(struct net_device *dev)
+{
+	xpc_disconnect(XPC_NET_CHANNEL);
+
+	dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
+
+	return 0;
+}
+
+/*
+ * Notification that the other end has received the message and
+ * DMA'd the skb information.  At this point, they are done with
+ * our side.  When all recipients are done processing, we
+ * release the skb and then release our pending message structure.
+ */
+static void
+xpnet_send_completed(enum xp_retval reason, short partid, int channel,
+		     void *__qm)
+{
+	struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm;
+
+	DBUG_ON(queued_msg == NULL);
+
+	dev_dbg(xpnet, "message to %d notified with reason %d\n",
+		partid, reason);
+
+	if (atomic_dec_return(&queued_msg->use_count) == 0) {
+		dev_dbg(xpnet, "all acks for skb->head=-x%p\n",
+			(void *)queued_msg->skb->head);
+
+		dev_kfree_skb_any(queued_msg->skb);
+		kfree(queued_msg);
+	}
+}
+
+static void
+xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg,
+	   u64 start_addr, u64 end_addr, u16 embedded_bytes, int dest_partid)
+{
+	u8 msg_buffer[XPNET_MSG_SIZE];
+	struct xpnet_message *msg = (struct xpnet_message *)&msg_buffer;
+	u16 msg_size = sizeof(struct xpnet_message);
+	enum xp_retval ret;
+
+	msg->embedded_bytes = embedded_bytes;
+	if (unlikely(embedded_bytes != 0)) {
+		msg->version = XPNET_VERSION_EMBED;
+		dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
+			&msg->data, skb->data, (size_t)embedded_bytes);
+		skb_copy_from_linear_data(skb, &msg->data,
+					  (size_t)embedded_bytes);
+		msg_size += embedded_bytes - 1;
+	} else {
+		msg->version = XPNET_VERSION;
+	}
+	msg->magic = XPNET_MAGIC;
+	msg->size = end_addr - start_addr;
+	msg->leadin_ignore = (u64)skb->data - start_addr;
+	msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
+	msg->buf_pa = xp_pa((void *)start_addr);
+
+	dev_dbg(xpnet, "sending XPC message to %d:%d\n"
+		"msg->buf_pa=0x%lx, msg->size=%u, "
+		"msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
+		dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
+		msg->leadin_ignore, msg->tailout_ignore);
+
+	atomic_inc(&queued_msg->use_count);
+
+	ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, XPC_NOWAIT, msg,
+			      msg_size, xpnet_send_completed, queued_msg);
+	if (unlikely(ret != xpSuccess))
+		atomic_dec(&queued_msg->use_count);
+}
+
+/*
+ * Network layer has formatted a packet (skb) and is ready to place it
+ * "on the wire".  Prepare and send an xpnet_message to all partitions
+ * which have connected with us and are targets of this packet.
+ *
+ * MAC-NOTE:  For the XPNET driver, the MAC address contains the
+ * destination partid.  If the destination partid octets are 0xffff,
+ * this packet is to be broadcast to all connected partitions.
+ */
+static netdev_tx_t
+xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct xpnet_pending_msg *queued_msg;
+	u64 start_addr, end_addr;
+	short dest_partid;
+	u16 embedded_bytes = 0;
+
+	dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+		"skb->end=0x%p skb->len=%d\n", (void *)skb->head,
+		(void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
+		skb->len);
+
+	if (skb->data[0] == 0x33) {
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;	/* nothing needed to be done */
+	}
+
+	/*
+	 * The xpnet_pending_msg tracks how many outstanding
+	 * xpc_send_notifies are relying on this skb.  When none
+	 * remain, release the skb.
+	 */
+	queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
+	if (queued_msg == NULL) {
+		dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
+			 "packet\n", sizeof(struct xpnet_pending_msg));
+
+		dev->stats.tx_errors++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* get the beginning of the first cacheline and end of last */
+	start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1));
+	end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
+
+	/* calculate how many bytes to embed in the XPC message */
+	if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
+		/* skb->data does fit so embed */
+		embedded_bytes = skb->len;
+	}
+
+	/*
+	 * Since the send occurs asynchronously, we set the count to one
+	 * and begin sending.  Any sends that happen to complete before
+	 * we are done sending will not free the skb.  We will be left
+	 * with that task during exit.  This also handles the case of
+	 * a packet destined for a partition which is no longer up.
+	 */
+	atomic_set(&queued_msg->use_count, 1);
+	queued_msg->skb = skb;
+
+	if (skb->data[0] == 0xff) {
+		/* we are being asked to broadcast to all partitions */
+		for_each_set_bit(dest_partid, xpnet_broadcast_partitions,
+			     xp_max_npartitions) {
+
+			xpnet_send(skb, queued_msg, start_addr, end_addr,
+				   embedded_bytes, dest_partid);
+		}
+	} else {
+		dest_partid = (short)skb->data[XPNET_PARTID_OCTET + 1];
+		dest_partid |= (short)skb->data[XPNET_PARTID_OCTET + 0] << 8;
+
+		if (dest_partid >= 0 &&
+		    dest_partid < xp_max_npartitions &&
+		    test_bit(dest_partid, xpnet_broadcast_partitions) != 0) {
+
+			xpnet_send(skb, queued_msg, start_addr, end_addr,
+				   embedded_bytes, dest_partid);
+		}
+	}
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	if (atomic_dec_return(&queued_msg->use_count) == 0) {
+		dev_kfree_skb(skb);
+		kfree(queued_msg);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Deal with transmit timeouts coming from the network layer.
+ */
+static void
+xpnet_dev_tx_timeout(struct net_device *dev)
+{
+	dev->stats.tx_errors++;
+}
+
+static const struct net_device_ops xpnet_netdev_ops = {
+	.ndo_open		= xpnet_dev_open,
+	.ndo_stop		= xpnet_dev_stop,
+	.ndo_start_xmit		= xpnet_dev_hard_start_xmit,
+	.ndo_tx_timeout		= xpnet_dev_tx_timeout,
+	.ndo_set_mac_address 	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+static int __init
+xpnet_init(void)
+{
+	int result;
+
+	if (!is_shub() && !is_uv())
+		return -ENODEV;
+
+	dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
+
+	xpnet_broadcast_partitions = kcalloc(BITS_TO_LONGS(xp_max_npartitions),
+					     sizeof(long),
+					     GFP_KERNEL);
+	if (xpnet_broadcast_partitions == NULL)
+		return -ENOMEM;
+
+	/*
+	 * use ether_setup() to init the majority of our device
+	 * structure and then override the necessary pieces.
+	 */
+	xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_UNKNOWN,
+				    ether_setup);
+	if (xpnet_device == NULL) {
+		kfree(xpnet_broadcast_partitions);
+		return -ENOMEM;
+	}
+
+	netif_carrier_off(xpnet_device);
+
+	xpnet_device->netdev_ops = &xpnet_netdev_ops;
+	xpnet_device->mtu = XPNET_DEF_MTU;
+	xpnet_device->min_mtu = XPNET_MIN_MTU;
+	xpnet_device->max_mtu = XPNET_MAX_MTU;
+
+	/*
+	 * Multicast assumes the LSB of the first octet is set for multicast
+	 * MAC addresses.  We chose the first octet of the MAC to be unlikely
+	 * to collide with any vendor's officially issued MAC.
+	 */
+	xpnet_device->dev_addr[0] = 0x02;     /* locally administered, no OUI */
+
+	xpnet_device->dev_addr[XPNET_PARTID_OCTET + 1] = xp_partition_id;
+	xpnet_device->dev_addr[XPNET_PARTID_OCTET + 0] = (xp_partition_id >> 8);
+
+	/*
+	 * ether_setup() sets this to a multicast device.  We are
+	 * really not supporting multicast at this time.
+	 */
+	xpnet_device->flags &= ~IFF_MULTICAST;
+
+	/*
+	 * No need to checksum as it is a DMA transfer.  The BTE will
+	 * report an error if the data is not retrievable and the
+	 * packet will be dropped.
+	 */
+	xpnet_device->features = NETIF_F_HW_CSUM;
+
+	result = register_netdev(xpnet_device);
+	if (result != 0) {
+		free_netdev(xpnet_device);
+		kfree(xpnet_broadcast_partitions);
+	}
+
+	return result;
+}
+
+module_init(xpnet_init);
+
+static void __exit
+xpnet_exit(void)
+{
+	dev_info(xpnet, "unregistering network device %s\n",
+		 xpnet_device[0].name);
+
+	unregister_netdev(xpnet_device);
+	free_netdev(xpnet_device);
+	kfree(xpnet_broadcast_partitions);
+}
+
+module_exit(xpnet_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
+MODULE_LICENSE("GPL");