summaryrefslogtreecommitdiffstats
path: root/drivers/misc/sgi-xp
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /drivers/misc/sgi-xp
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/misc/sgi-xp')
-rw-r--r--drivers/misc/sgi-xp/Makefile13
-rw-r--r--drivers/misc/sgi-xp/xp.h341
-rw-r--r--drivers/misc/sgi-xp/xp_main.c261
-rw-r--r--drivers/misc/sgi-xp/xp_uv.c175
-rw-r--r--drivers/misc/sgi-xp/xpc.h732
-rw-r--r--drivers/misc/sgi-xp/xpc_channel.c1011
-rw-r--r--drivers/misc/sgi-xp/xpc_main.c1349
-rw-r--r--drivers/misc/sgi-xp/xpc_partition.c545
-rw-r--r--drivers/misc/sgi-xp/xpc_uv.c1817
-rw-r--r--drivers/misc/sgi-xp/xpnet.c599
10 files changed, 6843 insertions, 0 deletions
diff --git a/drivers/misc/sgi-xp/Makefile b/drivers/misc/sgi-xp/Makefile
new file mode 100644
index 000000000..34c55a404
--- /dev/null
+++ b/drivers/misc/sgi-xp/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for SGI's XP devices.
+#
+
+obj-$(CONFIG_SGI_XP) += xp.o
+xp-y := xp_main.o xp_uv.o
+
+obj-$(CONFIG_SGI_XP) += xpc.o
+xpc-y := xpc_main.o xpc_channel.o xpc_partition.o \
+ xpc_uv.o
+
+obj-$(CONFIG_SGI_XP) += xpnet.o
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
new file mode 100644
index 000000000..f1336f43d
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp.h
@@ -0,0 +1,341 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 2004-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+/*
+ * External Cross Partition (XP) structures and defines.
+ */
+
+#ifndef _DRIVERS_MISC_SGIXP_XP_H
+#define _DRIVERS_MISC_SGIXP_XP_H
+
+#include <linux/mutex.h>
+
+#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV
+#include <asm/uv/uv.h>
+#endif
+
+#ifdef USE_DBUG_ON
+#define DBUG_ON(condition) BUG_ON(condition)
+#else
+#define DBUG_ON(condition)
+#endif
+
+/*
+ * Define the maximum number of partitions the system can possibly support.
+ * It is based on the maximum number of hardware partitionable regions. The
+ * term 'region' in this context refers to the minimum number of nodes that
+ * can comprise an access protection grouping. The access protection is in
+ * regards to memory, IPI and IOI.
+ *
+ * The maximum number of hardware partitionable regions is equal to the
+ * maximum number of nodes in the entire system divided by the minimum number
+ * of nodes that comprise an access protection grouping.
+ */
+#define XP_MAX_NPARTITIONS_SN2 64
+#define XP_MAX_NPARTITIONS_UV 256
+
+/*
+ * XPC establishes channel connections between the local partition and any
+ * other partition that is currently up. Over these channels, kernel-level
+ * `users' can communicate with their counterparts on the other partitions.
+ *
+ * If the need for additional channels arises, one can simply increase
+ * XPC_MAX_NCHANNELS accordingly. If the day should come where that number
+ * exceeds the absolute MAXIMUM number of channels possible (eight), then one
+ * will need to make changes to the XPC code to accommodate for this.
+ *
+ * The absolute maximum number of channels possible is limited to eight for
+ * performance reasons on sn2 hardware. The internal cross partition structures
+ * require sixteen bytes per channel, and eight allows all of this
+ * interface-shared info to fit in one 128-byte cacheline.
+ */
+#define XPC_MEM_CHANNEL 0 /* memory channel number */
+#define XPC_NET_CHANNEL 1 /* network channel number */
+
+#define XPC_MAX_NCHANNELS 2 /* max #of channels allowed */
+
+#if XPC_MAX_NCHANNELS > 8
+#error XPC_MAX_NCHANNELS exceeds absolute MAXIMUM possible.
+#endif
+
+/*
+ * Define macro, XPC_MSG_SIZE(), is provided for the user
+ * that wants to fit as many msg entries as possible in a given memory size
+ * (e.g. a memory page).
+ */
+#define XPC_MSG_MAX_SIZE 128
+#define XPC_MSG_HDR_MAX_SIZE 16
+#define XPC_MSG_PAYLOAD_MAX_SIZE (XPC_MSG_MAX_SIZE - XPC_MSG_HDR_MAX_SIZE)
+
+#define XPC_MSG_SIZE(_payload_size) \
+ ALIGN(XPC_MSG_HDR_MAX_SIZE + (_payload_size), \
+ is_uv_system() ? 64 : 128)
+
+
+/*
+ * Define the return values and values passed to user's callout functions.
+ * (It is important to add new value codes at the end just preceding
+ * xpUnknownReason, which must have the highest numerical value.)
+ */
+enum xp_retval {
+ xpSuccess = 0,
+
+ xpNotConnected, /* 1: channel is not connected */
+ xpConnected, /* 2: channel connected (opened) */
+ xpRETIRED1, /* 3: (formerly xpDisconnected) */
+
+ xpMsgReceived, /* 4: message received */
+ xpMsgDelivered, /* 5: message delivered and acknowledged */
+
+ xpRETIRED2, /* 6: (formerly xpTransferFailed) */
+
+ xpNoWait, /* 7: operation would require wait */
+ xpRetry, /* 8: retry operation */
+ xpTimeout, /* 9: timeout in xpc_allocate_msg_wait() */
+ xpInterrupted, /* 10: interrupted wait */
+
+ xpUnequalMsgSizes, /* 11: message size disparity between sides */
+ xpInvalidAddress, /* 12: invalid address */
+
+ xpNoMemory, /* 13: no memory available for XPC structures */
+ xpLackOfResources, /* 14: insufficient resources for operation */
+ xpUnregistered, /* 15: channel is not registered */
+ xpAlreadyRegistered, /* 16: channel is already registered */
+
+ xpPartitionDown, /* 17: remote partition is down */
+ xpNotLoaded, /* 18: XPC module is not loaded */
+ xpUnloading, /* 19: this side is unloading XPC module */
+
+ xpBadMagic, /* 20: XPC MAGIC string not found */
+
+ xpReactivating, /* 21: remote partition was reactivated */
+
+ xpUnregistering, /* 22: this side is unregistering channel */
+ xpOtherUnregistering, /* 23: other side is unregistering channel */
+
+ xpCloneKThread, /* 24: cloning kernel thread */
+ xpCloneKThreadFailed, /* 25: cloning kernel thread failed */
+
+ xpNoHeartbeat, /* 26: remote partition has no heartbeat */
+
+ xpPioReadError, /* 27: PIO read error */
+ xpPhysAddrRegFailed, /* 28: registration of phys addr range failed */
+
+ xpRETIRED3, /* 29: (formerly xpBteDirectoryError) */
+ xpRETIRED4, /* 30: (formerly xpBtePoisonError) */
+ xpRETIRED5, /* 31: (formerly xpBteWriteError) */
+ xpRETIRED6, /* 32: (formerly xpBteAccessError) */
+ xpRETIRED7, /* 33: (formerly xpBtePWriteError) */
+ xpRETIRED8, /* 34: (formerly xpBtePReadError) */
+ xpRETIRED9, /* 35: (formerly xpBteTimeOutError) */
+ xpRETIRED10, /* 36: (formerly xpBteXtalkError) */
+ xpRETIRED11, /* 37: (formerly xpBteNotAvailable) */
+ xpRETIRED12, /* 38: (formerly xpBteUnmappedError) */
+
+ xpBadVersion, /* 39: bad version number */
+ xpVarsNotSet, /* 40: the XPC variables are not set up */
+ xpNoRsvdPageAddr, /* 41: unable to get rsvd page's phys addr */
+ xpInvalidPartid, /* 42: invalid partition ID */
+ xpLocalPartid, /* 43: local partition ID */
+
+ xpOtherGoingDown, /* 44: other side going down, reason unknown */
+ xpSystemGoingDown, /* 45: system is going down, reason unknown */
+ xpSystemHalt, /* 46: system is being halted */
+ xpSystemReboot, /* 47: system is being rebooted */
+ xpSystemPoweroff, /* 48: system is being powered off */
+
+ xpDisconnecting, /* 49: channel disconnecting (closing) */
+
+ xpOpenCloseError, /* 50: channel open/close protocol error */
+
+ xpDisconnected, /* 51: channel disconnected (closed) */
+
+ xpBteCopyError, /* 52: bte_copy() returned error */
+ xpSalError, /* 53: sn SAL error */
+ xpRsvdPageNotSet, /* 54: the reserved page is not set up */
+ xpPayloadTooBig, /* 55: payload too large for message slot */
+
+ xpUnsupported, /* 56: unsupported functionality or resource */
+ xpNeedMoreInfo, /* 57: more info is needed by SAL */
+
+ xpGruCopyError, /* 58: gru_copy_gru() returned error */
+ xpGruSendMqError, /* 59: gru send message queue related error */
+
+ xpBadChannelNumber, /* 60: invalid channel number */
+ xpBadMsgType, /* 61: invalid message type */
+ xpBiosError, /* 62: BIOS error */
+
+ xpUnknownReason /* 63: unknown reason - must be last in enum */
+};
+
+/*
+ * Define the callout function type used by XPC to update the user on
+ * connection activity and state changes via the user function registered
+ * by xpc_connect().
+ *
+ * Arguments:
+ *
+ * reason - reason code.
+ * partid - partition ID associated with condition.
+ * ch_number - channel # associated with condition.
+ * data - pointer to optional data.
+ * key - pointer to optional user-defined value provided as the "key"
+ * argument to xpc_connect().
+ *
+ * A reason code of xpConnected indicates that a connection has been
+ * established to the specified partition on the specified channel. The data
+ * argument indicates the max number of entries allowed in the message queue.
+ *
+ * A reason code of xpMsgReceived indicates that a XPC message arrived from
+ * the specified partition on the specified channel. The data argument
+ * specifies the address of the message's payload. The user must call
+ * xpc_received() when finished with the payload.
+ *
+ * All other reason codes indicate failure. The data argmument is NULL.
+ * When a failure reason code is received, one can assume that the channel
+ * is not connected.
+ */
+typedef void (*xpc_channel_func) (enum xp_retval reason, short partid,
+ int ch_number, void *data, void *key);
+
+/*
+ * Define the callout function type used by XPC to notify the user of
+ * messages received and delivered via the user function registered by
+ * xpc_send_notify().
+ *
+ * Arguments:
+ *
+ * reason - reason code.
+ * partid - partition ID associated with condition.
+ * ch_number - channel # associated with condition.
+ * key - pointer to optional user-defined value provided as the "key"
+ * argument to xpc_send_notify().
+ *
+ * A reason code of xpMsgDelivered indicates that the message was delivered
+ * to the intended recipient and that they have acknowledged its receipt by
+ * calling xpc_received().
+ *
+ * All other reason codes indicate failure.
+ *
+ * NOTE: The user defined function must be callable by an interrupt handler
+ * and thus cannot block.
+ */
+typedef void (*xpc_notify_func) (enum xp_retval reason, short partid,
+ int ch_number, void *key);
+
+/*
+ * The following is a registration entry. There is a global array of these,
+ * one per channel. It is used to record the connection registration made
+ * by the users of XPC. As long as a registration entry exists, for any
+ * partition that comes up, XPC will attempt to establish a connection on
+ * that channel. Notification that a connection has been made will occur via
+ * the xpc_channel_func function.
+ *
+ * The 'func' field points to the function to call when aynchronous
+ * notification is required for such events as: a connection established/lost,
+ * or an incoming message received, or an error condition encountered. A
+ * non-NULL 'func' field indicates that there is an active registration for
+ * the channel.
+ */
+struct xpc_registration {
+ struct mutex mutex;
+ xpc_channel_func func; /* function to call */
+ void *key; /* pointer to user's key */
+ u16 nentries; /* #of msg entries in local msg queue */
+ u16 entry_size; /* message queue's message entry size */
+ u32 assigned_limit; /* limit on #of assigned kthreads */
+ u32 idle_limit; /* limit on #of idle kthreads */
+} ____cacheline_aligned;
+
+#define XPC_CHANNEL_REGISTERED(_c) (xpc_registrations[_c].func != NULL)
+
+/* the following are valid xpc_send() or xpc_send_notify() flags */
+#define XPC_WAIT 0 /* wait flag */
+#define XPC_NOWAIT 1 /* no wait flag */
+
+struct xpc_interface {
+ void (*connect) (int);
+ void (*disconnect) (int);
+ enum xp_retval (*send) (short, int, u32, void *, u16);
+ enum xp_retval (*send_notify) (short, int, u32, void *, u16,
+ xpc_notify_func, void *);
+ void (*received) (short, int, void *);
+ enum xp_retval (*partid_to_nasids) (short, void *);
+};
+
+extern struct xpc_interface xpc_interface;
+
+extern void xpc_set_interface(void (*)(int),
+ void (*)(int),
+ enum xp_retval (*)(short, int, u32, void *, u16),
+ enum xp_retval (*)(short, int, u32, void *, u16,
+ xpc_notify_func, void *),
+ void (*)(short, int, void *),
+ enum xp_retval (*)(short, void *));
+extern void xpc_clear_interface(void);
+
+extern enum xp_retval xpc_connect(int, xpc_channel_func, void *, u16,
+ u16, u32, u32);
+extern void xpc_disconnect(int);
+
+static inline enum xp_retval
+xpc_send(short partid, int ch_number, u32 flags, void *payload,
+ u16 payload_size)
+{
+ if (!xpc_interface.send)
+ return xpNotLoaded;
+
+ return xpc_interface.send(partid, ch_number, flags, payload,
+ payload_size);
+}
+
+static inline enum xp_retval
+xpc_send_notify(short partid, int ch_number, u32 flags, void *payload,
+ u16 payload_size, xpc_notify_func func, void *key)
+{
+ if (!xpc_interface.send_notify)
+ return xpNotLoaded;
+
+ return xpc_interface.send_notify(partid, ch_number, flags, payload,
+ payload_size, func, key);
+}
+
+static inline void
+xpc_received(short partid, int ch_number, void *payload)
+{
+ if (xpc_interface.received)
+ xpc_interface.received(partid, ch_number, payload);
+}
+
+static inline enum xp_retval
+xpc_partid_to_nasids(short partid, void *nasids)
+{
+ if (!xpc_interface.partid_to_nasids)
+ return xpNotLoaded;
+
+ return xpc_interface.partid_to_nasids(partid, nasids);
+}
+
+extern short xp_max_npartitions;
+extern short xp_partition_id;
+extern u8 xp_region_size;
+
+extern unsigned long (*xp_pa) (void *);
+extern unsigned long (*xp_socket_pa) (unsigned long);
+extern enum xp_retval (*xp_remote_memcpy) (unsigned long, const unsigned long,
+ size_t);
+extern int (*xp_cpu_to_nasid) (int);
+extern enum xp_retval (*xp_expand_memprotect) (unsigned long, unsigned long);
+extern enum xp_retval (*xp_restrict_memprotect) (unsigned long, unsigned long);
+
+extern struct device *xp;
+extern enum xp_retval xp_init_uv(void);
+extern void xp_exit_uv(void);
+
+#endif /* _DRIVERS_MISC_SGIXP_XP_H */
diff --git a/drivers/misc/sgi-xp/xp_main.c b/drivers/misc/sgi-xp/xp_main.c
new file mode 100644
index 000000000..87d156c15
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_main.c
@@ -0,0 +1,261 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition (XP) base.
+ *
+ * XP provides a base from which its users can interact
+ * with XPC, yet not be dependent on XPC.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include "xp.h"
+
+/* define the XP debug device structures to be used with dev_dbg() et al */
+
+static struct device_driver xp_dbg_name = {
+ .name = "xp"
+};
+
+static struct device xp_dbg_subname = {
+ .init_name = "", /* set to "" */
+ .driver = &xp_dbg_name
+};
+
+struct device *xp = &xp_dbg_subname;
+
+/* max #of partitions possible */
+short xp_max_npartitions;
+EXPORT_SYMBOL_GPL(xp_max_npartitions);
+
+short xp_partition_id;
+EXPORT_SYMBOL_GPL(xp_partition_id);
+
+u8 xp_region_size;
+EXPORT_SYMBOL_GPL(xp_region_size);
+
+unsigned long (*xp_pa) (void *addr);
+EXPORT_SYMBOL_GPL(xp_pa);
+
+unsigned long (*xp_socket_pa) (unsigned long gpa);
+EXPORT_SYMBOL_GPL(xp_socket_pa);
+
+enum xp_retval (*xp_remote_memcpy) (unsigned long dst_gpa,
+ const unsigned long src_gpa, size_t len);
+EXPORT_SYMBOL_GPL(xp_remote_memcpy);
+
+int (*xp_cpu_to_nasid) (int cpuid);
+EXPORT_SYMBOL_GPL(xp_cpu_to_nasid);
+
+enum xp_retval (*xp_expand_memprotect) (unsigned long phys_addr,
+ unsigned long size);
+EXPORT_SYMBOL_GPL(xp_expand_memprotect);
+enum xp_retval (*xp_restrict_memprotect) (unsigned long phys_addr,
+ unsigned long size);
+EXPORT_SYMBOL_GPL(xp_restrict_memprotect);
+
+/*
+ * xpc_registrations[] keeps track of xpc_connect()'s done by the kernel-level
+ * users of XPC.
+ */
+struct xpc_registration xpc_registrations[XPC_MAX_NCHANNELS];
+EXPORT_SYMBOL_GPL(xpc_registrations);
+
+/*
+ * Initialize the XPC interface to NULL to indicate that XPC isn't loaded.
+ */
+struct xpc_interface xpc_interface = { };
+EXPORT_SYMBOL_GPL(xpc_interface);
+
+/*
+ * XPC calls this when it (the XPC module) has been loaded.
+ */
+void
+xpc_set_interface(void (*connect) (int),
+ void (*disconnect) (int),
+ enum xp_retval (*send) (short, int, u32, void *, u16),
+ enum xp_retval (*send_notify) (short, int, u32, void *, u16,
+ xpc_notify_func, void *),
+ void (*received) (short, int, void *),
+ enum xp_retval (*partid_to_nasids) (short, void *))
+{
+ xpc_interface.connect = connect;
+ xpc_interface.disconnect = disconnect;
+ xpc_interface.send = send;
+ xpc_interface.send_notify = send_notify;
+ xpc_interface.received = received;
+ xpc_interface.partid_to_nasids = partid_to_nasids;
+}
+EXPORT_SYMBOL_GPL(xpc_set_interface);
+
+/*
+ * XPC calls this when it (the XPC module) is being unloaded.
+ */
+void
+xpc_clear_interface(void)
+{
+ memset(&xpc_interface, 0, sizeof(xpc_interface));
+}
+EXPORT_SYMBOL_GPL(xpc_clear_interface);
+
+/*
+ * Register for automatic establishment of a channel connection whenever
+ * a partition comes up.
+ *
+ * Arguments:
+ *
+ * ch_number - channel # to register for connection.
+ * func - function to call for asynchronous notification of channel
+ * state changes (i.e., connection, disconnection, error) and
+ * the arrival of incoming messages.
+ * key - pointer to optional user-defined value that gets passed back
+ * to the user on any callouts made to func.
+ * payload_size - size in bytes of the XPC message's payload area which
+ * contains a user-defined message. The user should make
+ * this large enough to hold their largest message.
+ * nentries - max #of XPC message entries a message queue can contain.
+ * The actual number, which is determined when a connection
+ * is established and may be less then requested, will be
+ * passed to the user via the xpConnected callout.
+ * assigned_limit - max number of kthreads allowed to be processing
+ * messages (per connection) at any given instant.
+ * idle_limit - max number of kthreads allowed to be idle at any given
+ * instant.
+ */
+enum xp_retval
+xpc_connect(int ch_number, xpc_channel_func func, void *key, u16 payload_size,
+ u16 nentries, u32 assigned_limit, u32 idle_limit)
+{
+ struct xpc_registration *registration;
+
+ DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+ DBUG_ON(payload_size == 0 || nentries == 0);
+ DBUG_ON(func == NULL);
+ DBUG_ON(assigned_limit == 0 || idle_limit > assigned_limit);
+
+ if (XPC_MSG_SIZE(payload_size) > XPC_MSG_MAX_SIZE)
+ return xpPayloadTooBig;
+
+ registration = &xpc_registrations[ch_number];
+
+ if (mutex_lock_interruptible(&registration->mutex) != 0)
+ return xpInterrupted;
+
+ /* if XPC_CHANNEL_REGISTERED(ch_number) */
+ if (registration->func != NULL) {
+ mutex_unlock(&registration->mutex);
+ return xpAlreadyRegistered;
+ }
+
+ /* register the channel for connection */
+ registration->entry_size = XPC_MSG_SIZE(payload_size);
+ registration->nentries = nentries;
+ registration->assigned_limit = assigned_limit;
+ registration->idle_limit = idle_limit;
+ registration->key = key;
+ registration->func = func;
+
+ mutex_unlock(&registration->mutex);
+
+ if (xpc_interface.connect)
+ xpc_interface.connect(ch_number);
+
+ return xpSuccess;
+}
+EXPORT_SYMBOL_GPL(xpc_connect);
+
+/*
+ * Remove the registration for automatic connection of the specified channel
+ * when a partition comes up.
+ *
+ * Before returning this xpc_disconnect() will wait for all connections on the
+ * specified channel have been closed/torndown. So the caller can be assured
+ * that they will not be receiving any more callouts from XPC to their
+ * function registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ * ch_number - channel # to unregister.
+ */
+void
+xpc_disconnect(int ch_number)
+{
+ struct xpc_registration *registration;
+
+ DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+
+ registration = &xpc_registrations[ch_number];
+
+ /*
+ * We've decided not to make this a down_interruptible(), since we
+ * figured XPC's users will just turn around and call xpc_disconnect()
+ * again anyways, so we might as well wait, if need be.
+ */
+ mutex_lock(&registration->mutex);
+
+ /* if !XPC_CHANNEL_REGISTERED(ch_number) */
+ if (registration->func == NULL) {
+ mutex_unlock(&registration->mutex);
+ return;
+ }
+
+ /* remove the connection registration for the specified channel */
+ registration->func = NULL;
+ registration->key = NULL;
+ registration->nentries = 0;
+ registration->entry_size = 0;
+ registration->assigned_limit = 0;
+ registration->idle_limit = 0;
+
+ if (xpc_interface.disconnect)
+ xpc_interface.disconnect(ch_number);
+
+ mutex_unlock(&registration->mutex);
+
+ return;
+}
+EXPORT_SYMBOL_GPL(xpc_disconnect);
+
+static int __init
+xp_init(void)
+{
+ enum xp_retval ret;
+ int ch_number;
+
+ /* initialize the connection registration mutex */
+ for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++)
+ mutex_init(&xpc_registrations[ch_number].mutex);
+
+ if (is_uv_system())
+ ret = xp_init_uv();
+ else
+ ret = 0;
+
+ if (ret != xpSuccess)
+ return ret;
+
+ return 0;
+}
+
+module_init(xp_init);
+
+static void __exit
+xp_exit(void)
+{
+ if (is_uv_system())
+ xp_exit_uv();
+}
+
+module_exit(xp_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition (XP) base");
+MODULE_LICENSE("GPL");
diff --git a/drivers/misc/sgi-xp/xp_uv.c b/drivers/misc/sgi-xp/xp_uv.c
new file mode 100644
index 000000000..19fc7076a
--- /dev/null
+++ b/drivers/misc/sgi-xp/xp_uv.c
@@ -0,0 +1,175 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition (XP) uv-based functions.
+ *
+ * Architecture specific implementation of common functions.
+ *
+ */
+
+#include <linux/device.h>
+#include <asm/uv/uv_hub.h>
+#if defined CONFIG_X86_64
+#include <asm/uv/bios.h>
+#elif defined CONFIG_IA64_SGI_UV
+#include <asm/sn/sn_sal.h>
+#endif
+#include "../sgi-gru/grukservices.h"
+#include "xp.h"
+
+/*
+ * Convert a virtual memory address to a physical memory address.
+ */
+static unsigned long
+xp_pa_uv(void *addr)
+{
+ return uv_gpa(addr);
+}
+
+/*
+ * Convert a global physical to socket physical address.
+ */
+static unsigned long
+xp_socket_pa_uv(unsigned long gpa)
+{
+ return uv_gpa_to_soc_phys_ram(gpa);
+}
+
+static enum xp_retval
+xp_remote_mmr_read(unsigned long dst_gpa, const unsigned long src_gpa,
+ size_t len)
+{
+ int ret;
+ unsigned long *dst_va = __va(uv_gpa_to_soc_phys_ram(dst_gpa));
+
+ BUG_ON(!uv_gpa_in_mmr_space(src_gpa));
+ BUG_ON(len != 8);
+
+ ret = gru_read_gpa(dst_va, src_gpa);
+ if (ret == 0)
+ return xpSuccess;
+
+ dev_err(xp, "gru_read_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx "
+ "len=%ld\n", dst_gpa, src_gpa, len);
+ return xpGruCopyError;
+}
+
+
+static enum xp_retval
+xp_remote_memcpy_uv(unsigned long dst_gpa, const unsigned long src_gpa,
+ size_t len)
+{
+ int ret;
+
+ if (uv_gpa_in_mmr_space(src_gpa))
+ return xp_remote_mmr_read(dst_gpa, src_gpa, len);
+
+ ret = gru_copy_gpa(dst_gpa, src_gpa, len);
+ if (ret == 0)
+ return xpSuccess;
+
+ dev_err(xp, "gru_copy_gpa() failed, dst_gpa=0x%016lx src_gpa=0x%016lx "
+ "len=%ld\n", dst_gpa, src_gpa, len);
+ return xpGruCopyError;
+}
+
+static int
+xp_cpu_to_nasid_uv(int cpuid)
+{
+ /* ??? Is this same as sn2 nasid in mach/part bitmaps set up by SAL? */
+ return UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpuid));
+}
+
+static enum xp_retval
+xp_expand_memprotect_uv(unsigned long phys_addr, unsigned long size)
+{
+ int ret;
+
+#if defined CONFIG_X86_64
+ ret = uv_bios_change_memprotect(phys_addr, size, UV_MEMPROT_ALLOW_RW);
+ if (ret != BIOS_STATUS_SUCCESS) {
+ dev_err(xp, "uv_bios_change_memprotect(,, "
+ "UV_MEMPROT_ALLOW_RW) failed, ret=%d\n", ret);
+ return xpBiosError;
+ }
+
+#elif defined CONFIG_IA64_SGI_UV
+ u64 nasid_array;
+
+ ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1,
+ &nasid_array);
+ if (ret != 0) {
+ dev_err(xp, "sn_change_memprotect(,, "
+ "SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret);
+ return xpSalError;
+ }
+#else
+ #error not a supported configuration
+#endif
+ return xpSuccess;
+}
+
+static enum xp_retval
+xp_restrict_memprotect_uv(unsigned long phys_addr, unsigned long size)
+{
+ int ret;
+
+#if defined CONFIG_X86_64
+ ret = uv_bios_change_memprotect(phys_addr, size,
+ UV_MEMPROT_RESTRICT_ACCESS);
+ if (ret != BIOS_STATUS_SUCCESS) {
+ dev_err(xp, "uv_bios_change_memprotect(,, "
+ "UV_MEMPROT_RESTRICT_ACCESS) failed, ret=%d\n", ret);
+ return xpBiosError;
+ }
+
+#elif defined CONFIG_IA64_SGI_UV
+ u64 nasid_array;
+
+ ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0,
+ &nasid_array);
+ if (ret != 0) {
+ dev_err(xp, "sn_change_memprotect(,, "
+ "SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret);
+ return xpSalError;
+ }
+#else
+ #error not a supported configuration
+#endif
+ return xpSuccess;
+}
+
+enum xp_retval
+xp_init_uv(void)
+{
+ WARN_ON(!is_uv_system());
+ if (!is_uv_system())
+ return xpUnsupported;
+
+ xp_max_npartitions = XP_MAX_NPARTITIONS_UV;
+#ifdef CONFIG_X86
+ xp_partition_id = sn_partition_id;
+ xp_region_size = sn_region_size;
+#endif
+ xp_pa = xp_pa_uv;
+ xp_socket_pa = xp_socket_pa_uv;
+ xp_remote_memcpy = xp_remote_memcpy_uv;
+ xp_cpu_to_nasid = xp_cpu_to_nasid_uv;
+ xp_expand_memprotect = xp_expand_memprotect_uv;
+ xp_restrict_memprotect = xp_restrict_memprotect_uv;
+
+ return xpSuccess;
+}
+
+void
+xp_exit_uv(void)
+{
+ WARN_ON(!is_uv_system());
+}
diff --git a/drivers/misc/sgi-xp/xpc.h b/drivers/misc/sgi-xp/xpc.h
new file mode 100644
index 000000000..225f2bb84
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc.h
@@ -0,0 +1,732 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) structures and macros.
+ */
+
+#ifndef _DRIVERS_MISC_SGIXP_XPC_H
+#define _DRIVERS_MISC_SGIXP_XPC_H
+
+#include <linux/wait.h>
+#include <linux/completion.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include "xp.h"
+
+/*
+ * XPC Version numbers consist of a major and minor number. XPC can always
+ * talk to versions with same major #, and never talk to versions with a
+ * different major #.
+ */
+#define _XPC_VERSION(_maj, _min) (((_maj) << 4) | ((_min) & 0xf))
+#define XPC_VERSION_MAJOR(_v) ((_v) >> 4)
+#define XPC_VERSION_MINOR(_v) ((_v) & 0xf)
+
+/* define frequency of the heartbeat and frequency how often it's checked */
+#define XPC_HB_DEFAULT_INTERVAL 5 /* incr HB every x secs */
+#define XPC_HB_CHECK_DEFAULT_INTERVAL 20 /* check HB every x secs */
+
+/* define the process name of HB checker and the CPU it is pinned to */
+#define XPC_HB_CHECK_THREAD_NAME "xpc_hb"
+#define XPC_HB_CHECK_CPU 0
+
+/* define the process name of the discovery thread */
+#define XPC_DISCOVERY_THREAD_NAME "xpc_discovery"
+
+/*
+ * the reserved page
+ *
+ * SAL reserves one page of memory per partition for XPC. Though a full page
+ * in length (16384 bytes), its starting address is not page aligned, but it
+ * is cacheline aligned. The reserved page consists of the following:
+ *
+ * reserved page header
+ *
+ * The first two 64-byte cachelines of the reserved page contain the
+ * header (struct xpc_rsvd_page). Before SAL initialization has completed,
+ * SAL has set up the following fields of the reserved page header:
+ * SAL_signature, SAL_version, SAL_partid, and SAL_nasids_size. The
+ * other fields are set up by XPC. (xpc_rsvd_page points to the local
+ * partition's reserved page.)
+ *
+ * part_nasids mask
+ * mach_nasids mask
+ *
+ * SAL also sets up two bitmaps (or masks), one that reflects the actual
+ * nasids in this partition (part_nasids), and the other that reflects
+ * the actual nasids in the entire machine (mach_nasids). We're only
+ * interested in the even numbered nasids (which contain the processors
+ * and/or memory), so we only need half as many bits to represent the
+ * nasids. When mapping nasid to bit in a mask (or bit to nasid) be sure
+ * to either divide or multiply by 2. The part_nasids mask is located
+ * starting at the first cacheline following the reserved page header. The
+ * mach_nasids mask follows right after the part_nasids mask. The size in
+ * bytes of each mask is reflected by the reserved page header field
+ * 'SAL_nasids_size'. (Local partition's mask pointers are xpc_part_nasids
+ * and xpc_mach_nasids.)
+ *
+ * Immediately following the mach_nasids mask are the XPC variables
+ * required by other partitions. First are those that are generic to all
+ * partitions (vars), followed on the next available cacheline by those
+ * which are partition specific (vars part). These are setup by XPC.
+ *
+ * Note: Until 'ts_jiffies' is set non-zero, the partition XPC code has not been
+ * initialized.
+ */
+struct xpc_rsvd_page {
+ u64 SAL_signature; /* SAL: unique signature */
+ u64 SAL_version; /* SAL: version */
+ short SAL_partid; /* SAL: partition ID */
+ short max_npartitions; /* value of XPC_MAX_PARTITIONS */
+ u8 version;
+ u8 pad1[3]; /* align to next u64 in 1st 64-byte cacheline */
+ unsigned long ts_jiffies; /* timestamp when rsvd pg was setup by XPC */
+ union {
+ struct {
+ unsigned long heartbeat_gpa; /* phys addr */
+ unsigned long activate_gru_mq_desc_gpa; /* phys addr */
+ } uv;
+ } sn;
+ u64 pad2[9]; /* align to last u64 in 2nd 64-byte cacheline */
+ u64 SAL_nasids_size; /* SAL: size of each nasid mask in bytes */
+};
+
+#define XPC_RP_VERSION _XPC_VERSION(3, 0) /* version 3.0 of the reserved page */
+
+/* the reserved page sizes and offsets */
+
+#define XPC_RP_HEADER_SIZE L1_CACHE_ALIGN(sizeof(struct xpc_rsvd_page))
+
+#define XPC_RP_PART_NASIDS(_rp) ((unsigned long *)((u8 *)(_rp) + \
+ XPC_RP_HEADER_SIZE))
+#define XPC_RP_MACH_NASIDS(_rp) (XPC_RP_PART_NASIDS(_rp) + \
+ xpc_nasid_mask_nlongs)
+
+
+/*
+ * The following structure describes the partition's heartbeat info which
+ * will be periodically read by other partitions to determine whether this
+ * XPC is still 'alive'.
+ */
+struct xpc_heartbeat_uv {
+ unsigned long value;
+ unsigned long offline; /* if 0, heartbeat should be changing */
+};
+
+/*
+ * Info pertinent to a GRU message queue using a watch list for irq generation.
+ */
+struct xpc_gru_mq_uv {
+ void *address; /* address of GRU message queue */
+ unsigned int order; /* size of GRU message queue as a power of 2 */
+ int irq; /* irq raised when message is received in mq */
+ int mmr_blade; /* blade where watchlist was allocated from */
+ unsigned long mmr_offset; /* offset of irq mmr located on mmr_blade */
+ unsigned long mmr_value; /* value of irq mmr located on mmr_blade */
+ int watchlist_num; /* number of watchlist allocatd by BIOS */
+ void *gru_mq_desc; /* opaque structure used by the GRU driver */
+};
+
+/*
+ * The activate_mq is used to send/receive GRU messages that affect XPC's
+ * partition active state and channel state. This is uv only.
+ */
+struct xpc_activate_mq_msghdr_uv {
+ unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */
+ short partid; /* sender's partid */
+ u8 act_state; /* sender's act_state at time msg sent */
+ u8 type; /* message's type */
+ unsigned long rp_ts_jiffies; /* timestamp of sender's rp setup by XPC */
+};
+
+/* activate_mq defined message types */
+#define XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV 0
+
+#define XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV 1
+#define XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV 2
+
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV 3
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV 4
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV 5
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV 6
+#define XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV 7
+
+#define XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV 8
+#define XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV 9
+
+struct xpc_activate_mq_msg_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+};
+
+struct xpc_activate_mq_msg_activate_req_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ unsigned long rp_gpa;
+ unsigned long heartbeat_gpa;
+ unsigned long activate_gru_mq_desc_gpa;
+};
+
+struct xpc_activate_mq_msg_deactivate_req_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ enum xp_retval reason;
+};
+
+struct xpc_activate_mq_msg_chctl_closerequest_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ short ch_number;
+ enum xp_retval reason;
+};
+
+struct xpc_activate_mq_msg_chctl_closereply_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ short ch_number;
+};
+
+struct xpc_activate_mq_msg_chctl_openrequest_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ short ch_number;
+ short entry_size; /* size of notify_mq's GRU messages */
+ short local_nentries; /* ??? Is this needed? What is? */
+};
+
+struct xpc_activate_mq_msg_chctl_openreply_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ short ch_number;
+ short remote_nentries; /* ??? Is this needed? What is? */
+ short local_nentries; /* ??? Is this needed? What is? */
+ unsigned long notify_gru_mq_desc_gpa;
+};
+
+struct xpc_activate_mq_msg_chctl_opencomplete_uv {
+ struct xpc_activate_mq_msghdr_uv hdr;
+ short ch_number;
+};
+
+/*
+ * Functions registered by add_timer() or called by kernel_thread() only
+ * allow for a single 64-bit argument. The following macros can be used to
+ * pack and unpack two (32-bit, 16-bit or 8-bit) arguments into or out from
+ * the passed argument.
+ */
+#define XPC_PACK_ARGS(_arg1, _arg2) \
+ ((((u64)_arg1) & 0xffffffff) | \
+ ((((u64)_arg2) & 0xffffffff) << 32))
+
+#define XPC_UNPACK_ARG1(_args) (((u64)_args) & 0xffffffff)
+#define XPC_UNPACK_ARG2(_args) ((((u64)_args) >> 32) & 0xffffffff)
+
+/*
+ * Define a structure that contains arguments associated with opening and
+ * closing a channel.
+ */
+struct xpc_openclose_args {
+ u16 reason; /* reason why channel is closing */
+ u16 entry_size; /* sizeof each message entry */
+ u16 remote_nentries; /* #of message entries in remote msg queue */
+ u16 local_nentries; /* #of message entries in local msg queue */
+ unsigned long local_msgqueue_pa; /* phys addr of local message queue */
+};
+
+#define XPC_OPENCLOSE_ARGS_SIZE \
+ L1_CACHE_ALIGN(sizeof(struct xpc_openclose_args) * \
+ XPC_MAX_NCHANNELS)
+
+
+/*
+ * Structures to define a fifo singly-linked list.
+ */
+
+struct xpc_fifo_entry_uv {
+ struct xpc_fifo_entry_uv *next;
+};
+
+struct xpc_fifo_head_uv {
+ struct xpc_fifo_entry_uv *first;
+ struct xpc_fifo_entry_uv *last;
+ spinlock_t lock;
+ int n_entries;
+};
+
+/*
+ * The format of a uv XPC notify_mq GRU message is as follows:
+ *
+ * A user-defined message resides in the payload area. The max size of the
+ * payload is defined by the user via xpc_connect().
+ *
+ * The size of a message (payload and header) sent via the GRU must be either 1
+ * or 2 GRU_CACHE_LINE_BYTES in length.
+ */
+
+struct xpc_notify_mq_msghdr_uv {
+ union {
+ unsigned int gru_msg_hdr; /* FOR GRU INTERNAL USE ONLY */
+ struct xpc_fifo_entry_uv next; /* FOR XPC INTERNAL USE ONLY */
+ } u;
+ short partid; /* FOR XPC INTERNAL USE ONLY */
+ u8 ch_number; /* FOR XPC INTERNAL USE ONLY */
+ u8 size; /* FOR XPC INTERNAL USE ONLY */
+ unsigned int msg_slot_number; /* FOR XPC INTERNAL USE ONLY */
+};
+
+struct xpc_notify_mq_msg_uv {
+ struct xpc_notify_mq_msghdr_uv hdr;
+ unsigned long payload;
+};
+
+/* struct xpc_notify_sn2 type of notification */
+
+#define XPC_N_CALL 0x01 /* notify function provided by user */
+
+/*
+ * Define uv's version of the notify entry. It additionally is used to allocate
+ * a msg slot on the remote partition into which is copied a sent message.
+ */
+struct xpc_send_msg_slot_uv {
+ struct xpc_fifo_entry_uv next;
+ unsigned int msg_slot_number;
+ xpc_notify_func func; /* user's notify function */
+ void *key; /* pointer to user's key */
+};
+
+/*
+ * Define the structure that manages all the stuff required by a channel. In
+ * particular, they are used to manage the messages sent across the channel.
+ *
+ * This structure is private to a partition, and is NOT shared across the
+ * partition boundary.
+ *
+ * There is an array of these structures for each remote partition. It is
+ * allocated at the time a partition becomes active. The array contains one
+ * of these structures for each potential channel connection to that partition.
+ */
+
+struct xpc_channel_uv {
+ void *cached_notify_gru_mq_desc; /* remote partition's notify mq's */
+ /* gru mq descriptor */
+
+ struct xpc_send_msg_slot_uv *send_msg_slots;
+ void *recv_msg_slots; /* each slot will hold a xpc_notify_mq_msg_uv */
+ /* structure plus the user's payload */
+
+ struct xpc_fifo_head_uv msg_slot_free_list;
+ struct xpc_fifo_head_uv recv_msg_list; /* deliverable payloads */
+};
+
+struct xpc_channel {
+ short partid; /* ID of remote partition connected */
+ spinlock_t lock; /* lock for updating this structure */
+ unsigned int flags; /* general flags */
+
+ enum xp_retval reason; /* reason why channel is disconnect'g */
+ int reason_line; /* line# disconnect initiated from */
+
+ u16 number; /* channel # */
+
+ u16 entry_size; /* sizeof each msg entry */
+ u16 local_nentries; /* #of msg entries in local msg queue */
+ u16 remote_nentries; /* #of msg entries in remote msg queue */
+
+ atomic_t references; /* #of external references to queues */
+
+ atomic_t n_on_msg_allocate_wq; /* #on msg allocation wait queue */
+ wait_queue_head_t msg_allocate_wq; /* msg allocation wait queue */
+
+ u8 delayed_chctl_flags; /* chctl flags received, but delayed */
+ /* action until channel disconnected */
+
+ atomic_t n_to_notify; /* #of msg senders to notify */
+
+ xpc_channel_func func; /* user's channel function */
+ void *key; /* pointer to user's key */
+
+ struct completion wdisconnect_wait; /* wait for channel disconnect */
+
+ /* kthread management related fields */
+
+ atomic_t kthreads_assigned; /* #of kthreads assigned to channel */
+ u32 kthreads_assigned_limit; /* limit on #of kthreads assigned */
+ atomic_t kthreads_idle; /* #of kthreads idle waiting for work */
+ u32 kthreads_idle_limit; /* limit on #of kthreads idle */
+ atomic_t kthreads_active; /* #of kthreads actively working */
+
+ wait_queue_head_t idle_wq; /* idle kthread wait queue */
+
+ union {
+ struct xpc_channel_uv uv;
+ } sn;
+
+} ____cacheline_aligned;
+
+/* struct xpc_channel flags */
+
+#define XPC_C_WASCONNECTED 0x00000001 /* channel was connected */
+
+#define XPC_C_ROPENCOMPLETE 0x00000002 /* remote open channel complete */
+#define XPC_C_OPENCOMPLETE 0x00000004 /* local open channel complete */
+#define XPC_C_ROPENREPLY 0x00000008 /* remote open channel reply */
+#define XPC_C_OPENREPLY 0x00000010 /* local open channel reply */
+#define XPC_C_ROPENREQUEST 0x00000020 /* remote open channel request */
+#define XPC_C_OPENREQUEST 0x00000040 /* local open channel request */
+
+#define XPC_C_SETUP 0x00000080 /* channel's msgqueues are alloc'd */
+#define XPC_C_CONNECTEDCALLOUT 0x00000100 /* connected callout initiated */
+#define XPC_C_CONNECTEDCALLOUT_MADE \
+ 0x00000200 /* connected callout completed */
+#define XPC_C_CONNECTED 0x00000400 /* local channel is connected */
+#define XPC_C_CONNECTING 0x00000800 /* channel is being connected */
+
+#define XPC_C_RCLOSEREPLY 0x00001000 /* remote close channel reply */
+#define XPC_C_CLOSEREPLY 0x00002000 /* local close channel reply */
+#define XPC_C_RCLOSEREQUEST 0x00004000 /* remote close channel request */
+#define XPC_C_CLOSEREQUEST 0x00008000 /* local close channel request */
+
+#define XPC_C_DISCONNECTED 0x00010000 /* channel is disconnected */
+#define XPC_C_DISCONNECTING 0x00020000 /* channel is being disconnected */
+#define XPC_C_DISCONNECTINGCALLOUT \
+ 0x00040000 /* disconnecting callout initiated */
+#define XPC_C_DISCONNECTINGCALLOUT_MADE \
+ 0x00080000 /* disconnecting callout completed */
+#define XPC_C_WDISCONNECT 0x00100000 /* waiting for channel disconnect */
+
+/*
+ * The channel control flags (chctl) union consists of a 64-bit variable which
+ * is divided up into eight bytes, ordered from right to left. Byte zero
+ * pertains to channel 0, byte one to channel 1, and so on. Each channel's byte
+ * can have one or more of the chctl flags set in it.
+ */
+
+union xpc_channel_ctl_flags {
+ u64 all_flags;
+ u8 flags[XPC_MAX_NCHANNELS];
+};
+
+/* chctl flags */
+#define XPC_CHCTL_CLOSEREQUEST 0x01
+#define XPC_CHCTL_CLOSEREPLY 0x02
+#define XPC_CHCTL_OPENREQUEST 0x04
+#define XPC_CHCTL_OPENREPLY 0x08
+#define XPC_CHCTL_OPENCOMPLETE 0x10
+#define XPC_CHCTL_MSGREQUEST 0x20
+
+#define XPC_OPENCLOSE_CHCTL_FLAGS \
+ (XPC_CHCTL_CLOSEREQUEST | XPC_CHCTL_CLOSEREPLY | \
+ XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY | \
+ XPC_CHCTL_OPENCOMPLETE)
+#define XPC_MSG_CHCTL_FLAGS XPC_CHCTL_MSGREQUEST
+
+static inline int
+xpc_any_openclose_chctl_flags_set(union xpc_channel_ctl_flags *chctl)
+{
+ int ch_number;
+
+ for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) {
+ if (chctl->flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS)
+ return 1;
+ }
+ return 0;
+}
+
+static inline int
+xpc_any_msg_chctl_flags_set(union xpc_channel_ctl_flags *chctl)
+{
+ int ch_number;
+
+ for (ch_number = 0; ch_number < XPC_MAX_NCHANNELS; ch_number++) {
+ if (chctl->flags[ch_number] & XPC_MSG_CHCTL_FLAGS)
+ return 1;
+ }
+ return 0;
+}
+
+struct xpc_partition_uv {
+ unsigned long heartbeat_gpa; /* phys addr of partition's heartbeat */
+ struct xpc_heartbeat_uv cached_heartbeat; /* cached copy of */
+ /* partition's heartbeat */
+ unsigned long activate_gru_mq_desc_gpa; /* phys addr of parititon's */
+ /* activate mq's gru mq */
+ /* descriptor */
+ void *cached_activate_gru_mq_desc; /* cached copy of partition's */
+ /* activate mq's gru mq descriptor */
+ struct mutex cached_activate_gru_mq_desc_mutex;
+ spinlock_t flags_lock; /* protect updating of flags */
+ unsigned int flags; /* general flags */
+ u8 remote_act_state; /* remote partition's act_state */
+ u8 act_state_req; /* act_state request from remote partition */
+ enum xp_retval reason; /* reason for deactivate act_state request */
+};
+
+/* struct xpc_partition_uv flags */
+
+#define XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV 0x00000001
+#define XPC_P_ENGAGED_UV 0x00000002
+
+/* struct xpc_partition_uv act_state change requests */
+
+#define XPC_P_ASR_ACTIVATE_UV 0x01
+#define XPC_P_ASR_REACTIVATE_UV 0x02
+#define XPC_P_ASR_DEACTIVATE_UV 0x03
+
+struct xpc_partition {
+
+ /* XPC HB infrastructure */
+
+ u8 remote_rp_version; /* version# of partition's rsvd pg */
+ unsigned long remote_rp_ts_jiffies; /* timestamp when rsvd pg setup */
+ unsigned long remote_rp_pa; /* phys addr of partition's rsvd pg */
+ u64 last_heartbeat; /* HB at last read */
+ u32 activate_IRQ_rcvd; /* IRQs since activation */
+ spinlock_t act_lock; /* protect updating of act_state */
+ u8 act_state; /* from XPC HB viewpoint */
+ enum xp_retval reason; /* reason partition is deactivating */
+ int reason_line; /* line# deactivation initiated from */
+
+ unsigned long disengage_timeout; /* timeout in jiffies */
+ struct timer_list disengage_timer;
+
+ /* XPC infrastructure referencing and teardown control */
+
+ u8 setup_state; /* infrastructure setup state */
+ wait_queue_head_t teardown_wq; /* kthread waiting to teardown infra */
+ atomic_t references; /* #of references to infrastructure */
+
+ u8 nchannels; /* #of defined channels supported */
+ atomic_t nchannels_active; /* #of channels that are not DISCONNECTED */
+ atomic_t nchannels_engaged; /* #of channels engaged with remote part */
+ struct xpc_channel *channels; /* array of channel structures */
+
+ /* fields used for managing channel avialability and activity */
+
+ union xpc_channel_ctl_flags chctl; /* chctl flags yet to be processed */
+ spinlock_t chctl_lock; /* chctl flags lock */
+
+ void *remote_openclose_args_base; /* base address of kmalloc'd space */
+ struct xpc_openclose_args *remote_openclose_args; /* copy of remote's */
+ /* args */
+
+ /* channel manager related fields */
+
+ atomic_t channel_mgr_requests; /* #of requests to activate chan mgr */
+ wait_queue_head_t channel_mgr_wq; /* channel mgr's wait queue */
+
+ union {
+ struct xpc_partition_uv uv;
+ } sn;
+
+} ____cacheline_aligned;
+
+struct xpc_arch_operations {
+ int (*setup_partitions) (void);
+ void (*teardown_partitions) (void);
+ void (*process_activate_IRQ_rcvd) (void);
+ enum xp_retval (*get_partition_rsvd_page_pa)
+ (void *, u64 *, unsigned long *, size_t *);
+ int (*setup_rsvd_page) (struct xpc_rsvd_page *);
+
+ void (*allow_hb) (short);
+ void (*disallow_hb) (short);
+ void (*disallow_all_hbs) (void);
+ void (*increment_heartbeat) (void);
+ void (*offline_heartbeat) (void);
+ void (*online_heartbeat) (void);
+ void (*heartbeat_init) (void);
+ void (*heartbeat_exit) (void);
+ enum xp_retval (*get_remote_heartbeat) (struct xpc_partition *);
+
+ void (*request_partition_activation) (struct xpc_rsvd_page *,
+ unsigned long, int);
+ void (*request_partition_reactivation) (struct xpc_partition *);
+ void (*request_partition_deactivation) (struct xpc_partition *);
+ void (*cancel_partition_deactivation_request) (struct xpc_partition *);
+ enum xp_retval (*setup_ch_structures) (struct xpc_partition *);
+ void (*teardown_ch_structures) (struct xpc_partition *);
+
+ enum xp_retval (*make_first_contact) (struct xpc_partition *);
+
+ u64 (*get_chctl_all_flags) (struct xpc_partition *);
+ void (*send_chctl_closerequest) (struct xpc_channel *, unsigned long *);
+ void (*send_chctl_closereply) (struct xpc_channel *, unsigned long *);
+ void (*send_chctl_openrequest) (struct xpc_channel *, unsigned long *);
+ void (*send_chctl_openreply) (struct xpc_channel *, unsigned long *);
+ void (*send_chctl_opencomplete) (struct xpc_channel *, unsigned long *);
+ void (*process_msg_chctl_flags) (struct xpc_partition *, int);
+
+ enum xp_retval (*save_remote_msgqueue_pa) (struct xpc_channel *,
+ unsigned long);
+
+ enum xp_retval (*setup_msg_structures) (struct xpc_channel *);
+ void (*teardown_msg_structures) (struct xpc_channel *);
+
+ void (*indicate_partition_engaged) (struct xpc_partition *);
+ void (*indicate_partition_disengaged) (struct xpc_partition *);
+ void (*assume_partition_disengaged) (short);
+ int (*partition_engaged) (short);
+ int (*any_partition_engaged) (void);
+
+ int (*n_of_deliverable_payloads) (struct xpc_channel *);
+ enum xp_retval (*send_payload) (struct xpc_channel *, u32, void *,
+ u16, u8, xpc_notify_func, void *);
+ void *(*get_deliverable_payload) (struct xpc_channel *);
+ void (*received_payload) (struct xpc_channel *, void *);
+ void (*notify_senders_of_disconnect) (struct xpc_channel *);
+};
+
+/* struct xpc_partition act_state values (for XPC HB) */
+
+#define XPC_P_AS_INACTIVE 0x00 /* partition is not active */
+#define XPC_P_AS_ACTIVATION_REQ 0x01 /* created thread to activate */
+#define XPC_P_AS_ACTIVATING 0x02 /* activation thread started */
+#define XPC_P_AS_ACTIVE 0x03 /* xpc_partition_up() was called */
+#define XPC_P_AS_DEACTIVATING 0x04 /* partition deactivation initiated */
+
+#define XPC_DEACTIVATE_PARTITION(_p, _reason) \
+ xpc_deactivate_partition(__LINE__, (_p), (_reason))
+
+/* struct xpc_partition setup_state values */
+
+#define XPC_P_SS_UNSET 0x00 /* infrastructure was never setup */
+#define XPC_P_SS_SETUP 0x01 /* infrastructure is setup */
+#define XPC_P_SS_WTEARDOWN 0x02 /* waiting to teardown infrastructure */
+#define XPC_P_SS_TORNDOWN 0x03 /* infrastructure is torndown */
+
+/* number of seconds to wait for other partitions to disengage */
+#define XPC_DISENGAGE_DEFAULT_TIMELIMIT 90
+
+/* interval in seconds to print 'waiting deactivation' messages */
+#define XPC_DEACTIVATE_PRINTMSG_INTERVAL 10
+
+#define XPC_PARTID(_p) ((short)((_p) - &xpc_partitions[0]))
+
+/* found in xp_main.c */
+extern struct xpc_registration xpc_registrations[];
+
+/* found in xpc_main.c */
+extern struct device *xpc_part;
+extern struct device *xpc_chan;
+extern struct xpc_arch_operations xpc_arch_ops;
+extern int xpc_disengage_timelimit;
+extern int xpc_disengage_timedout;
+extern int xpc_activate_IRQ_rcvd;
+extern spinlock_t xpc_activate_IRQ_rcvd_lock;
+extern wait_queue_head_t xpc_activate_IRQ_wq;
+extern void *xpc_kzalloc_cacheline_aligned(size_t, gfp_t, void **);
+extern void xpc_activate_partition(struct xpc_partition *);
+extern void xpc_activate_kthreads(struct xpc_channel *, int);
+extern void xpc_create_kthreads(struct xpc_channel *, int, int);
+extern void xpc_disconnect_wait(int);
+
+/* found in xpc_uv.c */
+extern int xpc_init_uv(void);
+extern void xpc_exit_uv(void);
+
+/* found in xpc_partition.c */
+extern int xpc_exiting;
+extern int xpc_nasid_mask_nlongs;
+extern struct xpc_rsvd_page *xpc_rsvd_page;
+extern unsigned long *xpc_mach_nasids;
+extern struct xpc_partition *xpc_partitions;
+extern void *xpc_kmalloc_cacheline_aligned(size_t, gfp_t, void **);
+extern int xpc_setup_rsvd_page(void);
+extern void xpc_teardown_rsvd_page(void);
+extern int xpc_identify_activate_IRQ_sender(void);
+extern int xpc_partition_disengaged(struct xpc_partition *);
+extern int xpc_partition_disengaged_from_timer(struct xpc_partition *part);
+extern enum xp_retval xpc_mark_partition_active(struct xpc_partition *);
+extern void xpc_mark_partition_inactive(struct xpc_partition *);
+extern void xpc_discovery(void);
+extern enum xp_retval xpc_get_remote_rp(int, unsigned long *,
+ struct xpc_rsvd_page *,
+ unsigned long *);
+extern void xpc_deactivate_partition(const int, struct xpc_partition *,
+ enum xp_retval);
+extern enum xp_retval xpc_initiate_partid_to_nasids(short, void *);
+
+/* found in xpc_channel.c */
+extern void xpc_initiate_connect(int);
+extern void xpc_initiate_disconnect(int);
+extern enum xp_retval xpc_allocate_msg_wait(struct xpc_channel *);
+extern enum xp_retval xpc_initiate_send(short, int, u32, void *, u16);
+extern enum xp_retval xpc_initiate_send_notify(short, int, u32, void *, u16,
+ xpc_notify_func, void *);
+extern void xpc_initiate_received(short, int, void *);
+extern void xpc_process_sent_chctl_flags(struct xpc_partition *);
+extern void xpc_connected_callout(struct xpc_channel *);
+extern void xpc_deliver_payload(struct xpc_channel *);
+extern void xpc_disconnect_channel(const int, struct xpc_channel *,
+ enum xp_retval, unsigned long *);
+extern void xpc_disconnect_callout(struct xpc_channel *, enum xp_retval);
+extern void xpc_partition_going_down(struct xpc_partition *, enum xp_retval);
+
+static inline void
+xpc_wakeup_channel_mgr(struct xpc_partition *part)
+{
+ if (atomic_inc_return(&part->channel_mgr_requests) == 1)
+ wake_up(&part->channel_mgr_wq);
+}
+
+/*
+ * These next two inlines are used to keep us from tearing down a channel's
+ * msg queues while a thread may be referencing them.
+ */
+static inline void
+xpc_msgqueue_ref(struct xpc_channel *ch)
+{
+ atomic_inc(&ch->references);
+}
+
+static inline void
+xpc_msgqueue_deref(struct xpc_channel *ch)
+{
+ s32 refs = atomic_dec_return(&ch->references);
+
+ DBUG_ON(refs < 0);
+ if (refs == 0)
+ xpc_wakeup_channel_mgr(&xpc_partitions[ch->partid]);
+}
+
+#define XPC_DISCONNECT_CHANNEL(_ch, _reason, _irqflgs) \
+ xpc_disconnect_channel(__LINE__, _ch, _reason, _irqflgs)
+
+/*
+ * These two inlines are used to keep us from tearing down a partition's
+ * setup infrastructure while a thread may be referencing it.
+ */
+static inline void
+xpc_part_deref(struct xpc_partition *part)
+{
+ s32 refs = atomic_dec_return(&part->references);
+
+ DBUG_ON(refs < 0);
+ if (refs == 0 && part->setup_state == XPC_P_SS_WTEARDOWN)
+ wake_up(&part->teardown_wq);
+}
+
+static inline int
+xpc_part_ref(struct xpc_partition *part)
+{
+ int setup;
+
+ atomic_inc(&part->references);
+ setup = (part->setup_state == XPC_P_SS_SETUP);
+ if (!setup)
+ xpc_part_deref(part);
+
+ return setup;
+}
+
+/*
+ * The following macro is to be used for the setting of the reason and
+ * reason_line fields in both the struct xpc_channel and struct xpc_partition
+ * structures.
+ */
+#define XPC_SET_REASON(_p, _reason, _line) \
+ { \
+ (_p)->reason = _reason; \
+ (_p)->reason_line = _line; \
+ }
+
+#endif /* _DRIVERS_MISC_SGIXP_XPC_H */
diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c
new file mode 100644
index 000000000..8e6607fc8
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_channel.c
@@ -0,0 +1,1011 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) channel support.
+ *
+ * This is the part of XPC that manages the channels and
+ * sends/receives messages across them to/from other partitions.
+ *
+ */
+
+#include <linux/device.h>
+#include "xpc.h"
+
+/*
+ * Process a connect message from a remote partition.
+ *
+ * Note: xpc_process_connect() is expecting to be called with the
+ * spin_lock_irqsave held and will leave it locked upon return.
+ */
+static void
+xpc_process_connect(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ enum xp_retval ret;
+
+ lockdep_assert_held(&ch->lock);
+
+ if (!(ch->flags & XPC_C_OPENREQUEST) ||
+ !(ch->flags & XPC_C_ROPENREQUEST)) {
+ /* nothing more to do for now */
+ return;
+ }
+ DBUG_ON(!(ch->flags & XPC_C_CONNECTING));
+
+ if (!(ch->flags & XPC_C_SETUP)) {
+ spin_unlock_irqrestore(&ch->lock, *irq_flags);
+ ret = xpc_arch_ops.setup_msg_structures(ch);
+ spin_lock_irqsave(&ch->lock, *irq_flags);
+
+ if (ret != xpSuccess)
+ XPC_DISCONNECT_CHANNEL(ch, ret, irq_flags);
+ else
+ ch->flags |= XPC_C_SETUP;
+
+ if (ch->flags & XPC_C_DISCONNECTING)
+ return;
+ }
+
+ if (!(ch->flags & XPC_C_OPENREPLY)) {
+ ch->flags |= XPC_C_OPENREPLY;
+ xpc_arch_ops.send_chctl_openreply(ch, irq_flags);
+ }
+
+ if (!(ch->flags & XPC_C_ROPENREPLY))
+ return;
+
+ if (!(ch->flags & XPC_C_OPENCOMPLETE)) {
+ ch->flags |= (XPC_C_OPENCOMPLETE | XPC_C_CONNECTED);
+ xpc_arch_ops.send_chctl_opencomplete(ch, irq_flags);
+ }
+
+ if (!(ch->flags & XPC_C_ROPENCOMPLETE))
+ return;
+
+ dev_info(xpc_chan, "channel %d to partition %d connected\n",
+ ch->number, ch->partid);
+
+ ch->flags = (XPC_C_CONNECTED | XPC_C_SETUP); /* clear all else */
+}
+
+/*
+ * spin_lock_irqsave() is expected to be held on entry.
+ */
+static void
+xpc_process_disconnect(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ struct xpc_partition *part = &xpc_partitions[ch->partid];
+ u32 channel_was_connected = (ch->flags & XPC_C_WASCONNECTED);
+
+ lockdep_assert_held(&ch->lock);
+
+ if (!(ch->flags & XPC_C_DISCONNECTING))
+ return;
+
+ DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+
+ /* make sure all activity has settled down first */
+
+ if (atomic_read(&ch->kthreads_assigned) > 0 ||
+ atomic_read(&ch->references) > 0) {
+ return;
+ }
+ DBUG_ON((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+ !(ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE));
+
+ if (part->act_state == XPC_P_AS_DEACTIVATING) {
+ /* can't proceed until the other side disengages from us */
+ if (xpc_arch_ops.partition_engaged(ch->partid))
+ return;
+
+ } else {
+
+ /* as long as the other side is up do the full protocol */
+
+ if (!(ch->flags & XPC_C_RCLOSEREQUEST))
+ return;
+
+ if (!(ch->flags & XPC_C_CLOSEREPLY)) {
+ ch->flags |= XPC_C_CLOSEREPLY;
+ xpc_arch_ops.send_chctl_closereply(ch, irq_flags);
+ }
+
+ if (!(ch->flags & XPC_C_RCLOSEREPLY))
+ return;
+ }
+
+ /* wake those waiting for notify completion */
+ if (atomic_read(&ch->n_to_notify) > 0) {
+ /* we do callout while holding ch->lock, callout can't block */
+ xpc_arch_ops.notify_senders_of_disconnect(ch);
+ }
+
+ /* both sides are disconnected now */
+
+ if (ch->flags & XPC_C_DISCONNECTINGCALLOUT_MADE) {
+ spin_unlock_irqrestore(&ch->lock, *irq_flags);
+ xpc_disconnect_callout(ch, xpDisconnected);
+ spin_lock_irqsave(&ch->lock, *irq_flags);
+ }
+
+ DBUG_ON(atomic_read(&ch->n_to_notify) != 0);
+
+ /* it's now safe to free the channel's message queues */
+ xpc_arch_ops.teardown_msg_structures(ch);
+
+ ch->func = NULL;
+ ch->key = NULL;
+ ch->entry_size = 0;
+ ch->local_nentries = 0;
+ ch->remote_nentries = 0;
+ ch->kthreads_assigned_limit = 0;
+ ch->kthreads_idle_limit = 0;
+
+ /*
+ * Mark the channel disconnected and clear all other flags, including
+ * XPC_C_SETUP (because of call to
+ * xpc_arch_ops.teardown_msg_structures()) but not including
+ * XPC_C_WDISCONNECT (if it was set).
+ */
+ ch->flags = (XPC_C_DISCONNECTED | (ch->flags & XPC_C_WDISCONNECT));
+
+ atomic_dec(&part->nchannels_active);
+
+ if (channel_was_connected) {
+ dev_info(xpc_chan, "channel %d to partition %d disconnected, "
+ "reason=%d\n", ch->number, ch->partid, ch->reason);
+ }
+
+ if (ch->flags & XPC_C_WDISCONNECT) {
+ /* we won't lose the CPU since we're holding ch->lock */
+ complete(&ch->wdisconnect_wait);
+ } else if (ch->delayed_chctl_flags) {
+ if (part->act_state != XPC_P_AS_DEACTIVATING) {
+ /* time to take action on any delayed chctl flags */
+ spin_lock(&part->chctl_lock);
+ part->chctl.flags[ch->number] |=
+ ch->delayed_chctl_flags;
+ spin_unlock(&part->chctl_lock);
+ }
+ ch->delayed_chctl_flags = 0;
+ }
+}
+
+/*
+ * Process a change in the channel's remote connection state.
+ */
+static void
+xpc_process_openclose_chctl_flags(struct xpc_partition *part, int ch_number,
+ u8 chctl_flags)
+{
+ unsigned long irq_flags;
+ struct xpc_openclose_args *args =
+ &part->remote_openclose_args[ch_number];
+ struct xpc_channel *ch = &part->channels[ch_number];
+ enum xp_retval reason;
+ enum xp_retval ret;
+ int create_kthread = 0;
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+
+again:
+
+ if ((ch->flags & XPC_C_DISCONNECTED) &&
+ (ch->flags & XPC_C_WDISCONNECT)) {
+ /*
+ * Delay processing chctl flags until thread waiting disconnect
+ * has had a chance to see that the channel is disconnected.
+ */
+ ch->delayed_chctl_flags |= chctl_flags;
+ goto out;
+ }
+
+ if (chctl_flags & XPC_CHCTL_CLOSEREQUEST) {
+
+ dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREQUEST (reason=%d) received "
+ "from partid=%d, channel=%d\n", args->reason,
+ ch->partid, ch->number);
+
+ /*
+ * If RCLOSEREQUEST is set, we're probably waiting for
+ * RCLOSEREPLY. We should find it and a ROPENREQUEST packed
+ * with this RCLOSEREQUEST in the chctl_flags.
+ */
+
+ if (ch->flags & XPC_C_RCLOSEREQUEST) {
+ DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
+ DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+ DBUG_ON(!(ch->flags & XPC_C_CLOSEREPLY));
+ DBUG_ON(ch->flags & XPC_C_RCLOSEREPLY);
+
+ DBUG_ON(!(chctl_flags & XPC_CHCTL_CLOSEREPLY));
+ chctl_flags &= ~XPC_CHCTL_CLOSEREPLY;
+ ch->flags |= XPC_C_RCLOSEREPLY;
+
+ /* both sides have finished disconnecting */
+ xpc_process_disconnect(ch, &irq_flags);
+ DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+ goto again;
+ }
+
+ if (ch->flags & XPC_C_DISCONNECTED) {
+ if (!(chctl_flags & XPC_CHCTL_OPENREQUEST)) {
+ if (part->chctl.flags[ch_number] &
+ XPC_CHCTL_OPENREQUEST) {
+
+ DBUG_ON(ch->delayed_chctl_flags != 0);
+ spin_lock(&part->chctl_lock);
+ part->chctl.flags[ch_number] |=
+ XPC_CHCTL_CLOSEREQUEST;
+ spin_unlock(&part->chctl_lock);
+ }
+ goto out;
+ }
+
+ XPC_SET_REASON(ch, 0, 0);
+ ch->flags &= ~XPC_C_DISCONNECTED;
+
+ atomic_inc(&part->nchannels_active);
+ ch->flags |= (XPC_C_CONNECTING | XPC_C_ROPENREQUEST);
+ }
+
+ chctl_flags &= ~(XPC_CHCTL_OPENREQUEST | XPC_CHCTL_OPENREPLY |
+ XPC_CHCTL_OPENCOMPLETE);
+
+ /*
+ * The meaningful CLOSEREQUEST connection state fields are:
+ * reason = reason connection is to be closed
+ */
+
+ ch->flags |= XPC_C_RCLOSEREQUEST;
+
+ if (!(ch->flags & XPC_C_DISCONNECTING)) {
+ reason = args->reason;
+ if (reason <= xpSuccess || reason > xpUnknownReason)
+ reason = xpUnknownReason;
+ else if (reason == xpUnregistering)
+ reason = xpOtherUnregistering;
+
+ XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
+
+ DBUG_ON(chctl_flags & XPC_CHCTL_CLOSEREPLY);
+ goto out;
+ }
+
+ xpc_process_disconnect(ch, &irq_flags);
+ }
+
+ if (chctl_flags & XPC_CHCTL_CLOSEREPLY) {
+
+ dev_dbg(xpc_chan, "XPC_CHCTL_CLOSEREPLY received from partid="
+ "%d, channel=%d\n", ch->partid, ch->number);
+
+ if (ch->flags & XPC_C_DISCONNECTED) {
+ DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING);
+ goto out;
+ }
+
+ DBUG_ON(!(ch->flags & XPC_C_CLOSEREQUEST));
+
+ if (!(ch->flags & XPC_C_RCLOSEREQUEST)) {
+ if (part->chctl.flags[ch_number] &
+ XPC_CHCTL_CLOSEREQUEST) {
+
+ DBUG_ON(ch->delayed_chctl_flags != 0);
+ spin_lock(&part->chctl_lock);
+ part->chctl.flags[ch_number] |=
+ XPC_CHCTL_CLOSEREPLY;
+ spin_unlock(&part->chctl_lock);
+ }
+ goto out;
+ }
+
+ ch->flags |= XPC_C_RCLOSEREPLY;
+
+ if (ch->flags & XPC_C_CLOSEREPLY) {
+ /* both sides have finished disconnecting */
+ xpc_process_disconnect(ch, &irq_flags);
+ }
+ }
+
+ if (chctl_flags & XPC_CHCTL_OPENREQUEST) {
+
+ dev_dbg(xpc_chan, "XPC_CHCTL_OPENREQUEST (entry_size=%d, "
+ "local_nentries=%d) received from partid=%d, "
+ "channel=%d\n", args->entry_size, args->local_nentries,
+ ch->partid, ch->number);
+
+ if (part->act_state == XPC_P_AS_DEACTIVATING ||
+ (ch->flags & XPC_C_ROPENREQUEST)) {
+ goto out;
+ }
+
+ if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_WDISCONNECT)) {
+ ch->delayed_chctl_flags |= XPC_CHCTL_OPENREQUEST;
+ goto out;
+ }
+ DBUG_ON(!(ch->flags & (XPC_C_DISCONNECTED |
+ XPC_C_OPENREQUEST)));
+ DBUG_ON(ch->flags & (XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
+ XPC_C_OPENREPLY | XPC_C_CONNECTED));
+
+ /*
+ * The meaningful OPENREQUEST connection state fields are:
+ * entry_size = size of channel's messages in bytes
+ * local_nentries = remote partition's local_nentries
+ */
+ if (args->entry_size == 0 || args->local_nentries == 0) {
+ /* assume OPENREQUEST was delayed by mistake */
+ goto out;
+ }
+
+ ch->flags |= (XPC_C_ROPENREQUEST | XPC_C_CONNECTING);
+ ch->remote_nentries = args->local_nentries;
+
+ if (ch->flags & XPC_C_OPENREQUEST) {
+ if (args->entry_size != ch->entry_size) {
+ XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes,
+ &irq_flags);
+ goto out;
+ }
+ } else {
+ ch->entry_size = args->entry_size;
+
+ XPC_SET_REASON(ch, 0, 0);
+ ch->flags &= ~XPC_C_DISCONNECTED;
+
+ atomic_inc(&part->nchannels_active);
+ }
+
+ xpc_process_connect(ch, &irq_flags);
+ }
+
+ if (chctl_flags & XPC_CHCTL_OPENREPLY) {
+
+ dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY (local_msgqueue_pa="
+ "0x%lx, local_nentries=%d, remote_nentries=%d) "
+ "received from partid=%d, channel=%d\n",
+ args->local_msgqueue_pa, args->local_nentries,
+ args->remote_nentries, ch->partid, ch->number);
+
+ if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+ goto out;
+
+ if (!(ch->flags & XPC_C_OPENREQUEST)) {
+ XPC_DISCONNECT_CHANNEL(ch, xpOpenCloseError,
+ &irq_flags);
+ goto out;
+ }
+
+ DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
+ DBUG_ON(ch->flags & XPC_C_CONNECTED);
+
+ /*
+ * The meaningful OPENREPLY connection state fields are:
+ * local_msgqueue_pa = physical address of remote
+ * partition's local_msgqueue
+ * local_nentries = remote partition's local_nentries
+ * remote_nentries = remote partition's remote_nentries
+ */
+ DBUG_ON(args->local_msgqueue_pa == 0);
+ DBUG_ON(args->local_nentries == 0);
+ DBUG_ON(args->remote_nentries == 0);
+
+ ret = xpc_arch_ops.save_remote_msgqueue_pa(ch,
+ args->local_msgqueue_pa);
+ if (ret != xpSuccess) {
+ XPC_DISCONNECT_CHANNEL(ch, ret, &irq_flags);
+ goto out;
+ }
+ ch->flags |= XPC_C_ROPENREPLY;
+
+ if (args->local_nentries < ch->remote_nentries) {
+ dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new "
+ "remote_nentries=%d, old remote_nentries=%d, "
+ "partid=%d, channel=%d\n",
+ args->local_nentries, ch->remote_nentries,
+ ch->partid, ch->number);
+
+ ch->remote_nentries = args->local_nentries;
+ }
+ if (args->remote_nentries < ch->local_nentries) {
+ dev_dbg(xpc_chan, "XPC_CHCTL_OPENREPLY: new "
+ "local_nentries=%d, old local_nentries=%d, "
+ "partid=%d, channel=%d\n",
+ args->remote_nentries, ch->local_nentries,
+ ch->partid, ch->number);
+
+ ch->local_nentries = args->remote_nentries;
+ }
+
+ xpc_process_connect(ch, &irq_flags);
+ }
+
+ if (chctl_flags & XPC_CHCTL_OPENCOMPLETE) {
+
+ dev_dbg(xpc_chan, "XPC_CHCTL_OPENCOMPLETE received from "
+ "partid=%d, channel=%d\n", ch->partid, ch->number);
+
+ if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+ goto out;
+
+ if (!(ch->flags & XPC_C_OPENREQUEST) ||
+ !(ch->flags & XPC_C_OPENREPLY)) {
+ XPC_DISCONNECT_CHANNEL(ch, xpOpenCloseError,
+ &irq_flags);
+ goto out;
+ }
+
+ DBUG_ON(!(ch->flags & XPC_C_ROPENREQUEST));
+ DBUG_ON(!(ch->flags & XPC_C_ROPENREPLY));
+ DBUG_ON(!(ch->flags & XPC_C_CONNECTED));
+
+ ch->flags |= XPC_C_ROPENCOMPLETE;
+
+ xpc_process_connect(ch, &irq_flags);
+ create_kthread = 1;
+ }
+
+out:
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ if (create_kthread)
+ xpc_create_kthreads(ch, 1, 0);
+}
+
+/*
+ * Attempt to establish a channel connection to a remote partition.
+ */
+static enum xp_retval
+xpc_connect_channel(struct xpc_channel *ch)
+{
+ unsigned long irq_flags;
+ struct xpc_registration *registration = &xpc_registrations[ch->number];
+
+ if (mutex_trylock(&registration->mutex) == 0)
+ return xpRetry;
+
+ if (!XPC_CHANNEL_REGISTERED(ch->number)) {
+ mutex_unlock(&registration->mutex);
+ return xpUnregistered;
+ }
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+
+ DBUG_ON(ch->flags & XPC_C_CONNECTED);
+ DBUG_ON(ch->flags & XPC_C_OPENREQUEST);
+
+ if (ch->flags & XPC_C_DISCONNECTING) {
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ mutex_unlock(&registration->mutex);
+ return ch->reason;
+ }
+
+ /* add info from the channel connect registration to the channel */
+
+ ch->kthreads_assigned_limit = registration->assigned_limit;
+ ch->kthreads_idle_limit = registration->idle_limit;
+ DBUG_ON(atomic_read(&ch->kthreads_assigned) != 0);
+ DBUG_ON(atomic_read(&ch->kthreads_idle) != 0);
+ DBUG_ON(atomic_read(&ch->kthreads_active) != 0);
+
+ ch->func = registration->func;
+ DBUG_ON(registration->func == NULL);
+ ch->key = registration->key;
+
+ ch->local_nentries = registration->nentries;
+
+ if (ch->flags & XPC_C_ROPENREQUEST) {
+ if (registration->entry_size != ch->entry_size) {
+ /* the local and remote sides aren't the same */
+
+ /*
+ * Because XPC_DISCONNECT_CHANNEL() can block we're
+ * forced to up the registration sema before we unlock
+ * the channel lock. But that's okay here because we're
+ * done with the part that required the registration
+ * sema. XPC_DISCONNECT_CHANNEL() requires that the
+ * channel lock be locked and will unlock and relock
+ * the channel lock as needed.
+ */
+ mutex_unlock(&registration->mutex);
+ XPC_DISCONNECT_CHANNEL(ch, xpUnequalMsgSizes,
+ &irq_flags);
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return xpUnequalMsgSizes;
+ }
+ } else {
+ ch->entry_size = registration->entry_size;
+
+ XPC_SET_REASON(ch, 0, 0);
+ ch->flags &= ~XPC_C_DISCONNECTED;
+
+ atomic_inc(&xpc_partitions[ch->partid].nchannels_active);
+ }
+
+ mutex_unlock(&registration->mutex);
+
+ /* initiate the connection */
+
+ ch->flags |= (XPC_C_OPENREQUEST | XPC_C_CONNECTING);
+ xpc_arch_ops.send_chctl_openrequest(ch, &irq_flags);
+
+ xpc_process_connect(ch, &irq_flags);
+
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ return xpSuccess;
+}
+
+void
+xpc_process_sent_chctl_flags(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ union xpc_channel_ctl_flags chctl;
+ struct xpc_channel *ch;
+ int ch_number;
+ u32 ch_flags;
+
+ chctl.all_flags = xpc_arch_ops.get_chctl_all_flags(part);
+
+ /*
+ * Initiate channel connections for registered channels.
+ *
+ * For each connected channel that has pending messages activate idle
+ * kthreads and/or create new kthreads as needed.
+ */
+
+ for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+ ch = &part->channels[ch_number];
+
+ /*
+ * Process any open or close related chctl flags, and then deal
+ * with connecting or disconnecting the channel as required.
+ */
+
+ if (chctl.flags[ch_number] & XPC_OPENCLOSE_CHCTL_FLAGS) {
+ xpc_process_openclose_chctl_flags(part, ch_number,
+ chctl.flags[ch_number]);
+ }
+
+ ch_flags = ch->flags; /* need an atomic snapshot of flags */
+
+ if (ch_flags & XPC_C_DISCONNECTING) {
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ xpc_process_disconnect(ch, &irq_flags);
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ continue;
+ }
+
+ if (part->act_state == XPC_P_AS_DEACTIVATING)
+ continue;
+
+ if (!(ch_flags & XPC_C_CONNECTED)) {
+ if (!(ch_flags & XPC_C_OPENREQUEST)) {
+ DBUG_ON(ch_flags & XPC_C_SETUP);
+ (void)xpc_connect_channel(ch);
+ }
+ continue;
+ }
+
+ /*
+ * Process any message related chctl flags, this may involve
+ * the activation of kthreads to deliver any pending messages
+ * sent from the other partition.
+ */
+
+ if (chctl.flags[ch_number] & XPC_MSG_CHCTL_FLAGS)
+ xpc_arch_ops.process_msg_chctl_flags(part, ch_number);
+ }
+}
+
+/*
+ * XPC's heartbeat code calls this function to inform XPC that a partition is
+ * going down. XPC responds by tearing down the XPartition Communication
+ * infrastructure used for the just downed partition.
+ *
+ * XPC's heartbeat code will never call this function and xpc_partition_up()
+ * at the same time. Nor will it ever make multiple calls to either function
+ * at the same time.
+ */
+void
+xpc_partition_going_down(struct xpc_partition *part, enum xp_retval reason)
+{
+ unsigned long irq_flags;
+ int ch_number;
+ struct xpc_channel *ch;
+
+ dev_dbg(xpc_chan, "deactivating partition %d, reason=%d\n",
+ XPC_PARTID(part), reason);
+
+ if (!xpc_part_ref(part)) {
+ /* infrastructure for this partition isn't currently set up */
+ return;
+ }
+
+ /* disconnect channels associated with the partition going down */
+
+ for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+ ch = &part->channels[ch_number];
+
+ xpc_msgqueue_ref(ch);
+ spin_lock_irqsave(&ch->lock, irq_flags);
+
+ XPC_DISCONNECT_CHANNEL(ch, reason, &irq_flags);
+
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ xpc_msgqueue_deref(ch);
+ }
+
+ xpc_wakeup_channel_mgr(part);
+
+ xpc_part_deref(part);
+}
+
+/*
+ * Called by XP at the time of channel connection registration to cause
+ * XPC to establish connections to all currently active partitions.
+ */
+void
+xpc_initiate_connect(int ch_number)
+{
+ short partid;
+ struct xpc_partition *part;
+
+ DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+ part = &xpc_partitions[partid];
+
+ if (xpc_part_ref(part)) {
+ /*
+ * Initiate the establishment of a connection on the
+ * newly registered channel to the remote partition.
+ */
+ xpc_wakeup_channel_mgr(part);
+ xpc_part_deref(part);
+ }
+ }
+}
+
+void
+xpc_connected_callout(struct xpc_channel *ch)
+{
+ /* let the registerer know that a connection has been established */
+
+ if (ch->func != NULL) {
+ dev_dbg(xpc_chan, "ch->func() called, reason=xpConnected, "
+ "partid=%d, channel=%d\n", ch->partid, ch->number);
+
+ ch->func(xpConnected, ch->partid, ch->number,
+ (void *)(u64)ch->local_nentries, ch->key);
+
+ dev_dbg(xpc_chan, "ch->func() returned, reason=xpConnected, "
+ "partid=%d, channel=%d\n", ch->partid, ch->number);
+ }
+}
+
+/*
+ * Called by XP at the time of channel connection unregistration to cause
+ * XPC to teardown all current connections for the specified channel.
+ *
+ * Before returning xpc_initiate_disconnect() will wait until all connections
+ * on the specified channel have been closed/torndown. So the caller can be
+ * assured that they will not be receiving any more callouts from XPC to the
+ * function they registered via xpc_connect().
+ *
+ * Arguments:
+ *
+ * ch_number - channel # to unregister.
+ */
+void
+xpc_initiate_disconnect(int ch_number)
+{
+ unsigned long irq_flags;
+ short partid;
+ struct xpc_partition *part;
+ struct xpc_channel *ch;
+
+ DBUG_ON(ch_number < 0 || ch_number >= XPC_MAX_NCHANNELS);
+
+ /* initiate the channel disconnect for every active partition */
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+ part = &xpc_partitions[partid];
+
+ if (xpc_part_ref(part)) {
+ ch = &part->channels[ch_number];
+ xpc_msgqueue_ref(ch);
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+
+ if (!(ch->flags & XPC_C_DISCONNECTED)) {
+ ch->flags |= XPC_C_WDISCONNECT;
+
+ XPC_DISCONNECT_CHANNEL(ch, xpUnregistering,
+ &irq_flags);
+ }
+
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ xpc_msgqueue_deref(ch);
+ xpc_part_deref(part);
+ }
+ }
+
+ xpc_disconnect_wait(ch_number);
+}
+
+/*
+ * To disconnect a channel, and reflect it back to all who may be waiting.
+ *
+ * An OPEN is not allowed until XPC_C_DISCONNECTING is cleared by
+ * xpc_process_disconnect(), and if set, XPC_C_WDISCONNECT is cleared by
+ * xpc_disconnect_wait().
+ *
+ * THE CHANNEL IS TO BE LOCKED BY THE CALLER AND WILL REMAIN LOCKED UPON RETURN.
+ */
+void
+xpc_disconnect_channel(const int line, struct xpc_channel *ch,
+ enum xp_retval reason, unsigned long *irq_flags)
+{
+ u32 channel_was_connected = (ch->flags & XPC_C_CONNECTED);
+
+ lockdep_assert_held(&ch->lock);
+
+ if (ch->flags & (XPC_C_DISCONNECTING | XPC_C_DISCONNECTED))
+ return;
+
+ DBUG_ON(!(ch->flags & (XPC_C_CONNECTING | XPC_C_CONNECTED)));
+
+ dev_dbg(xpc_chan, "reason=%d, line=%d, partid=%d, channel=%d\n",
+ reason, line, ch->partid, ch->number);
+
+ XPC_SET_REASON(ch, reason, line);
+
+ ch->flags |= (XPC_C_CLOSEREQUEST | XPC_C_DISCONNECTING);
+ /* some of these may not have been set */
+ ch->flags &= ~(XPC_C_OPENREQUEST | XPC_C_OPENREPLY |
+ XPC_C_ROPENREQUEST | XPC_C_ROPENREPLY |
+ XPC_C_CONNECTING | XPC_C_CONNECTED);
+
+ xpc_arch_ops.send_chctl_closerequest(ch, irq_flags);
+
+ if (channel_was_connected)
+ ch->flags |= XPC_C_WASCONNECTED;
+
+ spin_unlock_irqrestore(&ch->lock, *irq_flags);
+
+ /* wake all idle kthreads so they can exit */
+ if (atomic_read(&ch->kthreads_idle) > 0) {
+ wake_up_all(&ch->idle_wq);
+
+ } else if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+ !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
+ /* start a kthread that will do the xpDisconnecting callout */
+ xpc_create_kthreads(ch, 1, 1);
+ }
+
+ /* wake those waiting to allocate an entry from the local msg queue */
+ if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+ wake_up(&ch->msg_allocate_wq);
+
+ spin_lock_irqsave(&ch->lock, *irq_flags);
+}
+
+void
+xpc_disconnect_callout(struct xpc_channel *ch, enum xp_retval reason)
+{
+ /*
+ * Let the channel's registerer know that the channel is being
+ * disconnected. We don't want to do this if the registerer was never
+ * informed of a connection being made.
+ */
+
+ if (ch->func != NULL) {
+ dev_dbg(xpc_chan, "ch->func() called, reason=%d, partid=%d, "
+ "channel=%d\n", reason, ch->partid, ch->number);
+
+ ch->func(reason, ch->partid, ch->number, NULL, ch->key);
+
+ dev_dbg(xpc_chan, "ch->func() returned, reason=%d, partid=%d, "
+ "channel=%d\n", reason, ch->partid, ch->number);
+ }
+}
+
+/*
+ * Wait for a message entry to become available for the specified channel,
+ * but don't wait any longer than 1 jiffy.
+ */
+enum xp_retval
+xpc_allocate_msg_wait(struct xpc_channel *ch)
+{
+ enum xp_retval ret;
+ DEFINE_WAIT(wait);
+
+ if (ch->flags & XPC_C_DISCONNECTING) {
+ DBUG_ON(ch->reason == xpInterrupted);
+ return ch->reason;
+ }
+
+ atomic_inc(&ch->n_on_msg_allocate_wq);
+ prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE);
+ ret = schedule_timeout(1);
+ finish_wait(&ch->msg_allocate_wq, &wait);
+ atomic_dec(&ch->n_on_msg_allocate_wq);
+
+ if (ch->flags & XPC_C_DISCONNECTING) {
+ ret = ch->reason;
+ DBUG_ON(ch->reason == xpInterrupted);
+ } else if (ret == 0) {
+ ret = xpTimeout;
+ } else {
+ ret = xpInterrupted;
+ }
+
+ return ret;
+}
+
+/*
+ * Send a message that contains the user's payload on the specified channel
+ * connected to the specified partition.
+ *
+ * NOTE that this routine can sleep waiting for a message entry to become
+ * available. To not sleep, pass in the XPC_NOWAIT flag.
+ *
+ * Once sent, this routine will not wait for the message to be received, nor
+ * will notification be given when it does happen.
+ *
+ * Arguments:
+ *
+ * partid - ID of partition to which the channel is connected.
+ * ch_number - channel # to send message on.
+ * flags - see xp.h for valid flags.
+ * payload - pointer to the payload which is to be sent.
+ * payload_size - size of the payload in bytes.
+ */
+enum xp_retval
+xpc_initiate_send(short partid, int ch_number, u32 flags, void *payload,
+ u16 payload_size)
+{
+ struct xpc_partition *part = &xpc_partitions[partid];
+ enum xp_retval ret = xpUnknownReason;
+
+ dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload,
+ partid, ch_number);
+
+ DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+ DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+ DBUG_ON(payload == NULL);
+
+ if (xpc_part_ref(part)) {
+ ret = xpc_arch_ops.send_payload(&part->channels[ch_number],
+ flags, payload, payload_size, 0, NULL, NULL);
+ xpc_part_deref(part);
+ }
+
+ return ret;
+}
+
+/*
+ * Send a message that contains the user's payload on the specified channel
+ * connected to the specified partition.
+ *
+ * NOTE that this routine can sleep waiting for a message entry to become
+ * available. To not sleep, pass in the XPC_NOWAIT flag.
+ *
+ * This routine will not wait for the message to be sent or received.
+ *
+ * Once the remote end of the channel has received the message, the function
+ * passed as an argument to xpc_initiate_send_notify() will be called. This
+ * allows the sender to free up or re-use any buffers referenced by the
+ * message, but does NOT mean the message has been processed at the remote
+ * end by a receiver.
+ *
+ * If this routine returns an error, the caller's function will NOT be called.
+ *
+ * Arguments:
+ *
+ * partid - ID of partition to which the channel is connected.
+ * ch_number - channel # to send message on.
+ * flags - see xp.h for valid flags.
+ * payload - pointer to the payload which is to be sent.
+ * payload_size - size of the payload in bytes.
+ * func - function to call with asynchronous notification of message
+ * receipt. THIS FUNCTION MUST BE NON-BLOCKING.
+ * key - user-defined key to be passed to the function when it's called.
+ */
+enum xp_retval
+xpc_initiate_send_notify(short partid, int ch_number, u32 flags, void *payload,
+ u16 payload_size, xpc_notify_func func, void *key)
+{
+ struct xpc_partition *part = &xpc_partitions[partid];
+ enum xp_retval ret = xpUnknownReason;
+
+ dev_dbg(xpc_chan, "payload=0x%p, partid=%d, channel=%d\n", payload,
+ partid, ch_number);
+
+ DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+ DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+ DBUG_ON(payload == NULL);
+ DBUG_ON(func == NULL);
+
+ if (xpc_part_ref(part)) {
+ ret = xpc_arch_ops.send_payload(&part->channels[ch_number],
+ flags, payload, payload_size, XPC_N_CALL, func, key);
+ xpc_part_deref(part);
+ }
+ return ret;
+}
+
+/*
+ * Deliver a message's payload to its intended recipient.
+ */
+void
+xpc_deliver_payload(struct xpc_channel *ch)
+{
+ void *payload;
+
+ payload = xpc_arch_ops.get_deliverable_payload(ch);
+ if (payload != NULL) {
+
+ /*
+ * This ref is taken to protect the payload itself from being
+ * freed before the user is finished with it, which the user
+ * indicates by calling xpc_initiate_received().
+ */
+ xpc_msgqueue_ref(ch);
+
+ atomic_inc(&ch->kthreads_active);
+
+ if (ch->func != NULL) {
+ dev_dbg(xpc_chan, "ch->func() called, payload=0x%p "
+ "partid=%d channel=%d\n", payload, ch->partid,
+ ch->number);
+
+ /* deliver the message to its intended recipient */
+ ch->func(xpMsgReceived, ch->partid, ch->number, payload,
+ ch->key);
+
+ dev_dbg(xpc_chan, "ch->func() returned, payload=0x%p "
+ "partid=%d channel=%d\n", payload, ch->partid,
+ ch->number);
+ }
+
+ atomic_dec(&ch->kthreads_active);
+ }
+}
+
+/*
+ * Acknowledge receipt of a delivered message's payload.
+ *
+ * This function, although called by users, does not call xpc_part_ref() to
+ * ensure that the partition infrastructure is in place. It relies on the
+ * fact that we called xpc_msgqueue_ref() in xpc_deliver_payload().
+ *
+ * Arguments:
+ *
+ * partid - ID of partition to which the channel is connected.
+ * ch_number - channel # message received on.
+ * payload - pointer to the payload area allocated via
+ * xpc_initiate_send() or xpc_initiate_send_notify().
+ */
+void
+xpc_initiate_received(short partid, int ch_number, void *payload)
+{
+ struct xpc_partition *part = &xpc_partitions[partid];
+ struct xpc_channel *ch;
+
+ DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+ DBUG_ON(ch_number < 0 || ch_number >= part->nchannels);
+
+ ch = &part->channels[ch_number];
+ xpc_arch_ops.received_payload(ch, payload);
+
+ /* the call to xpc_msgqueue_ref() was done by xpc_deliver_payload() */
+ xpc_msgqueue_deref(ch);
+}
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
new file mode 100644
index 000000000..b2c3c22fc
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -0,0 +1,1349 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) support - standard version.
+ *
+ * XPC provides a message passing capability that crosses partition
+ * boundaries. This module is made up of two parts:
+ *
+ * partition This part detects the presence/absence of other
+ * partitions. It provides a heartbeat and monitors
+ * the heartbeats of other partitions.
+ *
+ * channel This part manages the channels and sends/receives
+ * messages across them to/from other partitions.
+ *
+ * There are a couple of additional functions residing in XP, which
+ * provide an interface to XPC for its users.
+ *
+ *
+ * Caveats:
+ *
+ * . Currently on sn2, we have no way to determine which nasid an IRQ
+ * came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
+ * followed by an IPI. The amo indicates where data is to be pulled
+ * from, so after the IPI arrives, the remote partition checks the amo
+ * word. The IPI can actually arrive before the amo however, so other
+ * code must periodically check for this case. Also, remote amo
+ * operations do not reliably time out. Thus we do a remote PIO read
+ * solely to know whether the remote partition is down and whether we
+ * should stop sending IPIs to it. This remote PIO read operation is
+ * set up in a special nofault region so SAL knows to ignore (and
+ * cleanup) any errors due to the remote amo write, PIO read, and/or
+ * PIO write operations.
+ *
+ * If/when new hardware solves this IPI problem, we should abandon
+ * the current approach.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/kdebug.h>
+#include <linux/kthread.h>
+#include "xpc.h"
+
+#ifdef CONFIG_X86_64
+#include <asm/traps.h>
+#endif
+
+/* define two XPC debug device structures to be used with dev_dbg() et al */
+
+static struct device_driver xpc_dbg_name = {
+ .name = "xpc"
+};
+
+static struct device xpc_part_dbg_subname = {
+ .init_name = "", /* set to "part" at xpc_init() time */
+ .driver = &xpc_dbg_name
+};
+
+static struct device xpc_chan_dbg_subname = {
+ .init_name = "", /* set to "chan" at xpc_init() time */
+ .driver = &xpc_dbg_name
+};
+
+struct device *xpc_part = &xpc_part_dbg_subname;
+struct device *xpc_chan = &xpc_chan_dbg_subname;
+
+static int xpc_kdebug_ignore;
+
+/* systune related variables for /proc/sys directories */
+
+static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
+static int xpc_hb_min_interval = 1;
+static int xpc_hb_max_interval = 10;
+
+static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
+static int xpc_hb_check_min_interval = 10;
+static int xpc_hb_check_max_interval = 120;
+
+int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
+static int xpc_disengage_min_timelimit; /* = 0 */
+static int xpc_disengage_max_timelimit = 120;
+
+static struct ctl_table xpc_sys_xpc_hb_dir[] = {
+ {
+ .procname = "hb_interval",
+ .data = &xpc_hb_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xpc_hb_min_interval,
+ .extra2 = &xpc_hb_max_interval},
+ {
+ .procname = "hb_check_interval",
+ .data = &xpc_hb_check_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xpc_hb_check_min_interval,
+ .extra2 = &xpc_hb_check_max_interval},
+ {}
+};
+static struct ctl_table xpc_sys_xpc_dir[] = {
+ {
+ .procname = "hb",
+ .mode = 0555,
+ .child = xpc_sys_xpc_hb_dir},
+ {
+ .procname = "disengage_timelimit",
+ .data = &xpc_disengage_timelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xpc_disengage_min_timelimit,
+ .extra2 = &xpc_disengage_max_timelimit},
+ {}
+};
+static struct ctl_table xpc_sys_dir[] = {
+ {
+ .procname = "xpc",
+ .mode = 0555,
+ .child = xpc_sys_xpc_dir},
+ {}
+};
+static struct ctl_table_header *xpc_sysctl;
+
+/* non-zero if any remote partition disengage was timed out */
+int xpc_disengage_timedout;
+
+/* #of activate IRQs received and not yet processed */
+int xpc_activate_IRQ_rcvd;
+DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);
+
+/* IRQ handler notifies this wait queue on receipt of an IRQ */
+DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);
+
+static unsigned long xpc_hb_check_timeout;
+static struct timer_list xpc_hb_timer;
+
+/* notification that the xpc_hb_checker thread has exited */
+static DECLARE_COMPLETION(xpc_hb_checker_exited);
+
+/* notification that the xpc_discovery thread has exited */
+static DECLARE_COMPLETION(xpc_discovery_exited);
+
+static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
+
+static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_reboot_notifier = {
+ .notifier_call = xpc_system_reboot,
+};
+
+static int xpc_system_die(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xpc_die_notifier = {
+ .notifier_call = xpc_system_die,
+};
+
+struct xpc_arch_operations xpc_arch_ops;
+
+/*
+ * Timer function to enforce the timelimit on the partition disengage.
+ */
+static void
+xpc_timeout_partition_disengage(struct timer_list *t)
+{
+ struct xpc_partition *part = from_timer(part, t, disengage_timer);
+
+ DBUG_ON(time_is_after_jiffies(part->disengage_timeout));
+
+ xpc_partition_disengaged_from_timer(part);
+
+ DBUG_ON(part->disengage_timeout != 0);
+ DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
+}
+
+/*
+ * Timer to produce the heartbeat. The timer structures function is
+ * already set when this is initially called. A tunable is used to
+ * specify when the next timeout should occur.
+ */
+static void
+xpc_hb_beater(struct timer_list *unused)
+{
+ xpc_arch_ops.increment_heartbeat();
+
+ if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
+ wake_up_interruptible(&xpc_activate_IRQ_wq);
+
+ xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
+ add_timer(&xpc_hb_timer);
+}
+
+static void
+xpc_start_hb_beater(void)
+{
+ xpc_arch_ops.heartbeat_init();
+ timer_setup(&xpc_hb_timer, xpc_hb_beater, 0);
+ xpc_hb_beater(NULL);
+}
+
+static void
+xpc_stop_hb_beater(void)
+{
+ del_timer_sync(&xpc_hb_timer);
+ xpc_arch_ops.heartbeat_exit();
+}
+
+/*
+ * At periodic intervals, scan through all active partitions and ensure
+ * their heartbeat is still active. If not, the partition is deactivated.
+ */
+static void
+xpc_check_remote_hb(void)
+{
+ struct xpc_partition *part;
+ short partid;
+ enum xp_retval ret;
+
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+
+ if (xpc_exiting)
+ break;
+
+ if (partid == xp_partition_id)
+ continue;
+
+ part = &xpc_partitions[partid];
+
+ if (part->act_state == XPC_P_AS_INACTIVE ||
+ part->act_state == XPC_P_AS_DEACTIVATING) {
+ continue;
+ }
+
+ ret = xpc_arch_ops.get_remote_heartbeat(part);
+ if (ret != xpSuccess)
+ XPC_DEACTIVATE_PARTITION(part, ret);
+ }
+}
+
+/*
+ * This thread is responsible for nearly all of the partition
+ * activation/deactivation.
+ */
+static int
+xpc_hb_checker(void *ignore)
+{
+ int force_IRQ = 0;
+
+ /* this thread was marked active by xpc_hb_init() */
+
+ set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
+
+ /* set our heartbeating to other partitions into motion */
+ xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
+ xpc_start_hb_beater();
+
+ while (!xpc_exiting) {
+
+ dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
+ "been received\n",
+ (int)(xpc_hb_check_timeout - jiffies),
+ xpc_activate_IRQ_rcvd);
+
+ /* checking of remote heartbeats is skewed by IRQ handling */
+ if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
+ xpc_hb_check_timeout = jiffies +
+ (xpc_hb_check_interval * HZ);
+
+ dev_dbg(xpc_part, "checking remote heartbeats\n");
+ xpc_check_remote_hb();
+ }
+
+ /* check for outstanding IRQs */
+ if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
+ force_IRQ = 0;
+ dev_dbg(xpc_part, "processing activate IRQs "
+ "received\n");
+ xpc_arch_ops.process_activate_IRQ_rcvd();
+ }
+
+ /* wait for IRQ or timeout */
+ (void)wait_event_interruptible(xpc_activate_IRQ_wq,
+ (time_is_before_eq_jiffies(
+ xpc_hb_check_timeout) ||
+ xpc_activate_IRQ_rcvd > 0 ||
+ xpc_exiting));
+ }
+
+ xpc_stop_hb_beater();
+
+ dev_dbg(xpc_part, "heartbeat checker is exiting\n");
+
+ /* mark this thread as having exited */
+ complete(&xpc_hb_checker_exited);
+ return 0;
+}
+
+/*
+ * This thread will attempt to discover other partitions to activate
+ * based on info provided by SAL. This new thread is short lived and
+ * will exit once discovery is complete.
+ */
+static int
+xpc_initiate_discovery(void *ignore)
+{
+ xpc_discovery();
+
+ dev_dbg(xpc_part, "discovery thread is exiting\n");
+
+ /* mark this thread as having exited */
+ complete(&xpc_discovery_exited);
+ return 0;
+}
+
+/*
+ * The first kthread assigned to a newly activated partition is the one
+ * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
+ * that kthread until the partition is brought down, at which time that kthread
+ * returns back to XPC HB. (The return of that kthread will signify to XPC HB
+ * that XPC has dismantled all communication infrastructure for the associated
+ * partition.) This kthread becomes the channel manager for that partition.
+ *
+ * Each active partition has a channel manager, who, besides connecting and
+ * disconnecting channels, will ensure that each of the partition's connected
+ * channels has the required number of assigned kthreads to get the work done.
+ */
+static void
+xpc_channel_mgr(struct xpc_partition *part)
+{
+ while (part->act_state != XPC_P_AS_DEACTIVATING ||
+ atomic_read(&part->nchannels_active) > 0 ||
+ !xpc_partition_disengaged(part)) {
+
+ xpc_process_sent_chctl_flags(part);
+
+ /*
+ * Wait until we've been requested to activate kthreads or
+ * all of the channel's message queues have been torn down or
+ * a signal is pending.
+ *
+ * The channel_mgr_requests is set to 1 after being awakened,
+ * This is done to prevent the channel mgr from making one pass
+ * through the loop for each request, since he will
+ * be servicing all the requests in one pass. The reason it's
+ * set to 1 instead of 0 is so that other kthreads will know
+ * that the channel mgr is running and won't bother trying to
+ * wake him up.
+ */
+ atomic_dec(&part->channel_mgr_requests);
+ (void)wait_event_interruptible(part->channel_mgr_wq,
+ (atomic_read(&part->channel_mgr_requests) > 0 ||
+ part->chctl.all_flags != 0 ||
+ (part->act_state == XPC_P_AS_DEACTIVATING &&
+ atomic_read(&part->nchannels_active) == 0 &&
+ xpc_partition_disengaged(part))));
+ atomic_set(&part->channel_mgr_requests, 1);
+ }
+}
+
+/*
+ * Guarantee that the kzalloc'd memory is cacheline aligned.
+ */
+void *
+xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+ /* see if kzalloc will give us cachline aligned memory by default */
+ *base = kzalloc(size, flags);
+ if (*base == NULL)
+ return NULL;
+
+ if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
+ return *base;
+
+ kfree(*base);
+
+ /* nope, we'll have to do it ourselves */
+ *base = kzalloc(size + L1_CACHE_BYTES, flags);
+ if (*base == NULL)
+ return NULL;
+
+ return (void *)L1_CACHE_ALIGN((u64)*base);
+}
+
+/*
+ * Setup the channel structures necessary to support XPartition Communication
+ * between the specified remote partition and the local one.
+ */
+static enum xp_retval
+xpc_setup_ch_structures(struct xpc_partition *part)
+{
+ enum xp_retval ret;
+ int ch_number;
+ struct xpc_channel *ch;
+ short partid = XPC_PARTID(part);
+
+ /*
+ * Allocate all of the channel structures as a contiguous chunk of
+ * memory.
+ */
+ DBUG_ON(part->channels != NULL);
+ part->channels = kcalloc(XPC_MAX_NCHANNELS,
+ sizeof(struct xpc_channel),
+ GFP_KERNEL);
+ if (part->channels == NULL) {
+ dev_err(xpc_chan, "can't get memory for channels\n");
+ return xpNoMemory;
+ }
+
+ /* allocate the remote open and close args */
+
+ part->remote_openclose_args =
+ xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
+ GFP_KERNEL, &part->
+ remote_openclose_args_base);
+ if (part->remote_openclose_args == NULL) {
+ dev_err(xpc_chan, "can't get memory for remote connect args\n");
+ ret = xpNoMemory;
+ goto out_1;
+ }
+
+ part->chctl.all_flags = 0;
+ spin_lock_init(&part->chctl_lock);
+
+ atomic_set(&part->channel_mgr_requests, 1);
+ init_waitqueue_head(&part->channel_mgr_wq);
+
+ part->nchannels = XPC_MAX_NCHANNELS;
+
+ atomic_set(&part->nchannels_active, 0);
+ atomic_set(&part->nchannels_engaged, 0);
+
+ for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+ ch = &part->channels[ch_number];
+
+ ch->partid = partid;
+ ch->number = ch_number;
+ ch->flags = XPC_C_DISCONNECTED;
+
+ atomic_set(&ch->kthreads_assigned, 0);
+ atomic_set(&ch->kthreads_idle, 0);
+ atomic_set(&ch->kthreads_active, 0);
+
+ atomic_set(&ch->references, 0);
+ atomic_set(&ch->n_to_notify, 0);
+
+ spin_lock_init(&ch->lock);
+ init_completion(&ch->wdisconnect_wait);
+
+ atomic_set(&ch->n_on_msg_allocate_wq, 0);
+ init_waitqueue_head(&ch->msg_allocate_wq);
+ init_waitqueue_head(&ch->idle_wq);
+ }
+
+ ret = xpc_arch_ops.setup_ch_structures(part);
+ if (ret != xpSuccess)
+ goto out_2;
+
+ /*
+ * With the setting of the partition setup_state to XPC_P_SS_SETUP,
+ * we're declaring that this partition is ready to go.
+ */
+ part->setup_state = XPC_P_SS_SETUP;
+
+ return xpSuccess;
+
+ /* setup of ch structures failed */
+out_2:
+ kfree(part->remote_openclose_args_base);
+ part->remote_openclose_args = NULL;
+out_1:
+ kfree(part->channels);
+ part->channels = NULL;
+ return ret;
+}
+
+/*
+ * Teardown the channel structures necessary to support XPartition Communication
+ * between the specified remote partition and the local one.
+ */
+static void
+xpc_teardown_ch_structures(struct xpc_partition *part)
+{
+ DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
+ DBUG_ON(atomic_read(&part->nchannels_active) != 0);
+
+ /*
+ * Make this partition inaccessible to local processes by marking it
+ * as no longer setup. Then wait before proceeding with the teardown
+ * until all existing references cease.
+ */
+ DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
+ part->setup_state = XPC_P_SS_WTEARDOWN;
+
+ wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
+
+ /* now we can begin tearing down the infrastructure */
+
+ xpc_arch_ops.teardown_ch_structures(part);
+
+ kfree(part->remote_openclose_args_base);
+ part->remote_openclose_args = NULL;
+ kfree(part->channels);
+ part->channels = NULL;
+
+ part->setup_state = XPC_P_SS_TORNDOWN;
+}
+
+/*
+ * When XPC HB determines that a partition has come up, it will create a new
+ * kthread and that kthread will call this function to attempt to set up the
+ * basic infrastructure used for Cross Partition Communication with the newly
+ * upped partition.
+ *
+ * The kthread that was created by XPC HB and which setup the XPC
+ * infrastructure will remain assigned to the partition becoming the channel
+ * manager for that partition until the partition is deactivating, at which
+ * time the kthread will teardown the XPC infrastructure and then exit.
+ */
+static int
+xpc_activating(void *__partid)
+{
+ short partid = (u64)__partid;
+ struct xpc_partition *part = &xpc_partitions[partid];
+ unsigned long irq_flags;
+
+ DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+
+ spin_lock_irqsave(&part->act_lock, irq_flags);
+
+ if (part->act_state == XPC_P_AS_DEACTIVATING) {
+ part->act_state = XPC_P_AS_INACTIVE;
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+ part->remote_rp_pa = 0;
+ return 0;
+ }
+
+ /* indicate the thread is activating */
+ DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
+ part->act_state = XPC_P_AS_ACTIVATING;
+
+ XPC_SET_REASON(part, 0, 0);
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+ dev_dbg(xpc_part, "activating partition %d\n", partid);
+
+ xpc_arch_ops.allow_hb(partid);
+
+ if (xpc_setup_ch_structures(part) == xpSuccess) {
+ (void)xpc_part_ref(part); /* this will always succeed */
+
+ if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
+ xpc_mark_partition_active(part);
+ xpc_channel_mgr(part);
+ /* won't return until partition is deactivating */
+ }
+
+ xpc_part_deref(part);
+ xpc_teardown_ch_structures(part);
+ }
+
+ xpc_arch_ops.disallow_hb(partid);
+ xpc_mark_partition_inactive(part);
+
+ if (part->reason == xpReactivating) {
+ /* interrupting ourselves results in activating partition */
+ xpc_arch_ops.request_partition_reactivation(part);
+ }
+
+ return 0;
+}
+
+void
+xpc_activate_partition(struct xpc_partition *part)
+{
+ short partid = XPC_PARTID(part);
+ unsigned long irq_flags;
+ struct task_struct *kthread;
+
+ spin_lock_irqsave(&part->act_lock, irq_flags);
+
+ DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);
+
+ part->act_state = XPC_P_AS_ACTIVATION_REQ;
+ XPC_SET_REASON(part, xpCloneKThread, __LINE__);
+
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+ kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
+ partid);
+ if (IS_ERR(kthread)) {
+ spin_lock_irqsave(&part->act_lock, irq_flags);
+ part->act_state = XPC_P_AS_INACTIVE;
+ XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+ }
+}
+
+void
+xpc_activate_kthreads(struct xpc_channel *ch, int needed)
+{
+ int idle = atomic_read(&ch->kthreads_idle);
+ int assigned = atomic_read(&ch->kthreads_assigned);
+ int wakeup;
+
+ DBUG_ON(needed <= 0);
+
+ if (idle > 0) {
+ wakeup = (needed > idle) ? idle : needed;
+ needed -= wakeup;
+
+ dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
+ "channel=%d\n", wakeup, ch->partid, ch->number);
+
+ /* only wakeup the requested number of kthreads */
+ wake_up_nr(&ch->idle_wq, wakeup);
+ }
+
+ if (needed <= 0)
+ return;
+
+ if (needed + assigned > ch->kthreads_assigned_limit) {
+ needed = ch->kthreads_assigned_limit - assigned;
+ if (needed <= 0)
+ return;
+ }
+
+ dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
+ needed, ch->partid, ch->number);
+
+ xpc_create_kthreads(ch, needed, 0);
+}
+
+/*
+ * This function is where XPC's kthreads wait for messages to deliver.
+ */
+static void
+xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
+{
+ int (*n_of_deliverable_payloads) (struct xpc_channel *) =
+ xpc_arch_ops.n_of_deliverable_payloads;
+
+ do {
+ /* deliver messages to their intended recipients */
+
+ while (n_of_deliverable_payloads(ch) > 0 &&
+ !(ch->flags & XPC_C_DISCONNECTING)) {
+ xpc_deliver_payload(ch);
+ }
+
+ if (atomic_inc_return(&ch->kthreads_idle) >
+ ch->kthreads_idle_limit) {
+ /* too many idle kthreads on this channel */
+ atomic_dec(&ch->kthreads_idle);
+ break;
+ }
+
+ dev_dbg(xpc_chan, "idle kthread calling "
+ "wait_event_interruptible_exclusive()\n");
+
+ (void)wait_event_interruptible_exclusive(ch->idle_wq,
+ (n_of_deliverable_payloads(ch) > 0 ||
+ (ch->flags & XPC_C_DISCONNECTING)));
+
+ atomic_dec(&ch->kthreads_idle);
+
+ } while (!(ch->flags & XPC_C_DISCONNECTING));
+}
+
+static int
+xpc_kthread_start(void *args)
+{
+ short partid = XPC_UNPACK_ARG1(args);
+ u16 ch_number = XPC_UNPACK_ARG2(args);
+ struct xpc_partition *part = &xpc_partitions[partid];
+ struct xpc_channel *ch;
+ int n_needed;
+ unsigned long irq_flags;
+ int (*n_of_deliverable_payloads) (struct xpc_channel *) =
+ xpc_arch_ops.n_of_deliverable_payloads;
+
+ dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
+ partid, ch_number);
+
+ ch = &part->channels[ch_number];
+
+ if (!(ch->flags & XPC_C_DISCONNECTING)) {
+
+ /* let registerer know that connection has been established */
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
+ ch->flags |= XPC_C_CONNECTEDCALLOUT;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ xpc_connected_callout(ch);
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ /*
+ * It is possible that while the callout was being
+ * made that the remote partition sent some messages.
+ * If that is the case, we may need to activate
+ * additional kthreads to help deliver them. We only
+ * need one less than total #of messages to deliver.
+ */
+ n_needed = n_of_deliverable_payloads(ch) - 1;
+ if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
+ xpc_activate_kthreads(ch, n_needed);
+
+ } else {
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ }
+
+ xpc_kthread_waitmsgs(part, ch);
+ }
+
+ /* let registerer know that connection is disconnecting */
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
+ !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
+ ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ xpc_disconnect_callout(ch, xpDisconnecting);
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
+ }
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
+ atomic_dec_return(&part->nchannels_engaged) == 0) {
+ xpc_arch_ops.indicate_partition_disengaged(part);
+ }
+
+ xpc_msgqueue_deref(ch);
+
+ dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
+ partid, ch_number);
+
+ xpc_part_deref(part);
+ return 0;
+}
+
+/*
+ * For each partition that XPC has established communications with, there is
+ * a minimum of one kernel thread assigned to perform any operation that
+ * may potentially sleep or block (basically the callouts to the asynchronous
+ * functions registered via xpc_connect()).
+ *
+ * Additional kthreads are created and destroyed by XPC as the workload
+ * demands.
+ *
+ * A kthread is assigned to one of the active channels that exists for a given
+ * partition.
+ */
+void
+xpc_create_kthreads(struct xpc_channel *ch, int needed,
+ int ignore_disconnecting)
+{
+ unsigned long irq_flags;
+ u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
+ struct xpc_partition *part = &xpc_partitions[ch->partid];
+ struct task_struct *kthread;
+ void (*indicate_partition_disengaged) (struct xpc_partition *) =
+ xpc_arch_ops.indicate_partition_disengaged;
+
+ while (needed-- > 0) {
+
+ /*
+ * The following is done on behalf of the newly created
+ * kthread. That kthread is responsible for doing the
+ * counterpart to the following before it exits.
+ */
+ if (ignore_disconnecting) {
+ if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
+ /* kthreads assigned had gone to zero */
+ BUG_ON(!(ch->flags &
+ XPC_C_DISCONNECTINGCALLOUT_MADE));
+ break;
+ }
+
+ } else if (ch->flags & XPC_C_DISCONNECTING) {
+ break;
+
+ } else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
+ atomic_inc_return(&part->nchannels_engaged) == 1) {
+ xpc_arch_ops.indicate_partition_engaged(part);
+ }
+ (void)xpc_part_ref(part);
+ xpc_msgqueue_ref(ch);
+
+ kthread = kthread_run(xpc_kthread_start, (void *)args,
+ "xpc%02dc%d", ch->partid, ch->number);
+ if (IS_ERR(kthread)) {
+ /* the fork failed */
+
+ /*
+ * NOTE: if (ignore_disconnecting &&
+ * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
+ * then we'll deadlock if all other kthreads assigned
+ * to this channel are blocked in the channel's
+ * registerer, because the only thing that will unblock
+ * them is the xpDisconnecting callout that this
+ * failed kthread_run() would have made.
+ */
+
+ if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
+ atomic_dec_return(&part->nchannels_engaged) == 0) {
+ indicate_partition_disengaged(part);
+ }
+ xpc_msgqueue_deref(ch);
+ xpc_part_deref(part);
+
+ if (atomic_read(&ch->kthreads_assigned) <
+ ch->kthreads_idle_limit) {
+ /*
+ * Flag this as an error only if we have an
+ * insufficient #of kthreads for the channel
+ * to function.
+ */
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
+ &irq_flags);
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ }
+ break;
+ }
+ }
+}
+
+void
+xpc_disconnect_wait(int ch_number)
+{
+ unsigned long irq_flags;
+ short partid;
+ struct xpc_partition *part;
+ struct xpc_channel *ch;
+ int wakeup_channel_mgr;
+
+ /* now wait for all callouts to the caller's function to cease */
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+ part = &xpc_partitions[partid];
+
+ if (!xpc_part_ref(part))
+ continue;
+
+ ch = &part->channels[ch_number];
+
+ if (!(ch->flags & XPC_C_WDISCONNECT)) {
+ xpc_part_deref(part);
+ continue;
+ }
+
+ wait_for_completion(&ch->wdisconnect_wait);
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
+ wakeup_channel_mgr = 0;
+
+ if (ch->delayed_chctl_flags) {
+ if (part->act_state != XPC_P_AS_DEACTIVATING) {
+ spin_lock(&part->chctl_lock);
+ part->chctl.flags[ch->number] |=
+ ch->delayed_chctl_flags;
+ spin_unlock(&part->chctl_lock);
+ wakeup_channel_mgr = 1;
+ }
+ ch->delayed_chctl_flags = 0;
+ }
+
+ ch->flags &= ~XPC_C_WDISCONNECT;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+
+ if (wakeup_channel_mgr)
+ xpc_wakeup_channel_mgr(part);
+
+ xpc_part_deref(part);
+ }
+}
+
+static int
+xpc_setup_partitions(void)
+{
+ short partid;
+ struct xpc_partition *part;
+
+ xpc_partitions = kcalloc(xp_max_npartitions,
+ sizeof(struct xpc_partition),
+ GFP_KERNEL);
+ if (xpc_partitions == NULL) {
+ dev_err(xpc_part, "can't get memory for partition structure\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * The first few fields of each entry of xpc_partitions[] need to
+ * be initialized now so that calls to xpc_connect() and
+ * xpc_disconnect() can be made prior to the activation of any remote
+ * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
+ * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
+ * PARTITION HAS BEEN ACTIVATED.
+ */
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+ part = &xpc_partitions[partid];
+
+ DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
+
+ part->activate_IRQ_rcvd = 0;
+ spin_lock_init(&part->act_lock);
+ part->act_state = XPC_P_AS_INACTIVE;
+ XPC_SET_REASON(part, 0, 0);
+
+ timer_setup(&part->disengage_timer,
+ xpc_timeout_partition_disengage, 0);
+
+ part->setup_state = XPC_P_SS_UNSET;
+ init_waitqueue_head(&part->teardown_wq);
+ atomic_set(&part->references, 0);
+ }
+
+ return xpc_arch_ops.setup_partitions();
+}
+
+static void
+xpc_teardown_partitions(void)
+{
+ xpc_arch_ops.teardown_partitions();
+ kfree(xpc_partitions);
+}
+
+static void
+xpc_do_exit(enum xp_retval reason)
+{
+ short partid;
+ int active_part_count, printed_waiting_msg = 0;
+ struct xpc_partition *part;
+ unsigned long printmsg_time, disengage_timeout = 0;
+
+ /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
+ DBUG_ON(xpc_exiting == 1);
+
+ /*
+ * Let the heartbeat checker thread and the discovery thread
+ * (if one is running) know that they should exit. Also wake up
+ * the heartbeat checker thread in case it's sleeping.
+ */
+ xpc_exiting = 1;
+ wake_up_interruptible(&xpc_activate_IRQ_wq);
+
+ /* wait for the discovery thread to exit */
+ wait_for_completion(&xpc_discovery_exited);
+
+ /* wait for the heartbeat checker thread to exit */
+ wait_for_completion(&xpc_hb_checker_exited);
+
+ /* sleep for a 1/3 of a second or so */
+ (void)msleep_interruptible(300);
+
+ /* wait for all partitions to become inactive */
+
+ printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
+ xpc_disengage_timedout = 0;
+
+ do {
+ active_part_count = 0;
+
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+ part = &xpc_partitions[partid];
+
+ if (xpc_partition_disengaged(part) &&
+ part->act_state == XPC_P_AS_INACTIVE) {
+ continue;
+ }
+
+ active_part_count++;
+
+ XPC_DEACTIVATE_PARTITION(part, reason);
+
+ if (part->disengage_timeout > disengage_timeout)
+ disengage_timeout = part->disengage_timeout;
+ }
+
+ if (xpc_arch_ops.any_partition_engaged()) {
+ if (time_is_before_jiffies(printmsg_time)) {
+ dev_info(xpc_part, "waiting for remote "
+ "partitions to deactivate, timeout in "
+ "%ld seconds\n", (disengage_timeout -
+ jiffies) / HZ);
+ printmsg_time = jiffies +
+ (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
+ printed_waiting_msg = 1;
+ }
+
+ } else if (active_part_count > 0) {
+ if (printed_waiting_msg) {
+ dev_info(xpc_part, "waiting for local partition"
+ " to deactivate\n");
+ printed_waiting_msg = 0;
+ }
+
+ } else {
+ if (!xpc_disengage_timedout) {
+ dev_info(xpc_part, "all partitions have "
+ "deactivated\n");
+ }
+ break;
+ }
+
+ /* sleep for a 1/3 of a second or so */
+ (void)msleep_interruptible(300);
+
+ } while (1);
+
+ DBUG_ON(xpc_arch_ops.any_partition_engaged());
+
+ xpc_teardown_rsvd_page();
+
+ if (reason == xpUnloading) {
+ (void)unregister_die_notifier(&xpc_die_notifier);
+ (void)unregister_reboot_notifier(&xpc_reboot_notifier);
+ }
+
+ /* clear the interface to XPC's functions */
+ xpc_clear_interface();
+
+ if (xpc_sysctl)
+ unregister_sysctl_table(xpc_sysctl);
+
+ xpc_teardown_partitions();
+
+ if (is_uv_system())
+ xpc_exit_uv();
+}
+
+/*
+ * This function is called when the system is being rebooted.
+ */
+static int
+xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
+{
+ enum xp_retval reason;
+
+ switch (event) {
+ case SYS_RESTART:
+ reason = xpSystemReboot;
+ break;
+ case SYS_HALT:
+ reason = xpSystemHalt;
+ break;
+ case SYS_POWER_OFF:
+ reason = xpSystemPoweroff;
+ break;
+ default:
+ reason = xpSystemGoingDown;
+ }
+
+ xpc_do_exit(reason);
+ return NOTIFY_DONE;
+}
+
+/* Used to only allow one cpu to complete disconnect */
+static unsigned int xpc_die_disconnecting;
+
+/*
+ * Notify other partitions to deactivate from us by first disengaging from all
+ * references to our memory.
+ */
+static void
+xpc_die_deactivate(void)
+{
+ struct xpc_partition *part;
+ short partid;
+ int any_engaged;
+ long keep_waiting;
+ long wait_to_print;
+
+ if (cmpxchg(&xpc_die_disconnecting, 0, 1))
+ return;
+
+ /* keep xpc_hb_checker thread from doing anything (just in case) */
+ xpc_exiting = 1;
+
+ xpc_arch_ops.disallow_all_hbs(); /*indicate we're deactivated */
+
+ for (partid = 0; partid < xp_max_npartitions; partid++) {
+ part = &xpc_partitions[partid];
+
+ if (xpc_arch_ops.partition_engaged(partid) ||
+ part->act_state != XPC_P_AS_INACTIVE) {
+ xpc_arch_ops.request_partition_deactivation(part);
+ xpc_arch_ops.indicate_partition_disengaged(part);
+ }
+ }
+
+ /*
+ * Though we requested that all other partitions deactivate from us,
+ * we only wait until they've all disengaged or we've reached the
+ * defined timelimit.
+ *
+ * Given that one iteration through the following while-loop takes
+ * approximately 200 microseconds, calculate the #of loops to take
+ * before bailing and the #of loops before printing a waiting message.
+ */
+ keep_waiting = xpc_disengage_timelimit * 1000 * 5;
+ wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
+
+ while (1) {
+ any_engaged = xpc_arch_ops.any_partition_engaged();
+ if (!any_engaged) {
+ dev_info(xpc_part, "all partitions have deactivated\n");
+ break;
+ }
+
+ if (!keep_waiting--) {
+ for (partid = 0; partid < xp_max_npartitions;
+ partid++) {
+ if (xpc_arch_ops.partition_engaged(partid)) {
+ dev_info(xpc_part, "deactivate from "
+ "remote partition %d timed "
+ "out\n", partid);
+ }
+ }
+ break;
+ }
+
+ if (!wait_to_print--) {
+ dev_info(xpc_part, "waiting for remote partitions to "
+ "deactivate, timeout in %ld seconds\n",
+ keep_waiting / (1000 * 5));
+ wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
+ 1000 * 5;
+ }
+
+ udelay(200);
+ }
+}
+
+/*
+ * This function is called when the system is being restarted or halted due
+ * to some sort of system failure. If this is the case we need to notify the
+ * other partitions to disengage from all references to our memory.
+ * This function can also be called when our heartbeater could be offlined
+ * for a time. In this case we need to notify other partitions to not worry
+ * about the lack of a heartbeat.
+ */
+static int
+xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
+{
+#ifdef CONFIG_IA64 /* !!! temporary kludge */
+ switch (event) {
+ case DIE_MACHINE_RESTART:
+ case DIE_MACHINE_HALT:
+ xpc_die_deactivate();
+ break;
+
+ case DIE_KDEBUG_ENTER:
+ /* Should lack of heartbeat be ignored by other partitions? */
+ if (!xpc_kdebug_ignore)
+ break;
+
+ fallthrough;
+ case DIE_MCA_MONARCH_ENTER:
+ case DIE_INIT_MONARCH_ENTER:
+ xpc_arch_ops.offline_heartbeat();
+ break;
+
+ case DIE_KDEBUG_LEAVE:
+ /* Is lack of heartbeat being ignored by other partitions? */
+ if (!xpc_kdebug_ignore)
+ break;
+
+ fallthrough;
+ case DIE_MCA_MONARCH_LEAVE:
+ case DIE_INIT_MONARCH_LEAVE:
+ xpc_arch_ops.online_heartbeat();
+ break;
+ }
+#else
+ struct die_args *die_args = _die_args;
+
+ switch (event) {
+ case DIE_TRAP:
+ if (die_args->trapnr == X86_TRAP_DF)
+ xpc_die_deactivate();
+
+ if (((die_args->trapnr == X86_TRAP_MF) ||
+ (die_args->trapnr == X86_TRAP_XF)) &&
+ !user_mode(die_args->regs))
+ xpc_die_deactivate();
+
+ break;
+ case DIE_INT3:
+ case DIE_DEBUG:
+ break;
+ case DIE_OOPS:
+ case DIE_GPF:
+ default:
+ xpc_die_deactivate();
+ }
+#endif
+
+ return NOTIFY_DONE;
+}
+
+static int __init
+xpc_init(void)
+{
+ int ret;
+ struct task_struct *kthread;
+
+ dev_set_name(xpc_part, "part");
+ dev_set_name(xpc_chan, "chan");
+
+ if (is_uv_system()) {
+ ret = xpc_init_uv();
+
+ } else {
+ ret = -ENODEV;
+ }
+
+ if (ret != 0)
+ return ret;
+
+ ret = xpc_setup_partitions();
+ if (ret != 0) {
+ dev_err(xpc_part, "can't get memory for partition structure\n");
+ goto out_1;
+ }
+
+ xpc_sysctl = register_sysctl_table(xpc_sys_dir);
+
+ /*
+ * Fill the partition reserved page with the information needed by
+ * other partitions to discover we are alive and establish initial
+ * communications.
+ */
+ ret = xpc_setup_rsvd_page();
+ if (ret != 0) {
+ dev_err(xpc_part, "can't setup our reserved page\n");
+ goto out_2;
+ }
+
+ /* add ourselves to the reboot_notifier_list */
+ ret = register_reboot_notifier(&xpc_reboot_notifier);
+ if (ret != 0)
+ dev_warn(xpc_part, "can't register reboot notifier\n");
+
+ /* add ourselves to the die_notifier list */
+ ret = register_die_notifier(&xpc_die_notifier);
+ if (ret != 0)
+ dev_warn(xpc_part, "can't register die notifier\n");
+
+ /*
+ * The real work-horse behind xpc. This processes incoming
+ * interrupts and monitors remote heartbeats.
+ */
+ kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
+ if (IS_ERR(kthread)) {
+ dev_err(xpc_part, "failed while forking hb check thread\n");
+ ret = -EBUSY;
+ goto out_3;
+ }
+
+ /*
+ * Startup a thread that will attempt to discover other partitions to
+ * activate based on info provided by SAL. This new thread is short
+ * lived and will exit once discovery is complete.
+ */
+ kthread = kthread_run(xpc_initiate_discovery, NULL,
+ XPC_DISCOVERY_THREAD_NAME);
+ if (IS_ERR(kthread)) {
+ dev_err(xpc_part, "failed while forking discovery thread\n");
+
+ /* mark this new thread as a non-starter */
+ complete(&xpc_discovery_exited);
+
+ xpc_do_exit(xpUnloading);
+ return -EBUSY;
+ }
+
+ /* set the interface to point at XPC's functions */
+ xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
+ xpc_initiate_send, xpc_initiate_send_notify,
+ xpc_initiate_received, xpc_initiate_partid_to_nasids);
+
+ return 0;
+
+ /* initialization was not successful */
+out_3:
+ xpc_teardown_rsvd_page();
+
+ (void)unregister_die_notifier(&xpc_die_notifier);
+ (void)unregister_reboot_notifier(&xpc_reboot_notifier);
+out_2:
+ if (xpc_sysctl)
+ unregister_sysctl_table(xpc_sysctl);
+
+ xpc_teardown_partitions();
+out_1:
+ if (is_uv_system())
+ xpc_exit_uv();
+ return ret;
+}
+
+module_init(xpc_init);
+
+static void __exit
+xpc_exit(void)
+{
+ xpc_do_exit(xpUnloading);
+}
+
+module_exit(xpc_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
+MODULE_LICENSE("GPL");
+
+module_param(xpc_hb_interval, int, 0);
+MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
+ "heartbeat increments.");
+
+module_param(xpc_hb_check_interval, int, 0);
+MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
+ "heartbeat checks.");
+
+module_param(xpc_disengage_timelimit, int, 0);
+MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
+ "for disengage to complete.");
+
+module_param(xpc_kdebug_ignore, int, 0);
+MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
+ "other partitions when dropping into kdebug.");
diff --git a/drivers/misc/sgi-xp/xpc_partition.c b/drivers/misc/sgi-xp/xpc_partition.c
new file mode 100644
index 000000000..1999d0292
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_partition.c
@@ -0,0 +1,545 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2004-2008 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) partition support.
+ *
+ * This is the part of XPC that detects the presence/absence of
+ * other partitions. It provides a heartbeat and monitors the
+ * heartbeats of other partitions.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/hardirq.h>
+#include <linux/slab.h>
+#include "xpc.h"
+#include <asm/uv/uv_hub.h>
+
+/* XPC is exiting flag */
+int xpc_exiting;
+
+/* this partition's reserved page pointers */
+struct xpc_rsvd_page *xpc_rsvd_page;
+static unsigned long *xpc_part_nasids;
+unsigned long *xpc_mach_nasids;
+
+static int xpc_nasid_mask_nbytes; /* #of bytes in nasid mask */
+int xpc_nasid_mask_nlongs; /* #of longs in nasid mask */
+
+struct xpc_partition *xpc_partitions;
+
+/*
+ * Guarantee that the kmalloc'd memory is cacheline aligned.
+ */
+void *
+xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+ /* see if kmalloc will give us cachline aligned memory by default */
+ *base = kmalloc(size, flags);
+ if (*base == NULL)
+ return NULL;
+
+ if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
+ return *base;
+
+ kfree(*base);
+
+ /* nope, we'll have to do it ourselves */
+ *base = kmalloc(size + L1_CACHE_BYTES, flags);
+ if (*base == NULL)
+ return NULL;
+
+ return (void *)L1_CACHE_ALIGN((u64)*base);
+}
+
+/*
+ * Given a nasid, get the physical address of the partition's reserved page
+ * for that nasid. This function returns 0 on any error.
+ */
+static unsigned long
+xpc_get_rsvd_page_pa(int nasid)
+{
+ enum xp_retval ret;
+ u64 cookie = 0;
+ unsigned long rp_pa = nasid; /* seed with nasid */
+ size_t len = 0;
+ size_t buf_len = 0;
+ void *buf = NULL;
+ void *buf_base = NULL;
+ enum xp_retval (*get_partition_rsvd_page_pa)
+ (void *, u64 *, unsigned long *, size_t *) =
+ xpc_arch_ops.get_partition_rsvd_page_pa;
+
+ while (1) {
+
+ /* !!! rp_pa will need to be _gpa on UV.
+ * ??? So do we save it into the architecture specific parts
+ * ??? of the xpc_partition structure? Do we rename this
+ * ??? function or have two versions? Rename rp_pa for UV to
+ * ??? rp_gpa?
+ */
+ ret = get_partition_rsvd_page_pa(buf, &cookie, &rp_pa, &len);
+
+ dev_dbg(xpc_part, "SAL returned with ret=%d, cookie=0x%016lx, "
+ "address=0x%016lx, len=0x%016lx\n", ret,
+ (unsigned long)cookie, rp_pa, len);
+
+ if (ret != xpNeedMoreInfo)
+ break;
+
+ if (len > buf_len) {
+ kfree(buf_base);
+ buf_len = L1_CACHE_ALIGN(len);
+ buf = xpc_kmalloc_cacheline_aligned(buf_len, GFP_KERNEL,
+ &buf_base);
+ if (buf_base == NULL) {
+ dev_err(xpc_part, "unable to kmalloc "
+ "len=0x%016lx\n", buf_len);
+ ret = xpNoMemory;
+ break;
+ }
+ }
+
+ ret = xp_remote_memcpy(xp_pa(buf), rp_pa, len);
+ if (ret != xpSuccess) {
+ dev_dbg(xpc_part, "xp_remote_memcpy failed %d\n", ret);
+ break;
+ }
+ }
+
+ kfree(buf_base);
+
+ if (ret != xpSuccess)
+ rp_pa = 0;
+
+ dev_dbg(xpc_part, "reserved page at phys address 0x%016lx\n", rp_pa);
+ return rp_pa;
+}
+
+/*
+ * Fill the partition reserved page with the information needed by
+ * other partitions to discover we are alive and establish initial
+ * communications.
+ */
+int
+xpc_setup_rsvd_page(void)
+{
+ int ret;
+ struct xpc_rsvd_page *rp;
+ unsigned long rp_pa;
+ unsigned long new_ts_jiffies;
+
+ /* get the local reserved page's address */
+
+ preempt_disable();
+ rp_pa = xpc_get_rsvd_page_pa(xp_cpu_to_nasid(smp_processor_id()));
+ preempt_enable();
+ if (rp_pa == 0) {
+ dev_err(xpc_part, "SAL failed to locate the reserved page\n");
+ return -ESRCH;
+ }
+ rp = (struct xpc_rsvd_page *)__va(xp_socket_pa(rp_pa));
+
+ if (rp->SAL_version < 3) {
+ /* SAL_versions < 3 had a SAL_partid defined as a u8 */
+ rp->SAL_partid &= 0xff;
+ }
+ BUG_ON(rp->SAL_partid != xp_partition_id);
+
+ if (rp->SAL_partid < 0 || rp->SAL_partid >= xp_max_npartitions) {
+ dev_err(xpc_part, "the reserved page's partid of %d is outside "
+ "supported range (< 0 || >= %d)\n", rp->SAL_partid,
+ xp_max_npartitions);
+ return -EINVAL;
+ }
+
+ rp->version = XPC_RP_VERSION;
+ rp->max_npartitions = xp_max_npartitions;
+
+ /* establish the actual sizes of the nasid masks */
+ if (rp->SAL_version == 1) {
+ /* SAL_version 1 didn't set the nasids_size field */
+ rp->SAL_nasids_size = 128;
+ }
+ xpc_nasid_mask_nbytes = rp->SAL_nasids_size;
+ xpc_nasid_mask_nlongs = BITS_TO_LONGS(rp->SAL_nasids_size *
+ BITS_PER_BYTE);
+
+ /* setup the pointers to the various items in the reserved page */
+ xpc_part_nasids = XPC_RP_PART_NASIDS(rp);
+ xpc_mach_nasids = XPC_RP_MACH_NASIDS(rp);
+
+ ret = xpc_arch_ops.setup_rsvd_page(rp);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Set timestamp of when reserved page was setup by XPC.
+ * This signifies to the remote partition that our reserved
+ * page is initialized.
+ */
+ new_ts_jiffies = jiffies;
+ if (new_ts_jiffies == 0 || new_ts_jiffies == rp->ts_jiffies)
+ new_ts_jiffies++;
+ rp->ts_jiffies = new_ts_jiffies;
+
+ xpc_rsvd_page = rp;
+ return 0;
+}
+
+void
+xpc_teardown_rsvd_page(void)
+{
+ /* a zero timestamp indicates our rsvd page is not initialized */
+ xpc_rsvd_page->ts_jiffies = 0;
+}
+
+/*
+ * Get a copy of a portion of the remote partition's rsvd page.
+ *
+ * remote_rp points to a buffer that is cacheline aligned for BTE copies and
+ * is large enough to contain a copy of their reserved page header and
+ * part_nasids mask.
+ */
+enum xp_retval
+xpc_get_remote_rp(int nasid, unsigned long *discovered_nasids,
+ struct xpc_rsvd_page *remote_rp, unsigned long *remote_rp_pa)
+{
+ int l;
+ enum xp_retval ret;
+
+ /* get the reserved page's physical address */
+
+ *remote_rp_pa = xpc_get_rsvd_page_pa(nasid);
+ if (*remote_rp_pa == 0)
+ return xpNoRsvdPageAddr;
+
+ /* pull over the reserved page header and part_nasids mask */
+ ret = xp_remote_memcpy(xp_pa(remote_rp), *remote_rp_pa,
+ XPC_RP_HEADER_SIZE + xpc_nasid_mask_nbytes);
+ if (ret != xpSuccess)
+ return ret;
+
+ if (discovered_nasids != NULL) {
+ unsigned long *remote_part_nasids =
+ XPC_RP_PART_NASIDS(remote_rp);
+
+ for (l = 0; l < xpc_nasid_mask_nlongs; l++)
+ discovered_nasids[l] |= remote_part_nasids[l];
+ }
+
+ /* zero timestamp indicates the reserved page has not been setup */
+ if (remote_rp->ts_jiffies == 0)
+ return xpRsvdPageNotSet;
+
+ if (XPC_VERSION_MAJOR(remote_rp->version) !=
+ XPC_VERSION_MAJOR(XPC_RP_VERSION)) {
+ return xpBadVersion;
+ }
+
+ /* check that both remote and local partids are valid for each side */
+ if (remote_rp->SAL_partid < 0 ||
+ remote_rp->SAL_partid >= xp_max_npartitions ||
+ remote_rp->max_npartitions <= xp_partition_id) {
+ return xpInvalidPartid;
+ }
+
+ if (remote_rp->SAL_partid == xp_partition_id)
+ return xpLocalPartid;
+
+ return xpSuccess;
+}
+
+/*
+ * See if the other side has responded to a partition deactivate request
+ * from us. Though we requested the remote partition to deactivate with regard
+ * to us, we really only need to wait for the other side to disengage from us.
+ */
+static int __xpc_partition_disengaged(struct xpc_partition *part,
+ bool from_timer)
+{
+ short partid = XPC_PARTID(part);
+ int disengaged;
+
+ disengaged = !xpc_arch_ops.partition_engaged(partid);
+ if (part->disengage_timeout) {
+ if (!disengaged) {
+ if (time_is_after_jiffies(part->disengage_timeout)) {
+ /* timelimit hasn't been reached yet */
+ return 0;
+ }
+
+ /*
+ * Other side hasn't responded to our deactivate
+ * request in a timely fashion, so assume it's dead.
+ */
+
+ dev_info(xpc_part, "deactivate request to remote "
+ "partition %d timed out\n", partid);
+ xpc_disengage_timedout = 1;
+ xpc_arch_ops.assume_partition_disengaged(partid);
+ disengaged = 1;
+ }
+ part->disengage_timeout = 0;
+
+ /* Cancel the timer function if not called from it */
+ if (!from_timer)
+ del_timer_sync(&part->disengage_timer);
+
+ DBUG_ON(part->act_state != XPC_P_AS_DEACTIVATING &&
+ part->act_state != XPC_P_AS_INACTIVE);
+ if (part->act_state != XPC_P_AS_INACTIVE)
+ xpc_wakeup_channel_mgr(part);
+
+ xpc_arch_ops.cancel_partition_deactivation_request(part);
+ }
+ return disengaged;
+}
+
+int xpc_partition_disengaged(struct xpc_partition *part)
+{
+ return __xpc_partition_disengaged(part, false);
+}
+
+int xpc_partition_disengaged_from_timer(struct xpc_partition *part)
+{
+ return __xpc_partition_disengaged(part, true);
+}
+
+/*
+ * Mark specified partition as active.
+ */
+enum xp_retval
+xpc_mark_partition_active(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ enum xp_retval ret;
+
+ dev_dbg(xpc_part, "setting partition %d to ACTIVE\n", XPC_PARTID(part));
+
+ spin_lock_irqsave(&part->act_lock, irq_flags);
+ if (part->act_state == XPC_P_AS_ACTIVATING) {
+ part->act_state = XPC_P_AS_ACTIVE;
+ ret = xpSuccess;
+ } else {
+ DBUG_ON(part->reason == xpSuccess);
+ ret = part->reason;
+ }
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+ return ret;
+}
+
+/*
+ * Start the process of deactivating the specified partition.
+ */
+void
+xpc_deactivate_partition(const int line, struct xpc_partition *part,
+ enum xp_retval reason)
+{
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&part->act_lock, irq_flags);
+
+ if (part->act_state == XPC_P_AS_INACTIVE) {
+ XPC_SET_REASON(part, reason, line);
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+ if (reason == xpReactivating) {
+ /* we interrupt ourselves to reactivate partition */
+ xpc_arch_ops.request_partition_reactivation(part);
+ }
+ return;
+ }
+ if (part->act_state == XPC_P_AS_DEACTIVATING) {
+ if ((part->reason == xpUnloading && reason != xpUnloading) ||
+ reason == xpReactivating) {
+ XPC_SET_REASON(part, reason, line);
+ }
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+ return;
+ }
+
+ part->act_state = XPC_P_AS_DEACTIVATING;
+ XPC_SET_REASON(part, reason, line);
+
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+
+ /* ask remote partition to deactivate with regard to us */
+ xpc_arch_ops.request_partition_deactivation(part);
+
+ /* set a timelimit on the disengage phase of the deactivation request */
+ part->disengage_timeout = jiffies + (xpc_disengage_timelimit * HZ);
+ part->disengage_timer.expires = part->disengage_timeout;
+ add_timer(&part->disengage_timer);
+
+ dev_dbg(xpc_part, "bringing partition %d down, reason = %d\n",
+ XPC_PARTID(part), reason);
+
+ xpc_partition_going_down(part, reason);
+}
+
+/*
+ * Mark specified partition as inactive.
+ */
+void
+xpc_mark_partition_inactive(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+
+ dev_dbg(xpc_part, "setting partition %d to INACTIVE\n",
+ XPC_PARTID(part));
+
+ spin_lock_irqsave(&part->act_lock, irq_flags);
+ part->act_state = XPC_P_AS_INACTIVE;
+ spin_unlock_irqrestore(&part->act_lock, irq_flags);
+ part->remote_rp_pa = 0;
+}
+
+/*
+ * SAL has provided a partition and machine mask. The partition mask
+ * contains a bit for each even nasid in our partition. The machine
+ * mask contains a bit for each even nasid in the entire machine.
+ *
+ * Using those two bit arrays, we can determine which nasids are
+ * known in the machine. Each should also have a reserved page
+ * initialized if they are available for partitioning.
+ */
+void
+xpc_discovery(void)
+{
+ void *remote_rp_base;
+ struct xpc_rsvd_page *remote_rp;
+ unsigned long remote_rp_pa;
+ int region;
+ int region_size;
+ int max_regions;
+ int nasid;
+ unsigned long *discovered_nasids;
+ enum xp_retval ret;
+
+ remote_rp = xpc_kmalloc_cacheline_aligned(XPC_RP_HEADER_SIZE +
+ xpc_nasid_mask_nbytes,
+ GFP_KERNEL, &remote_rp_base);
+ if (remote_rp == NULL)
+ return;
+
+ discovered_nasids = kcalloc(xpc_nasid_mask_nlongs, sizeof(long),
+ GFP_KERNEL);
+ if (discovered_nasids == NULL) {
+ kfree(remote_rp_base);
+ return;
+ }
+
+ /*
+ * The term 'region' in this context refers to the minimum number of
+ * nodes that can comprise an access protection grouping. The access
+ * protection is in regards to memory, IOI and IPI.
+ */
+ region_size = xp_region_size;
+
+ if (is_uv_system())
+ max_regions = 256;
+ else {
+ max_regions = 64;
+
+ switch (region_size) {
+ case 128:
+ max_regions *= 2;
+ fallthrough;
+ case 64:
+ max_regions *= 2;
+ fallthrough;
+ case 32:
+ max_regions *= 2;
+ region_size = 16;
+ }
+ }
+
+ for (region = 0; region < max_regions; region++) {
+
+ if (xpc_exiting)
+ break;
+
+ dev_dbg(xpc_part, "searching region %d\n", region);
+
+ for (nasid = (region * region_size * 2);
+ nasid < ((region + 1) * region_size * 2); nasid += 2) {
+
+ if (xpc_exiting)
+ break;
+
+ dev_dbg(xpc_part, "checking nasid %d\n", nasid);
+
+ if (test_bit(nasid / 2, xpc_part_nasids)) {
+ dev_dbg(xpc_part, "PROM indicates Nasid %d is "
+ "part of the local partition; skipping "
+ "region\n", nasid);
+ break;
+ }
+
+ if (!(test_bit(nasid / 2, xpc_mach_nasids))) {
+ dev_dbg(xpc_part, "PROM indicates Nasid %d was "
+ "not on Numa-Link network at reset\n",
+ nasid);
+ continue;
+ }
+
+ if (test_bit(nasid / 2, discovered_nasids)) {
+ dev_dbg(xpc_part, "Nasid %d is part of a "
+ "partition which was previously "
+ "discovered\n", nasid);
+ continue;
+ }
+
+ /* pull over the rsvd page header & part_nasids mask */
+
+ ret = xpc_get_remote_rp(nasid, discovered_nasids,
+ remote_rp, &remote_rp_pa);
+ if (ret != xpSuccess) {
+ dev_dbg(xpc_part, "unable to get reserved page "
+ "from nasid %d, reason=%d\n", nasid,
+ ret);
+
+ if (ret == xpLocalPartid)
+ break;
+
+ continue;
+ }
+
+ xpc_arch_ops.request_partition_activation(remote_rp,
+ remote_rp_pa, nasid);
+ }
+ }
+
+ kfree(discovered_nasids);
+ kfree(remote_rp_base);
+}
+
+/*
+ * Given a partid, get the nasids owned by that partition from the
+ * remote partition's reserved page.
+ */
+enum xp_retval
+xpc_initiate_partid_to_nasids(short partid, void *nasid_mask)
+{
+ struct xpc_partition *part;
+ unsigned long part_nasid_pa;
+
+ part = &xpc_partitions[partid];
+ if (part->remote_rp_pa == 0)
+ return xpPartitionDown;
+
+ memset(nasid_mask, 0, xpc_nasid_mask_nbytes);
+
+ part_nasid_pa = (unsigned long)XPC_RP_PART_NASIDS(part->remote_rp_pa);
+
+ return xp_remote_memcpy(xp_pa(nasid_mask), part_nasid_pa,
+ xpc_nasid_mask_nbytes);
+}
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
new file mode 100644
index 000000000..fff522d34
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -0,0 +1,1817 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
+ */
+
+/*
+ * Cross Partition Communication (XPC) uv-based functions.
+ *
+ * Architecture specific implementation of common functions.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/numa.h>
+#include <asm/uv/uv_hub.h>
+#if defined CONFIG_X86_64
+#include <asm/uv/bios.h>
+#include <asm/uv/uv_irq.h>
+#elif defined CONFIG_IA64_SGI_UV
+#include <asm/sn/intr.h>
+#include <asm/sn/sn_sal.h>
+#endif
+#include "../sgi-gru/gru.h"
+#include "../sgi-gru/grukservices.h"
+#include "xpc.h"
+
+#if defined CONFIG_IA64_SGI_UV
+struct uv_IO_APIC_route_entry {
+ __u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode : 1,
+ delivery_status : 1,
+ polarity : 1,
+ __reserved_1 : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15,
+ dest : 32;
+};
+
+#define sn_partition_id 0
+#endif
+
+static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
+
+#define XPC_ACTIVATE_MSG_SIZE_UV (1 * GRU_CACHE_LINE_BYTES)
+#define XPC_ACTIVATE_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \
+ XPC_ACTIVATE_MSG_SIZE_UV)
+#define XPC_ACTIVATE_IRQ_NAME "xpc_activate"
+
+#define XPC_NOTIFY_MSG_SIZE_UV (2 * GRU_CACHE_LINE_BYTES)
+#define XPC_NOTIFY_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \
+ XPC_NOTIFY_MSG_SIZE_UV)
+#define XPC_NOTIFY_IRQ_NAME "xpc_notify"
+
+static int xpc_mq_node = NUMA_NO_NODE;
+
+static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
+static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
+
+static int
+xpc_setup_partitions_uv(void)
+{
+ short partid;
+ struct xpc_partition_uv *part_uv;
+
+ for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+ part_uv = &xpc_partitions[partid].sn.uv;
+
+ mutex_init(&part_uv->cached_activate_gru_mq_desc_mutex);
+ spin_lock_init(&part_uv->flags_lock);
+ part_uv->remote_act_state = XPC_P_AS_INACTIVE;
+ }
+ return 0;
+}
+
+static void
+xpc_teardown_partitions_uv(void)
+{
+ short partid;
+ struct xpc_partition_uv *part_uv;
+ unsigned long irq_flags;
+
+ for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+ part_uv = &xpc_partitions[partid].sn.uv;
+
+ if (part_uv->cached_activate_gru_mq_desc != NULL) {
+ mutex_lock(&part_uv->cached_activate_gru_mq_desc_mutex);
+ spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+ part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV;
+ spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+ kfree(part_uv->cached_activate_gru_mq_desc);
+ part_uv->cached_activate_gru_mq_desc = NULL;
+ mutex_unlock(&part_uv->
+ cached_activate_gru_mq_desc_mutex);
+ }
+ }
+}
+
+static int
+xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
+{
+ int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+
+#if defined CONFIG_X86_64
+ mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset,
+ UV_AFFINITY_CPU);
+ if (mq->irq < 0)
+ return mq->irq;
+
+ mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset);
+
+#elif defined CONFIG_IA64_SGI_UV
+ if (strcmp(irq_name, XPC_ACTIVATE_IRQ_NAME) == 0)
+ mq->irq = SGI_XPC_ACTIVATE;
+ else if (strcmp(irq_name, XPC_NOTIFY_IRQ_NAME) == 0)
+ mq->irq = SGI_XPC_NOTIFY;
+ else
+ return -EINVAL;
+
+ mq->mmr_value = (unsigned long)cpu_physical_id(cpu) << 32 | mq->irq;
+ uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mq->mmr_value);
+#else
+ #error not a supported configuration
+#endif
+
+ return 0;
+}
+
+static void
+xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq)
+{
+#if defined CONFIG_X86_64
+ uv_teardown_irq(mq->irq);
+
+#elif defined CONFIG_IA64_SGI_UV
+ int mmr_pnode;
+ unsigned long mmr_value;
+
+ mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+ mmr_value = 1UL << 16;
+
+ uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mmr_value);
+#else
+ #error not a supported configuration
+#endif
+}
+
+static int
+xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq)
+{
+ int ret;
+
+#if defined CONFIG_IA64_SGI_UV
+ int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+
+ ret = sn_mq_watchlist_alloc(mmr_pnode, (void *)uv_gpa(mq->address),
+ mq->order, &mq->mmr_offset);
+ if (ret < 0) {
+ dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n",
+ ret);
+ return -EBUSY;
+ }
+#elif defined CONFIG_X86_64
+ ret = uv_bios_mq_watchlist_alloc(uv_gpa(mq->address),
+ mq->order, &mq->mmr_offset);
+ if (ret < 0) {
+ dev_err(xpc_part, "uv_bios_mq_watchlist_alloc() failed, "
+ "ret=%d\n", ret);
+ return ret;
+ }
+#else
+ #error not a supported configuration
+#endif
+
+ mq->watchlist_num = ret;
+ return 0;
+}
+
+static void
+xpc_gru_mq_watchlist_free_uv(struct xpc_gru_mq_uv *mq)
+{
+ int ret;
+ int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
+
+#if defined CONFIG_X86_64
+ ret = uv_bios_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
+ BUG_ON(ret != BIOS_STATUS_SUCCESS);
+#elif defined CONFIG_IA64_SGI_UV
+ ret = sn_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
+ BUG_ON(ret != SALRET_OK);
+#else
+ #error not a supported configuration
+#endif
+}
+
+static struct xpc_gru_mq_uv *
+xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
+ irq_handler_t irq_handler)
+{
+ enum xp_retval xp_ret;
+ int ret;
+ int nid;
+ int nasid;
+ int pg_order;
+ struct page *page;
+ struct xpc_gru_mq_uv *mq;
+ struct uv_IO_APIC_route_entry *mmr_value;
+
+ mq = kmalloc(sizeof(struct xpc_gru_mq_uv), GFP_KERNEL);
+ if (mq == NULL) {
+ dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to kmalloc() "
+ "a xpc_gru_mq_uv structure\n");
+ ret = -ENOMEM;
+ goto out_0;
+ }
+
+ mq->gru_mq_desc = kzalloc(sizeof(struct gru_message_queue_desc),
+ GFP_KERNEL);
+ if (mq->gru_mq_desc == NULL) {
+ dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to kmalloc() "
+ "a gru_message_queue_desc structure\n");
+ ret = -ENOMEM;
+ goto out_1;
+ }
+
+ pg_order = get_order(mq_size);
+ mq->order = pg_order + PAGE_SHIFT;
+ mq_size = 1UL << mq->order;
+
+ mq->mmr_blade = uv_cpu_to_blade_id(cpu);
+
+ nid = cpu_to_node(cpu);
+ page = __alloc_pages_node(nid,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+ pg_order);
+ if (page == NULL) {
+ dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d "
+ "bytes of memory on nid=%d for GRU mq\n", mq_size, nid);
+ ret = -ENOMEM;
+ goto out_2;
+ }
+ mq->address = page_address(page);
+
+ /* enable generation of irq when GRU mq operation occurs to this mq */
+ ret = xpc_gru_mq_watchlist_alloc_uv(mq);
+ if (ret != 0)
+ goto out_3;
+
+ ret = xpc_get_gru_mq_irq_uv(mq, cpu, irq_name);
+ if (ret != 0)
+ goto out_4;
+
+ ret = request_irq(mq->irq, irq_handler, 0, irq_name, NULL);
+ if (ret != 0) {
+ dev_err(xpc_part, "request_irq(irq=%d) returned error=%d\n",
+ mq->irq, -ret);
+ goto out_5;
+ }
+
+ nasid = UV_PNODE_TO_NASID(uv_cpu_to_pnode(cpu));
+
+ mmr_value = (struct uv_IO_APIC_route_entry *)&mq->mmr_value;
+ ret = gru_create_message_queue(mq->gru_mq_desc, mq->address, mq_size,
+ nasid, mmr_value->vector, mmr_value->dest);
+ if (ret != 0) {
+ dev_err(xpc_part, "gru_create_message_queue() returned "
+ "error=%d\n", ret);
+ ret = -EINVAL;
+ goto out_6;
+ }
+
+ /* allow other partitions to access this GRU mq */
+ xp_ret = xp_expand_memprotect(xp_pa(mq->address), mq_size);
+ if (xp_ret != xpSuccess) {
+ ret = -EACCES;
+ goto out_6;
+ }
+
+ return mq;
+
+ /* something went wrong */
+out_6:
+ free_irq(mq->irq, NULL);
+out_5:
+ xpc_release_gru_mq_irq_uv(mq);
+out_4:
+ xpc_gru_mq_watchlist_free_uv(mq);
+out_3:
+ free_pages((unsigned long)mq->address, pg_order);
+out_2:
+ kfree(mq->gru_mq_desc);
+out_1:
+ kfree(mq);
+out_0:
+ return ERR_PTR(ret);
+}
+
+static void
+xpc_destroy_gru_mq_uv(struct xpc_gru_mq_uv *mq)
+{
+ unsigned int mq_size;
+ int pg_order;
+ int ret;
+
+ /* disallow other partitions to access GRU mq */
+ mq_size = 1UL << mq->order;
+ ret = xp_restrict_memprotect(xp_pa(mq->address), mq_size);
+ BUG_ON(ret != xpSuccess);
+
+ /* unregister irq handler and release mq irq/vector mapping */
+ free_irq(mq->irq, NULL);
+ xpc_release_gru_mq_irq_uv(mq);
+
+ /* disable generation of irq when GRU mq op occurs to this mq */
+ xpc_gru_mq_watchlist_free_uv(mq);
+
+ pg_order = mq->order - PAGE_SHIFT;
+ free_pages((unsigned long)mq->address, pg_order);
+
+ kfree(mq);
+}
+
+static enum xp_retval
+xpc_send_gru_msg(struct gru_message_queue_desc *gru_mq_desc, void *msg,
+ size_t msg_size)
+{
+ enum xp_retval xp_ret;
+ int ret;
+
+ while (1) {
+ ret = gru_send_message_gpa(gru_mq_desc, msg, msg_size);
+ if (ret == MQE_OK) {
+ xp_ret = xpSuccess;
+ break;
+ }
+
+ if (ret == MQE_QUEUE_FULL) {
+ dev_dbg(xpc_chan, "gru_send_message_gpa() returned "
+ "error=MQE_QUEUE_FULL\n");
+ /* !!! handle QLimit reached; delay & try again */
+ /* ??? Do we add a limit to the number of retries? */
+ (void)msleep_interruptible(10);
+ } else if (ret == MQE_CONGESTION) {
+ dev_dbg(xpc_chan, "gru_send_message_gpa() returned "
+ "error=MQE_CONGESTION\n");
+ /* !!! handle LB Overflow; simply try again */
+ /* ??? Do we add a limit to the number of retries? */
+ } else {
+ /* !!! Currently this is MQE_UNEXPECTED_CB_ERR */
+ dev_err(xpc_chan, "gru_send_message_gpa() returned "
+ "error=%d\n", ret);
+ xp_ret = xpGruSendMqError;
+ break;
+ }
+ }
+ return xp_ret;
+}
+
+static void
+xpc_process_activate_IRQ_rcvd_uv(void)
+{
+ unsigned long irq_flags;
+ short partid;
+ struct xpc_partition *part;
+ u8 act_state_req;
+
+ DBUG_ON(xpc_activate_IRQ_rcvd == 0);
+
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+ part = &xpc_partitions[partid];
+
+ if (part->sn.uv.act_state_req == 0)
+ continue;
+
+ xpc_activate_IRQ_rcvd--;
+ BUG_ON(xpc_activate_IRQ_rcvd < 0);
+
+ act_state_req = part->sn.uv.act_state_req;
+ part->sn.uv.act_state_req = 0;
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ if (act_state_req == XPC_P_ASR_ACTIVATE_UV) {
+ if (part->act_state == XPC_P_AS_INACTIVE)
+ xpc_activate_partition(part);
+ else if (part->act_state == XPC_P_AS_DEACTIVATING)
+ XPC_DEACTIVATE_PARTITION(part, xpReactivating);
+
+ } else if (act_state_req == XPC_P_ASR_REACTIVATE_UV) {
+ if (part->act_state == XPC_P_AS_INACTIVE)
+ xpc_activate_partition(part);
+ else
+ XPC_DEACTIVATE_PARTITION(part, xpReactivating);
+
+ } else if (act_state_req == XPC_P_ASR_DEACTIVATE_UV) {
+ XPC_DEACTIVATE_PARTITION(part, part->sn.uv.reason);
+
+ } else {
+ BUG();
+ }
+
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (xpc_activate_IRQ_rcvd == 0)
+ break;
+ }
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+}
+
+static void
+xpc_handle_activate_mq_msg_uv(struct xpc_partition *part,
+ struct xpc_activate_mq_msghdr_uv *msg_hdr,
+ int part_setup,
+ int *wakeup_hb_checker)
+{
+ unsigned long irq_flags;
+ struct xpc_partition_uv *part_uv = &part->sn.uv;
+ struct xpc_openclose_args *args;
+
+ part_uv->remote_act_state = msg_hdr->act_state;
+
+ switch (msg_hdr->type) {
+ case XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV:
+ /* syncing of remote_act_state was just done above */
+ break;
+
+ case XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV: {
+ struct xpc_activate_mq_msg_activate_req_uv *msg;
+
+ /*
+ * ??? Do we deal here with ts_jiffies being different
+ * ??? if act_state != XPC_P_AS_INACTIVE instead of
+ * ??? below?
+ */
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_activate_req_uv, hdr);
+
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (part_uv->act_state_req == 0)
+ xpc_activate_IRQ_rcvd++;
+ part_uv->act_state_req = XPC_P_ASR_ACTIVATE_UV;
+ part->remote_rp_pa = msg->rp_gpa; /* !!! _pa is _gpa */
+ part->remote_rp_ts_jiffies = msg_hdr->rp_ts_jiffies;
+ part_uv->heartbeat_gpa = msg->heartbeat_gpa;
+
+ if (msg->activate_gru_mq_desc_gpa !=
+ part_uv->activate_gru_mq_desc_gpa) {
+ spin_lock(&part_uv->flags_lock);
+ part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV;
+ spin_unlock(&part_uv->flags_lock);
+ part_uv->activate_gru_mq_desc_gpa =
+ msg->activate_gru_mq_desc_gpa;
+ }
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ (*wakeup_hb_checker)++;
+ break;
+ }
+ case XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV: {
+ struct xpc_activate_mq_msg_deactivate_req_uv *msg;
+
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_deactivate_req_uv, hdr);
+
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (part_uv->act_state_req == 0)
+ xpc_activate_IRQ_rcvd++;
+ part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
+ part_uv->reason = msg->reason;
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ (*wakeup_hb_checker)++;
+ return;
+ }
+ case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV: {
+ struct xpc_activate_mq_msg_chctl_closerequest_uv *msg;
+
+ if (!part_setup)
+ break;
+
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_chctl_closerequest_uv,
+ hdr);
+ args = &part->remote_openclose_args[msg->ch_number];
+ args->reason = msg->reason;
+
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREQUEST;
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+ xpc_wakeup_channel_mgr(part);
+ break;
+ }
+ case XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV: {
+ struct xpc_activate_mq_msg_chctl_closereply_uv *msg;
+
+ if (!part_setup)
+ break;
+
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_chctl_closereply_uv,
+ hdr);
+
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ part->chctl.flags[msg->ch_number] |= XPC_CHCTL_CLOSEREPLY;
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+ xpc_wakeup_channel_mgr(part);
+ break;
+ }
+ case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV: {
+ struct xpc_activate_mq_msg_chctl_openrequest_uv *msg;
+
+ if (!part_setup)
+ break;
+
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_chctl_openrequest_uv,
+ hdr);
+ args = &part->remote_openclose_args[msg->ch_number];
+ args->entry_size = msg->entry_size;
+ args->local_nentries = msg->local_nentries;
+
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREQUEST;
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+ xpc_wakeup_channel_mgr(part);
+ break;
+ }
+ case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV: {
+ struct xpc_activate_mq_msg_chctl_openreply_uv *msg;
+
+ if (!part_setup)
+ break;
+
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_chctl_openreply_uv, hdr);
+ args = &part->remote_openclose_args[msg->ch_number];
+ args->remote_nentries = msg->remote_nentries;
+ args->local_nentries = msg->local_nentries;
+ args->local_msgqueue_pa = msg->notify_gru_mq_desc_gpa;
+
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENREPLY;
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+ xpc_wakeup_channel_mgr(part);
+ break;
+ }
+ case XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV: {
+ struct xpc_activate_mq_msg_chctl_opencomplete_uv *msg;
+
+ if (!part_setup)
+ break;
+
+ msg = container_of(msg_hdr, struct
+ xpc_activate_mq_msg_chctl_opencomplete_uv, hdr);
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ part->chctl.flags[msg->ch_number] |= XPC_CHCTL_OPENCOMPLETE;
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+ xpc_wakeup_channel_mgr(part);
+ }
+ fallthrough;
+ case XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV:
+ spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+ part_uv->flags |= XPC_P_ENGAGED_UV;
+ spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+ break;
+
+ case XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV:
+ spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+ part_uv->flags &= ~XPC_P_ENGAGED_UV;
+ spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+ break;
+
+ default:
+ dev_err(xpc_part, "received unknown activate_mq msg type=%d "
+ "from partition=%d\n", msg_hdr->type, XPC_PARTID(part));
+
+ /* get hb checker to deactivate from the remote partition */
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (part_uv->act_state_req == 0)
+ xpc_activate_IRQ_rcvd++;
+ part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
+ part_uv->reason = xpBadMsgType;
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ (*wakeup_hb_checker)++;
+ return;
+ }
+
+ if (msg_hdr->rp_ts_jiffies != part->remote_rp_ts_jiffies &&
+ part->remote_rp_ts_jiffies != 0) {
+ /*
+ * ??? Does what we do here need to be sensitive to
+ * ??? act_state or remote_act_state?
+ */
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (part_uv->act_state_req == 0)
+ xpc_activate_IRQ_rcvd++;
+ part_uv->act_state_req = XPC_P_ASR_REACTIVATE_UV;
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ (*wakeup_hb_checker)++;
+ }
+}
+
+static irqreturn_t
+xpc_handle_activate_IRQ_uv(int irq, void *dev_id)
+{
+ struct xpc_activate_mq_msghdr_uv *msg_hdr;
+ short partid;
+ struct xpc_partition *part;
+ int wakeup_hb_checker = 0;
+ int part_referenced;
+
+ while (1) {
+ msg_hdr = gru_get_next_message(xpc_activate_mq_uv->gru_mq_desc);
+ if (msg_hdr == NULL)
+ break;
+
+ partid = msg_hdr->partid;
+ if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) {
+ dev_err(xpc_part, "xpc_handle_activate_IRQ_uv() "
+ "received invalid partid=0x%x in message\n",
+ partid);
+ } else {
+ part = &xpc_partitions[partid];
+
+ part_referenced = xpc_part_ref(part);
+ xpc_handle_activate_mq_msg_uv(part, msg_hdr,
+ part_referenced,
+ &wakeup_hb_checker);
+ if (part_referenced)
+ xpc_part_deref(part);
+ }
+
+ gru_free_message(xpc_activate_mq_uv->gru_mq_desc, msg_hdr);
+ }
+
+ if (wakeup_hb_checker)
+ wake_up_interruptible(&xpc_activate_IRQ_wq);
+
+ return IRQ_HANDLED;
+}
+
+static enum xp_retval
+xpc_cache_remote_gru_mq_desc_uv(struct gru_message_queue_desc *gru_mq_desc,
+ unsigned long gru_mq_desc_gpa)
+{
+ enum xp_retval ret;
+
+ ret = xp_remote_memcpy(uv_gpa(gru_mq_desc), gru_mq_desc_gpa,
+ sizeof(struct gru_message_queue_desc));
+ if (ret == xpSuccess)
+ gru_mq_desc->mq = NULL;
+
+ return ret;
+}
+
+static enum xp_retval
+xpc_send_activate_IRQ_uv(struct xpc_partition *part, void *msg, size_t msg_size,
+ int msg_type)
+{
+ struct xpc_activate_mq_msghdr_uv *msg_hdr = msg;
+ struct xpc_partition_uv *part_uv = &part->sn.uv;
+ struct gru_message_queue_desc *gru_mq_desc;
+ unsigned long irq_flags;
+ enum xp_retval ret;
+
+ DBUG_ON(msg_size > XPC_ACTIVATE_MSG_SIZE_UV);
+
+ msg_hdr->type = msg_type;
+ msg_hdr->partid = xp_partition_id;
+ msg_hdr->act_state = part->act_state;
+ msg_hdr->rp_ts_jiffies = xpc_rsvd_page->ts_jiffies;
+
+ mutex_lock(&part_uv->cached_activate_gru_mq_desc_mutex);
+again:
+ if (!(part_uv->flags & XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV)) {
+ gru_mq_desc = part_uv->cached_activate_gru_mq_desc;
+ if (gru_mq_desc == NULL) {
+ gru_mq_desc = kmalloc(sizeof(struct
+ gru_message_queue_desc),
+ GFP_ATOMIC);
+ if (gru_mq_desc == NULL) {
+ ret = xpNoMemory;
+ goto done;
+ }
+ part_uv->cached_activate_gru_mq_desc = gru_mq_desc;
+ }
+
+ ret = xpc_cache_remote_gru_mq_desc_uv(gru_mq_desc,
+ part_uv->
+ activate_gru_mq_desc_gpa);
+ if (ret != xpSuccess)
+ goto done;
+
+ spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+ part_uv->flags |= XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV;
+ spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+ }
+
+ /* ??? Is holding a spin_lock (ch->lock) during this call a bad idea? */
+ ret = xpc_send_gru_msg(part_uv->cached_activate_gru_mq_desc, msg,
+ msg_size);
+ if (ret != xpSuccess) {
+ smp_rmb(); /* ensure a fresh copy of part_uv->flags */
+ if (!(part_uv->flags & XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV))
+ goto again;
+ }
+done:
+ mutex_unlock(&part_uv->cached_activate_gru_mq_desc_mutex);
+ return ret;
+}
+
+static void
+xpc_send_activate_IRQ_part_uv(struct xpc_partition *part, void *msg,
+ size_t msg_size, int msg_type)
+{
+ enum xp_retval ret;
+
+ ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type);
+ if (unlikely(ret != xpSuccess))
+ XPC_DEACTIVATE_PARTITION(part, ret);
+}
+
+static void
+xpc_send_activate_IRQ_ch_uv(struct xpc_channel *ch, unsigned long *irq_flags,
+ void *msg, size_t msg_size, int msg_type)
+{
+ struct xpc_partition *part = &xpc_partitions[ch->partid];
+ enum xp_retval ret;
+
+ ret = xpc_send_activate_IRQ_uv(part, msg, msg_size, msg_type);
+ if (unlikely(ret != xpSuccess)) {
+ if (irq_flags != NULL)
+ spin_unlock_irqrestore(&ch->lock, *irq_flags);
+
+ XPC_DEACTIVATE_PARTITION(part, ret);
+
+ if (irq_flags != NULL)
+ spin_lock_irqsave(&ch->lock, *irq_flags);
+ }
+}
+
+static void
+xpc_send_local_activate_IRQ_uv(struct xpc_partition *part, int act_state_req)
+{
+ unsigned long irq_flags;
+ struct xpc_partition_uv *part_uv = &part->sn.uv;
+
+ /*
+ * !!! Make our side think that the remote partition sent an activate
+ * !!! mq message our way by doing what the activate IRQ handler would
+ * !!! do had one really been sent.
+ */
+
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (part_uv->act_state_req == 0)
+ xpc_activate_IRQ_rcvd++;
+ part_uv->act_state_req = act_state_req;
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ wake_up_interruptible(&xpc_activate_IRQ_wq);
+}
+
+static enum xp_retval
+xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
+ size_t *len)
+{
+ s64 status;
+ enum xp_retval ret;
+
+#if defined CONFIG_X86_64
+ status = uv_bios_reserved_page_pa((u64)buf, cookie, (u64 *)rp_pa,
+ (u64 *)len);
+ if (status == BIOS_STATUS_SUCCESS)
+ ret = xpSuccess;
+ else if (status == BIOS_STATUS_MORE_PASSES)
+ ret = xpNeedMoreInfo;
+ else
+ ret = xpBiosError;
+
+#elif defined CONFIG_IA64_SGI_UV
+ status = sn_partition_reserved_page_pa((u64)buf, cookie, rp_pa, len);
+ if (status == SALRET_OK)
+ ret = xpSuccess;
+ else if (status == SALRET_MORE_PASSES)
+ ret = xpNeedMoreInfo;
+ else
+ ret = xpSalError;
+
+#else
+ #error not a supported configuration
+#endif
+
+ return ret;
+}
+
+static int
+xpc_setup_rsvd_page_uv(struct xpc_rsvd_page *rp)
+{
+ xpc_heartbeat_uv =
+ &xpc_partitions[sn_partition_id].sn.uv.cached_heartbeat;
+ rp->sn.uv.heartbeat_gpa = uv_gpa(xpc_heartbeat_uv);
+ rp->sn.uv.activate_gru_mq_desc_gpa =
+ uv_gpa(xpc_activate_mq_uv->gru_mq_desc);
+ return 0;
+}
+
+static void
+xpc_allow_hb_uv(short partid)
+{
+}
+
+static void
+xpc_disallow_hb_uv(short partid)
+{
+}
+
+static void
+xpc_disallow_all_hbs_uv(void)
+{
+}
+
+static void
+xpc_increment_heartbeat_uv(void)
+{
+ xpc_heartbeat_uv->value++;
+}
+
+static void
+xpc_offline_heartbeat_uv(void)
+{
+ xpc_increment_heartbeat_uv();
+ xpc_heartbeat_uv->offline = 1;
+}
+
+static void
+xpc_online_heartbeat_uv(void)
+{
+ xpc_increment_heartbeat_uv();
+ xpc_heartbeat_uv->offline = 0;
+}
+
+static void
+xpc_heartbeat_init_uv(void)
+{
+ xpc_heartbeat_uv->value = 1;
+ xpc_heartbeat_uv->offline = 0;
+}
+
+static void
+xpc_heartbeat_exit_uv(void)
+{
+ xpc_offline_heartbeat_uv();
+}
+
+static enum xp_retval
+xpc_get_remote_heartbeat_uv(struct xpc_partition *part)
+{
+ struct xpc_partition_uv *part_uv = &part->sn.uv;
+ enum xp_retval ret;
+
+ ret = xp_remote_memcpy(uv_gpa(&part_uv->cached_heartbeat),
+ part_uv->heartbeat_gpa,
+ sizeof(struct xpc_heartbeat_uv));
+ if (ret != xpSuccess)
+ return ret;
+
+ if (part_uv->cached_heartbeat.value == part->last_heartbeat &&
+ !part_uv->cached_heartbeat.offline) {
+
+ ret = xpNoHeartbeat;
+ } else {
+ part->last_heartbeat = part_uv->cached_heartbeat.value;
+ }
+ return ret;
+}
+
+static void
+xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp,
+ unsigned long remote_rp_gpa, int nasid)
+{
+ short partid = remote_rp->SAL_partid;
+ struct xpc_partition *part = &xpc_partitions[partid];
+ struct xpc_activate_mq_msg_activate_req_uv msg;
+
+ part->remote_rp_pa = remote_rp_gpa; /* !!! _pa here is really _gpa */
+ part->remote_rp_ts_jiffies = remote_rp->ts_jiffies;
+ part->sn.uv.heartbeat_gpa = remote_rp->sn.uv.heartbeat_gpa;
+ part->sn.uv.activate_gru_mq_desc_gpa =
+ remote_rp->sn.uv.activate_gru_mq_desc_gpa;
+
+ /*
+ * ??? Is it a good idea to make this conditional on what is
+ * ??? potentially stale state information?
+ */
+ if (part->sn.uv.remote_act_state == XPC_P_AS_INACTIVE) {
+ msg.rp_gpa = uv_gpa(xpc_rsvd_page);
+ msg.heartbeat_gpa = xpc_rsvd_page->sn.uv.heartbeat_gpa;
+ msg.activate_gru_mq_desc_gpa =
+ xpc_rsvd_page->sn.uv.activate_gru_mq_desc_gpa;
+ xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV);
+ }
+
+ if (part->act_state == XPC_P_AS_INACTIVE)
+ xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV);
+}
+
+static void
+xpc_request_partition_reactivation_uv(struct xpc_partition *part)
+{
+ xpc_send_local_activate_IRQ_uv(part, XPC_P_ASR_ACTIVATE_UV);
+}
+
+static void
+xpc_request_partition_deactivation_uv(struct xpc_partition *part)
+{
+ struct xpc_activate_mq_msg_deactivate_req_uv msg;
+
+ /*
+ * ??? Is it a good idea to make this conditional on what is
+ * ??? potentially stale state information?
+ */
+ if (part->sn.uv.remote_act_state != XPC_P_AS_DEACTIVATING &&
+ part->sn.uv.remote_act_state != XPC_P_AS_INACTIVE) {
+
+ msg.reason = part->reason;
+ xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_DEACTIVATE_REQ_UV);
+ }
+}
+
+static void
+xpc_cancel_partition_deactivation_request_uv(struct xpc_partition *part)
+{
+ /* nothing needs to be done */
+ return;
+}
+
+static void
+xpc_init_fifo_uv(struct xpc_fifo_head_uv *head)
+{
+ head->first = NULL;
+ head->last = NULL;
+ spin_lock_init(&head->lock);
+ head->n_entries = 0;
+}
+
+static void *
+xpc_get_fifo_entry_uv(struct xpc_fifo_head_uv *head)
+{
+ unsigned long irq_flags;
+ struct xpc_fifo_entry_uv *first;
+
+ spin_lock_irqsave(&head->lock, irq_flags);
+ first = head->first;
+ if (head->first != NULL) {
+ head->first = first->next;
+ if (head->first == NULL)
+ head->last = NULL;
+
+ head->n_entries--;
+ BUG_ON(head->n_entries < 0);
+
+ first->next = NULL;
+ }
+ spin_unlock_irqrestore(&head->lock, irq_flags);
+ return first;
+}
+
+static void
+xpc_put_fifo_entry_uv(struct xpc_fifo_head_uv *head,
+ struct xpc_fifo_entry_uv *last)
+{
+ unsigned long irq_flags;
+
+ last->next = NULL;
+ spin_lock_irqsave(&head->lock, irq_flags);
+ if (head->last != NULL)
+ head->last->next = last;
+ else
+ head->first = last;
+ head->last = last;
+ head->n_entries++;
+ spin_unlock_irqrestore(&head->lock, irq_flags);
+}
+
+static int
+xpc_n_of_fifo_entries_uv(struct xpc_fifo_head_uv *head)
+{
+ return head->n_entries;
+}
+
+/*
+ * Setup the channel structures that are uv specific.
+ */
+static enum xp_retval
+xpc_setup_ch_structures_uv(struct xpc_partition *part)
+{
+ struct xpc_channel_uv *ch_uv;
+ int ch_number;
+
+ for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
+ ch_uv = &part->channels[ch_number].sn.uv;
+
+ xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
+ xpc_init_fifo_uv(&ch_uv->recv_msg_list);
+ }
+
+ return xpSuccess;
+}
+
+/*
+ * Teardown the channel structures that are uv specific.
+ */
+static void
+xpc_teardown_ch_structures_uv(struct xpc_partition *part)
+{
+ /* nothing needs to be done */
+ return;
+}
+
+static enum xp_retval
+xpc_make_first_contact_uv(struct xpc_partition *part)
+{
+ struct xpc_activate_mq_msg_uv msg;
+
+ /*
+ * We send a sync msg to get the remote partition's remote_act_state
+ * updated to our current act_state which at this point should
+ * be XPC_P_AS_ACTIVATING.
+ */
+ xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_SYNC_ACT_STATE_UV);
+
+ while (!((part->sn.uv.remote_act_state == XPC_P_AS_ACTIVATING) ||
+ (part->sn.uv.remote_act_state == XPC_P_AS_ACTIVE))) {
+
+ dev_dbg(xpc_part, "waiting to make first contact with "
+ "partition %d\n", XPC_PARTID(part));
+
+ /* wait a 1/4 of a second or so */
+ (void)msleep_interruptible(250);
+
+ if (part->act_state == XPC_P_AS_DEACTIVATING)
+ return part->reason;
+ }
+
+ return xpSuccess;
+}
+
+static u64
+xpc_get_chctl_all_flags_uv(struct xpc_partition *part)
+{
+ unsigned long irq_flags;
+ union xpc_channel_ctl_flags chctl;
+
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ chctl = part->chctl;
+ if (chctl.all_flags != 0)
+ part->chctl.all_flags = 0;
+
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+ return chctl.all_flags;
+}
+
+static enum xp_retval
+xpc_allocate_send_msg_slot_uv(struct xpc_channel *ch)
+{
+ struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+ struct xpc_send_msg_slot_uv *msg_slot;
+ unsigned long irq_flags;
+ int nentries;
+ int entry;
+ size_t nbytes;
+
+ for (nentries = ch->local_nentries; nentries > 0; nentries--) {
+ nbytes = nentries * sizeof(struct xpc_send_msg_slot_uv);
+ ch_uv->send_msg_slots = kzalloc(nbytes, GFP_KERNEL);
+ if (ch_uv->send_msg_slots == NULL)
+ continue;
+
+ for (entry = 0; entry < nentries; entry++) {
+ msg_slot = &ch_uv->send_msg_slots[entry];
+
+ msg_slot->msg_slot_number = entry;
+ xpc_put_fifo_entry_uv(&ch_uv->msg_slot_free_list,
+ &msg_slot->next);
+ }
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ if (nentries < ch->local_nentries)
+ ch->local_nentries = nentries;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return xpSuccess;
+ }
+
+ return xpNoMemory;
+}
+
+static enum xp_retval
+xpc_allocate_recv_msg_slot_uv(struct xpc_channel *ch)
+{
+ struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+ struct xpc_notify_mq_msg_uv *msg_slot;
+ unsigned long irq_flags;
+ int nentries;
+ int entry;
+ size_t nbytes;
+
+ for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
+ nbytes = nentries * ch->entry_size;
+ ch_uv->recv_msg_slots = kzalloc(nbytes, GFP_KERNEL);
+ if (ch_uv->recv_msg_slots == NULL)
+ continue;
+
+ for (entry = 0; entry < nentries; entry++) {
+ msg_slot = ch_uv->recv_msg_slots +
+ entry * ch->entry_size;
+
+ msg_slot->hdr.msg_slot_number = entry;
+ }
+
+ spin_lock_irqsave(&ch->lock, irq_flags);
+ if (nentries < ch->remote_nentries)
+ ch->remote_nentries = nentries;
+ spin_unlock_irqrestore(&ch->lock, irq_flags);
+ return xpSuccess;
+ }
+
+ return xpNoMemory;
+}
+
+/*
+ * Allocate msg_slots associated with the channel.
+ */
+static enum xp_retval
+xpc_setup_msg_structures_uv(struct xpc_channel *ch)
+{
+ static enum xp_retval ret;
+ struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+
+ DBUG_ON(ch->flags & XPC_C_SETUP);
+
+ ch_uv->cached_notify_gru_mq_desc = kmalloc(sizeof(struct
+ gru_message_queue_desc),
+ GFP_KERNEL);
+ if (ch_uv->cached_notify_gru_mq_desc == NULL)
+ return xpNoMemory;
+
+ ret = xpc_allocate_send_msg_slot_uv(ch);
+ if (ret == xpSuccess) {
+
+ ret = xpc_allocate_recv_msg_slot_uv(ch);
+ if (ret != xpSuccess) {
+ kfree(ch_uv->send_msg_slots);
+ xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
+ }
+ }
+ return ret;
+}
+
+/*
+ * Free up msg_slots and clear other stuff that were setup for the specified
+ * channel.
+ */
+static void
+xpc_teardown_msg_structures_uv(struct xpc_channel *ch)
+{
+ struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+
+ lockdep_assert_held(&ch->lock);
+
+ kfree(ch_uv->cached_notify_gru_mq_desc);
+ ch_uv->cached_notify_gru_mq_desc = NULL;
+
+ if (ch->flags & XPC_C_SETUP) {
+ xpc_init_fifo_uv(&ch_uv->msg_slot_free_list);
+ kfree(ch_uv->send_msg_slots);
+ xpc_init_fifo_uv(&ch_uv->recv_msg_list);
+ kfree(ch_uv->recv_msg_slots);
+ }
+}
+
+static void
+xpc_send_chctl_closerequest_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ struct xpc_activate_mq_msg_chctl_closerequest_uv msg;
+
+ msg.ch_number = ch->number;
+ msg.reason = ch->reason;
+ xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREQUEST_UV);
+}
+
+static void
+xpc_send_chctl_closereply_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ struct xpc_activate_mq_msg_chctl_closereply_uv msg;
+
+ msg.ch_number = ch->number;
+ xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_CHCTL_CLOSEREPLY_UV);
+}
+
+static void
+xpc_send_chctl_openrequest_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ struct xpc_activate_mq_msg_chctl_openrequest_uv msg;
+
+ msg.ch_number = ch->number;
+ msg.entry_size = ch->entry_size;
+ msg.local_nentries = ch->local_nentries;
+ xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREQUEST_UV);
+}
+
+static void
+xpc_send_chctl_openreply_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ struct xpc_activate_mq_msg_chctl_openreply_uv msg;
+
+ msg.ch_number = ch->number;
+ msg.local_nentries = ch->local_nentries;
+ msg.remote_nentries = ch->remote_nentries;
+ msg.notify_gru_mq_desc_gpa = uv_gpa(xpc_notify_mq_uv->gru_mq_desc);
+ xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_CHCTL_OPENREPLY_UV);
+}
+
+static void
+xpc_send_chctl_opencomplete_uv(struct xpc_channel *ch, unsigned long *irq_flags)
+{
+ struct xpc_activate_mq_msg_chctl_opencomplete_uv msg;
+
+ msg.ch_number = ch->number;
+ xpc_send_activate_IRQ_ch_uv(ch, irq_flags, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_CHCTL_OPENCOMPLETE_UV);
+}
+
+static void
+xpc_send_chctl_local_msgrequest_uv(struct xpc_partition *part, int ch_number)
+{
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&part->chctl_lock, irq_flags);
+ part->chctl.flags[ch_number] |= XPC_CHCTL_MSGREQUEST;
+ spin_unlock_irqrestore(&part->chctl_lock, irq_flags);
+
+ xpc_wakeup_channel_mgr(part);
+}
+
+static enum xp_retval
+xpc_save_remote_msgqueue_pa_uv(struct xpc_channel *ch,
+ unsigned long gru_mq_desc_gpa)
+{
+ struct xpc_channel_uv *ch_uv = &ch->sn.uv;
+
+ DBUG_ON(ch_uv->cached_notify_gru_mq_desc == NULL);
+ return xpc_cache_remote_gru_mq_desc_uv(ch_uv->cached_notify_gru_mq_desc,
+ gru_mq_desc_gpa);
+}
+
+static void
+xpc_indicate_partition_engaged_uv(struct xpc_partition *part)
+{
+ struct xpc_activate_mq_msg_uv msg;
+
+ xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_MARK_ENGAGED_UV);
+}
+
+static void
+xpc_indicate_partition_disengaged_uv(struct xpc_partition *part)
+{
+ struct xpc_activate_mq_msg_uv msg;
+
+ xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
+ XPC_ACTIVATE_MQ_MSG_MARK_DISENGAGED_UV);
+}
+
+static void
+xpc_assume_partition_disengaged_uv(short partid)
+{
+ struct xpc_partition_uv *part_uv = &xpc_partitions[partid].sn.uv;
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
+ part_uv->flags &= ~XPC_P_ENGAGED_UV;
+ spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
+}
+
+static int
+xpc_partition_engaged_uv(short partid)
+{
+ return (xpc_partitions[partid].sn.uv.flags & XPC_P_ENGAGED_UV) != 0;
+}
+
+static int
+xpc_any_partition_engaged_uv(void)
+{
+ struct xpc_partition_uv *part_uv;
+ short partid;
+
+ for (partid = 0; partid < XP_MAX_NPARTITIONS_UV; partid++) {
+ part_uv = &xpc_partitions[partid].sn.uv;
+ if ((part_uv->flags & XPC_P_ENGAGED_UV) != 0)
+ return 1;
+ }
+ return 0;
+}
+
+static enum xp_retval
+xpc_allocate_msg_slot_uv(struct xpc_channel *ch, u32 flags,
+ struct xpc_send_msg_slot_uv **address_of_msg_slot)
+{
+ enum xp_retval ret;
+ struct xpc_send_msg_slot_uv *msg_slot;
+ struct xpc_fifo_entry_uv *entry;
+
+ while (1) {
+ entry = xpc_get_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list);
+ if (entry != NULL)
+ break;
+
+ if (flags & XPC_NOWAIT)
+ return xpNoWait;
+
+ ret = xpc_allocate_msg_wait(ch);
+ if (ret != xpInterrupted && ret != xpTimeout)
+ return ret;
+ }
+
+ msg_slot = container_of(entry, struct xpc_send_msg_slot_uv, next);
+ *address_of_msg_slot = msg_slot;
+ return xpSuccess;
+}
+
+static void
+xpc_free_msg_slot_uv(struct xpc_channel *ch,
+ struct xpc_send_msg_slot_uv *msg_slot)
+{
+ xpc_put_fifo_entry_uv(&ch->sn.uv.msg_slot_free_list, &msg_slot->next);
+
+ /* wakeup anyone waiting for a free msg slot */
+ if (atomic_read(&ch->n_on_msg_allocate_wq) > 0)
+ wake_up(&ch->msg_allocate_wq);
+}
+
+static void
+xpc_notify_sender_uv(struct xpc_channel *ch,
+ struct xpc_send_msg_slot_uv *msg_slot,
+ enum xp_retval reason)
+{
+ xpc_notify_func func = msg_slot->func;
+
+ if (func != NULL && cmpxchg(&msg_slot->func, func, NULL) == func) {
+
+ atomic_dec(&ch->n_to_notify);
+
+ dev_dbg(xpc_chan, "msg_slot->func() called, msg_slot=0x%p "
+ "msg_slot_number=%d partid=%d channel=%d\n", msg_slot,
+ msg_slot->msg_slot_number, ch->partid, ch->number);
+
+ func(reason, ch->partid, ch->number, msg_slot->key);
+
+ dev_dbg(xpc_chan, "msg_slot->func() returned, msg_slot=0x%p "
+ "msg_slot_number=%d partid=%d channel=%d\n", msg_slot,
+ msg_slot->msg_slot_number, ch->partid, ch->number);
+ }
+}
+
+static void
+xpc_handle_notify_mq_ack_uv(struct xpc_channel *ch,
+ struct xpc_notify_mq_msg_uv *msg)
+{
+ struct xpc_send_msg_slot_uv *msg_slot;
+ int entry = msg->hdr.msg_slot_number % ch->local_nentries;
+
+ msg_slot = &ch->sn.uv.send_msg_slots[entry];
+
+ BUG_ON(msg_slot->msg_slot_number != msg->hdr.msg_slot_number);
+ msg_slot->msg_slot_number += ch->local_nentries;
+
+ if (msg_slot->func != NULL)
+ xpc_notify_sender_uv(ch, msg_slot, xpMsgDelivered);
+
+ xpc_free_msg_slot_uv(ch, msg_slot);
+}
+
+static void
+xpc_handle_notify_mq_msg_uv(struct xpc_partition *part,
+ struct xpc_notify_mq_msg_uv *msg)
+{
+ struct xpc_partition_uv *part_uv = &part->sn.uv;
+ struct xpc_channel *ch;
+ struct xpc_channel_uv *ch_uv;
+ struct xpc_notify_mq_msg_uv *msg_slot;
+ unsigned long irq_flags;
+ int ch_number = msg->hdr.ch_number;
+
+ if (unlikely(ch_number >= part->nchannels)) {
+ dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received invalid "
+ "channel number=0x%x in message from partid=%d\n",
+ ch_number, XPC_PARTID(part));
+
+ /* get hb checker to deactivate from the remote partition */
+ spin_lock_irqsave(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+ if (part_uv->act_state_req == 0)
+ xpc_activate_IRQ_rcvd++;
+ part_uv->act_state_req = XPC_P_ASR_DEACTIVATE_UV;
+ part_uv->reason = xpBadChannelNumber;
+ spin_unlock_irqrestore(&xpc_activate_IRQ_rcvd_lock, irq_flags);
+
+ wake_up_interruptible(&xpc_activate_IRQ_wq);
+ return;
+ }
+
+ ch = &part->channels[ch_number];
+ xpc_msgqueue_ref(ch);
+
+ if (!(ch->flags & XPC_C_CONNECTED)) {
+ xpc_msgqueue_deref(ch);
+ return;
+ }
+
+ /* see if we're really dealing with an ACK for a previously sent msg */
+ if (msg->hdr.size == 0) {
+ xpc_handle_notify_mq_ack_uv(ch, msg);
+ xpc_msgqueue_deref(ch);
+ return;
+ }
+
+ /* we're dealing with a normal message sent via the notify_mq */
+ ch_uv = &ch->sn.uv;
+
+ msg_slot = ch_uv->recv_msg_slots +
+ (msg->hdr.msg_slot_number % ch->remote_nentries) * ch->entry_size;
+
+ BUG_ON(msg_slot->hdr.size != 0);
+
+ memcpy(msg_slot, msg, msg->hdr.size);
+
+ xpc_put_fifo_entry_uv(&ch_uv->recv_msg_list, &msg_slot->hdr.u.next);
+
+ if (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) {
+ /*
+ * If there is an existing idle kthread get it to deliver
+ * the payload, otherwise we'll have to get the channel mgr
+ * for this partition to create a kthread to do the delivery.
+ */
+ if (atomic_read(&ch->kthreads_idle) > 0)
+ wake_up_nr(&ch->idle_wq, 1);
+ else
+ xpc_send_chctl_local_msgrequest_uv(part, ch->number);
+ }
+ xpc_msgqueue_deref(ch);
+}
+
+static irqreturn_t
+xpc_handle_notify_IRQ_uv(int irq, void *dev_id)
+{
+ struct xpc_notify_mq_msg_uv *msg;
+ short partid;
+ struct xpc_partition *part;
+
+ while ((msg = gru_get_next_message(xpc_notify_mq_uv->gru_mq_desc)) !=
+ NULL) {
+
+ partid = msg->hdr.partid;
+ if (partid < 0 || partid >= XP_MAX_NPARTITIONS_UV) {
+ dev_err(xpc_part, "xpc_handle_notify_IRQ_uv() received "
+ "invalid partid=0x%x in message\n", partid);
+ } else {
+ part = &xpc_partitions[partid];
+
+ if (xpc_part_ref(part)) {
+ xpc_handle_notify_mq_msg_uv(part, msg);
+ xpc_part_deref(part);
+ }
+ }
+
+ gru_free_message(xpc_notify_mq_uv->gru_mq_desc, msg);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static int
+xpc_n_of_deliverable_payloads_uv(struct xpc_channel *ch)
+{
+ return xpc_n_of_fifo_entries_uv(&ch->sn.uv.recv_msg_list);
+}
+
+static void
+xpc_process_msg_chctl_flags_uv(struct xpc_partition *part, int ch_number)
+{
+ struct xpc_channel *ch = &part->channels[ch_number];
+ int ndeliverable_payloads;
+
+ xpc_msgqueue_ref(ch);
+
+ ndeliverable_payloads = xpc_n_of_deliverable_payloads_uv(ch);
+
+ if (ndeliverable_payloads > 0 &&
+ (ch->flags & XPC_C_CONNECTED) &&
+ (ch->flags & XPC_C_CONNECTEDCALLOUT_MADE)) {
+
+ xpc_activate_kthreads(ch, ndeliverable_payloads);
+ }
+
+ xpc_msgqueue_deref(ch);
+}
+
+static enum xp_retval
+xpc_send_payload_uv(struct xpc_channel *ch, u32 flags, void *payload,
+ u16 payload_size, u8 notify_type, xpc_notify_func func,
+ void *key)
+{
+ enum xp_retval ret = xpSuccess;
+ struct xpc_send_msg_slot_uv *msg_slot = NULL;
+ struct xpc_notify_mq_msg_uv *msg;
+ u8 msg_buffer[XPC_NOTIFY_MSG_SIZE_UV];
+ size_t msg_size;
+
+ DBUG_ON(notify_type != XPC_N_CALL);
+
+ msg_size = sizeof(struct xpc_notify_mq_msghdr_uv) + payload_size;
+ if (msg_size > ch->entry_size)
+ return xpPayloadTooBig;
+
+ xpc_msgqueue_ref(ch);
+
+ if (ch->flags & XPC_C_DISCONNECTING) {
+ ret = ch->reason;
+ goto out_1;
+ }
+ if (!(ch->flags & XPC_C_CONNECTED)) {
+ ret = xpNotConnected;
+ goto out_1;
+ }
+
+ ret = xpc_allocate_msg_slot_uv(ch, flags, &msg_slot);
+ if (ret != xpSuccess)
+ goto out_1;
+
+ if (func != NULL) {
+ atomic_inc(&ch->n_to_notify);
+
+ msg_slot->key = key;
+ smp_wmb(); /* a non-NULL func must hit memory after the key */
+ msg_slot->func = func;
+
+ if (ch->flags & XPC_C_DISCONNECTING) {
+ ret = ch->reason;
+ goto out_2;
+ }
+ }
+
+ msg = (struct xpc_notify_mq_msg_uv *)&msg_buffer;
+ msg->hdr.partid = xp_partition_id;
+ msg->hdr.ch_number = ch->number;
+ msg->hdr.size = msg_size;
+ msg->hdr.msg_slot_number = msg_slot->msg_slot_number;
+ memcpy(&msg->payload, payload, payload_size);
+
+ ret = xpc_send_gru_msg(ch->sn.uv.cached_notify_gru_mq_desc, msg,
+ msg_size);
+ if (ret == xpSuccess)
+ goto out_1;
+
+ XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret);
+out_2:
+ if (func != NULL) {
+ /*
+ * Try to NULL the msg_slot's func field. If we fail, then
+ * xpc_notify_senders_of_disconnect_uv() beat us to it, in which
+ * case we need to pretend we succeeded to send the message
+ * since the user will get a callout for the disconnect error
+ * by xpc_notify_senders_of_disconnect_uv(), and to also get an
+ * error returned here will confuse them. Additionally, since
+ * in this case the channel is being disconnected we don't need
+ * to put the msg_slot back on the free list.
+ */
+ if (cmpxchg(&msg_slot->func, func, NULL) != func) {
+ ret = xpSuccess;
+ goto out_1;
+ }
+
+ msg_slot->key = NULL;
+ atomic_dec(&ch->n_to_notify);
+ }
+ xpc_free_msg_slot_uv(ch, msg_slot);
+out_1:
+ xpc_msgqueue_deref(ch);
+ return ret;
+}
+
+/*
+ * Tell the callers of xpc_send_notify() that the status of their payloads
+ * is unknown because the channel is now disconnecting.
+ *
+ * We don't worry about putting these msg_slots on the free list since the
+ * msg_slots themselves are about to be kfree'd.
+ */
+static void
+xpc_notify_senders_of_disconnect_uv(struct xpc_channel *ch)
+{
+ struct xpc_send_msg_slot_uv *msg_slot;
+ int entry;
+
+ DBUG_ON(!(ch->flags & XPC_C_DISCONNECTING));
+
+ for (entry = 0; entry < ch->local_nentries; entry++) {
+
+ if (atomic_read(&ch->n_to_notify) == 0)
+ break;
+
+ msg_slot = &ch->sn.uv.send_msg_slots[entry];
+ if (msg_slot->func != NULL)
+ xpc_notify_sender_uv(ch, msg_slot, ch->reason);
+ }
+}
+
+/*
+ * Get the next deliverable message's payload.
+ */
+static void *
+xpc_get_deliverable_payload_uv(struct xpc_channel *ch)
+{
+ struct xpc_fifo_entry_uv *entry;
+ struct xpc_notify_mq_msg_uv *msg;
+ void *payload = NULL;
+
+ if (!(ch->flags & XPC_C_DISCONNECTING)) {
+ entry = xpc_get_fifo_entry_uv(&ch->sn.uv.recv_msg_list);
+ if (entry != NULL) {
+ msg = container_of(entry, struct xpc_notify_mq_msg_uv,
+ hdr.u.next);
+ payload = &msg->payload;
+ }
+ }
+ return payload;
+}
+
+static void
+xpc_received_payload_uv(struct xpc_channel *ch, void *payload)
+{
+ struct xpc_notify_mq_msg_uv *msg;
+ enum xp_retval ret;
+
+ msg = container_of(payload, struct xpc_notify_mq_msg_uv, payload);
+
+ /* return an ACK to the sender of this message */
+
+ msg->hdr.partid = xp_partition_id;
+ msg->hdr.size = 0; /* size of zero indicates this is an ACK */
+
+ ret = xpc_send_gru_msg(ch->sn.uv.cached_notify_gru_mq_desc, msg,
+ sizeof(struct xpc_notify_mq_msghdr_uv));
+ if (ret != xpSuccess)
+ XPC_DEACTIVATE_PARTITION(&xpc_partitions[ch->partid], ret);
+}
+
+static const struct xpc_arch_operations xpc_arch_ops_uv = {
+ .setup_partitions = xpc_setup_partitions_uv,
+ .teardown_partitions = xpc_teardown_partitions_uv,
+ .process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_uv,
+ .get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_uv,
+ .setup_rsvd_page = xpc_setup_rsvd_page_uv,
+
+ .allow_hb = xpc_allow_hb_uv,
+ .disallow_hb = xpc_disallow_hb_uv,
+ .disallow_all_hbs = xpc_disallow_all_hbs_uv,
+ .increment_heartbeat = xpc_increment_heartbeat_uv,
+ .offline_heartbeat = xpc_offline_heartbeat_uv,
+ .online_heartbeat = xpc_online_heartbeat_uv,
+ .heartbeat_init = xpc_heartbeat_init_uv,
+ .heartbeat_exit = xpc_heartbeat_exit_uv,
+ .get_remote_heartbeat = xpc_get_remote_heartbeat_uv,
+
+ .request_partition_activation =
+ xpc_request_partition_activation_uv,
+ .request_partition_reactivation =
+ xpc_request_partition_reactivation_uv,
+ .request_partition_deactivation =
+ xpc_request_partition_deactivation_uv,
+ .cancel_partition_deactivation_request =
+ xpc_cancel_partition_deactivation_request_uv,
+
+ .setup_ch_structures = xpc_setup_ch_structures_uv,
+ .teardown_ch_structures = xpc_teardown_ch_structures_uv,
+
+ .make_first_contact = xpc_make_first_contact_uv,
+
+ .get_chctl_all_flags = xpc_get_chctl_all_flags_uv,
+ .send_chctl_closerequest = xpc_send_chctl_closerequest_uv,
+ .send_chctl_closereply = xpc_send_chctl_closereply_uv,
+ .send_chctl_openrequest = xpc_send_chctl_openrequest_uv,
+ .send_chctl_openreply = xpc_send_chctl_openreply_uv,
+ .send_chctl_opencomplete = xpc_send_chctl_opencomplete_uv,
+ .process_msg_chctl_flags = xpc_process_msg_chctl_flags_uv,
+
+ .save_remote_msgqueue_pa = xpc_save_remote_msgqueue_pa_uv,
+
+ .setup_msg_structures = xpc_setup_msg_structures_uv,
+ .teardown_msg_structures = xpc_teardown_msg_structures_uv,
+
+ .indicate_partition_engaged = xpc_indicate_partition_engaged_uv,
+ .indicate_partition_disengaged = xpc_indicate_partition_disengaged_uv,
+ .assume_partition_disengaged = xpc_assume_partition_disengaged_uv,
+ .partition_engaged = xpc_partition_engaged_uv,
+ .any_partition_engaged = xpc_any_partition_engaged_uv,
+
+ .n_of_deliverable_payloads = xpc_n_of_deliverable_payloads_uv,
+ .send_payload = xpc_send_payload_uv,
+ .get_deliverable_payload = xpc_get_deliverable_payload_uv,
+ .received_payload = xpc_received_payload_uv,
+ .notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv,
+};
+
+static int
+xpc_init_mq_node(int nid)
+{
+ int cpu;
+
+ cpus_read_lock();
+
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ xpc_activate_mq_uv =
+ xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, nid,
+ XPC_ACTIVATE_IRQ_NAME,
+ xpc_handle_activate_IRQ_uv);
+ if (!IS_ERR(xpc_activate_mq_uv))
+ break;
+ }
+ if (IS_ERR(xpc_activate_mq_uv)) {
+ cpus_read_unlock();
+ return PTR_ERR(xpc_activate_mq_uv);
+ }
+
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ xpc_notify_mq_uv =
+ xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, nid,
+ XPC_NOTIFY_IRQ_NAME,
+ xpc_handle_notify_IRQ_uv);
+ if (!IS_ERR(xpc_notify_mq_uv))
+ break;
+ }
+ if (IS_ERR(xpc_notify_mq_uv)) {
+ xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
+ cpus_read_unlock();
+ return PTR_ERR(xpc_notify_mq_uv);
+ }
+
+ cpus_read_unlock();
+ return 0;
+}
+
+int
+xpc_init_uv(void)
+{
+ int nid;
+ int ret = 0;
+
+ xpc_arch_ops = xpc_arch_ops_uv;
+
+ if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) {
+ dev_err(xpc_part, "xpc_notify_mq_msghdr_uv is larger than %d\n",
+ XPC_MSG_HDR_MAX_SIZE);
+ return -E2BIG;
+ }
+
+ if (xpc_mq_node < 0)
+ for_each_online_node(nid) {
+ ret = xpc_init_mq_node(nid);
+
+ if (!ret)
+ break;
+ }
+ else
+ ret = xpc_init_mq_node(xpc_mq_node);
+
+ if (ret < 0)
+ dev_err(xpc_part, "xpc_init_mq_node() returned error=%d\n",
+ -ret);
+
+ return ret;
+}
+
+void
+xpc_exit_uv(void)
+{
+ xpc_destroy_gru_mq_uv(xpc_notify_mq_uv);
+ xpc_destroy_gru_mq_uv(xpc_activate_mq_uv);
+}
+
+module_param(xpc_mq_node, int, 0);
+MODULE_PARM_DESC(xpc_mq_node, "Node number on which to allocate message queues.");
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
new file mode 100644
index 000000000..2396ba3b0
--- /dev/null
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -0,0 +1,599 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 1999-2009 Silicon Graphics, Inc. All rights reserved.
+ */
+
+/*
+ * Cross Partition Network Interface (XPNET) support
+ *
+ * XPNET provides a virtual network layered on top of the Cross
+ * Partition communication layer.
+ *
+ * XPNET provides direct point-to-point and broadcast-like support
+ * for an ethernet-like device. The ethernet broadcast medium is
+ * replaced with a point-to-point message structure which passes
+ * pointers to a DMA-capable block that a remote partition should
+ * retrieve and pass to the upper level networking layer.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include "xp.h"
+
+/*
+ * The message payload transferred by XPC.
+ *
+ * buf_pa is the physical address where the DMA should pull from.
+ *
+ * NOTE: for performance reasons, buf_pa should _ALWAYS_ begin on a
+ * cacheline boundary. To accomplish this, we record the number of
+ * bytes from the beginning of the first cacheline to the first useful
+ * byte of the skb (leadin_ignore) and the number of bytes from the
+ * last useful byte of the skb to the end of the last cacheline
+ * (tailout_ignore).
+ *
+ * size is the number of bytes to transfer which includes the skb->len
+ * (useful bytes of the senders skb) plus the leadin and tailout
+ */
+struct xpnet_message {
+ u16 version; /* Version for this message */
+ u16 embedded_bytes; /* #of bytes embedded in XPC message */
+ u32 magic; /* Special number indicating this is xpnet */
+ unsigned long buf_pa; /* phys address of buffer to retrieve */
+ u32 size; /* #of bytes in buffer */
+ u8 leadin_ignore; /* #of bytes to ignore at the beginning */
+ u8 tailout_ignore; /* #of bytes to ignore at the end */
+ unsigned char data; /* body of small packets */
+};
+
+/*
+ * Determine the size of our message, the cacheline aligned size,
+ * and then the number of message will request from XPC.
+ *
+ * XPC expects each message to exist in an individual cacheline.
+ */
+#define XPNET_MSG_SIZE XPC_MSG_PAYLOAD_MAX_SIZE
+#define XPNET_MSG_DATA_MAX \
+ (XPNET_MSG_SIZE - offsetof(struct xpnet_message, data))
+#define XPNET_MSG_NENTRIES (PAGE_SIZE / XPC_MSG_MAX_SIZE)
+
+#define XPNET_MAX_KTHREADS (XPNET_MSG_NENTRIES + 1)
+#define XPNET_MAX_IDLE_KTHREADS (XPNET_MSG_NENTRIES + 1)
+
+/*
+ * Version number of XPNET implementation. XPNET can always talk to versions
+ * with same major #, and never talk to versions with a different version.
+ */
+#define _XPNET_VERSION(_major, _minor) (((_major) << 4) | (_minor))
+#define XPNET_VERSION_MAJOR(_v) ((_v) >> 4)
+#define XPNET_VERSION_MINOR(_v) ((_v) & 0xf)
+
+#define XPNET_VERSION _XPNET_VERSION(1, 0) /* version 1.0 */
+#define XPNET_VERSION_EMBED _XPNET_VERSION(1, 1) /* version 1.1 */
+#define XPNET_MAGIC 0x88786984 /* "XNET" */
+
+#define XPNET_VALID_MSG(_m) \
+ ((XPNET_VERSION_MAJOR(_m->version) == XPNET_VERSION_MAJOR(XPNET_VERSION)) \
+ && (msg->magic == XPNET_MAGIC))
+
+#define XPNET_DEVICE_NAME "xp0"
+
+/*
+ * When messages are queued with xpc_send_notify, a kmalloc'd buffer
+ * of the following type is passed as a notification cookie. When the
+ * notification function is called, we use the cookie to decide
+ * whether all outstanding message sends have completed. The skb can
+ * then be released.
+ */
+struct xpnet_pending_msg {
+ struct sk_buff *skb;
+ atomic_t use_count;
+};
+
+static struct net_device *xpnet_device;
+
+/*
+ * When we are notified of other partitions activating, we add them to
+ * our bitmask of partitions to which we broadcast.
+ */
+static unsigned long *xpnet_broadcast_partitions;
+/* protect above */
+static DEFINE_SPINLOCK(xpnet_broadcast_lock);
+
+/*
+ * Since the Block Transfer Engine (BTE) is being used for the transfer
+ * and it relies upon cache-line size transfers, we need to reserve at
+ * least one cache-line for head and tail alignment. The BTE is
+ * limited to 8MB transfers.
+ *
+ * Testing has shown that changing MTU to greater than 64KB has no effect
+ * on TCP as the two sides negotiate a Max Segment Size that is limited
+ * to 64K. Other protocols May use packets greater than this, but for
+ * now, the default is 64KB.
+ */
+#define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
+/* 68 comes from min TCP+IP+MAC header */
+#define XPNET_MIN_MTU 68
+/* 32KB has been determined to be the ideal */
+#define XPNET_DEF_MTU (0x8000UL)
+
+/*
+ * The partid is encapsulated in the MAC address beginning in the following
+ * octet and it consists of two octets.
+ */
+#define XPNET_PARTID_OCTET 2
+
+/* Define the XPNET debug device structures to be used with dev_dbg() et al */
+
+static struct device_driver xpnet_dbg_name = {
+ .name = "xpnet"
+};
+
+static struct device xpnet_dbg_subname = {
+ .init_name = "", /* set to "" */
+ .driver = &xpnet_dbg_name
+};
+
+static struct device *xpnet = &xpnet_dbg_subname;
+
+/*
+ * Packet was recevied by XPC and forwarded to us.
+ */
+static void
+xpnet_receive(short partid, int channel, struct xpnet_message *msg)
+{
+ struct sk_buff *skb;
+ void *dst;
+ enum xp_retval ret;
+
+ if (!XPNET_VALID_MSG(msg)) {
+ /*
+ * Packet with a different XPC version. Ignore.
+ */
+ xpc_received(partid, channel, (void *)msg);
+
+ xpnet_device->stats.rx_errors++;
+
+ return;
+ }
+ dev_dbg(xpnet, "received 0x%lx, %d, %d, %d\n", msg->buf_pa, msg->size,
+ msg->leadin_ignore, msg->tailout_ignore);
+
+ /* reserve an extra cache line */
+ skb = dev_alloc_skb(msg->size + L1_CACHE_BYTES);
+ if (!skb) {
+ dev_err(xpnet, "failed on dev_alloc_skb(%d)\n",
+ msg->size + L1_CACHE_BYTES);
+
+ xpc_received(partid, channel, (void *)msg);
+
+ xpnet_device->stats.rx_errors++;
+
+ return;
+ }
+
+ /*
+ * The allocated skb has some reserved space.
+ * In order to use xp_remote_memcpy(), we need to get the
+ * skb->data pointer moved forward.
+ */
+ skb_reserve(skb, (L1_CACHE_BYTES - ((u64)skb->data &
+ (L1_CACHE_BYTES - 1)) +
+ msg->leadin_ignore));
+
+ /*
+ * Update the tail pointer to indicate data actually
+ * transferred.
+ */
+ skb_put(skb, (msg->size - msg->leadin_ignore - msg->tailout_ignore));
+
+ /*
+ * Move the data over from the other side.
+ */
+ if ((XPNET_VERSION_MINOR(msg->version) == 1) &&
+ (msg->embedded_bytes != 0)) {
+ dev_dbg(xpnet, "copying embedded message. memcpy(0x%p, 0x%p, "
+ "%lu)\n", skb->data, &msg->data,
+ (size_t)msg->embedded_bytes);
+
+ skb_copy_to_linear_data(skb, &msg->data,
+ (size_t)msg->embedded_bytes);
+ } else {
+ dst = (void *)((u64)skb->data & ~(L1_CACHE_BYTES - 1));
+ dev_dbg(xpnet, "transferring buffer to the skb->data area;\n\t"
+ "xp_remote_memcpy(0x%p, 0x%p, %u)\n", dst,
+ (void *)msg->buf_pa, msg->size);
+
+ ret = xp_remote_memcpy(xp_pa(dst), msg->buf_pa, msg->size);
+ if (ret != xpSuccess) {
+ /*
+ * !!! Need better way of cleaning skb. Currently skb
+ * !!! appears in_use and we can't just call
+ * !!! dev_kfree_skb.
+ */
+ dev_err(xpnet, "xp_remote_memcpy(0x%p, 0x%p, 0x%x) "
+ "returned error=0x%x\n", dst,
+ (void *)msg->buf_pa, msg->size, ret);
+
+ xpc_received(partid, channel, (void *)msg);
+
+ xpnet_device->stats.rx_errors++;
+
+ return;
+ }
+ }
+
+ dev_dbg(xpnet, "<skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+ "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
+ (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
+ skb->len);
+
+ skb->protocol = eth_type_trans(skb, xpnet_device);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ dev_dbg(xpnet, "passing skb to network layer\n"
+ "\tskb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+ "skb->end=0x%p skb->len=%d\n",
+ (void *)skb->head, (void *)skb->data, skb_tail_pointer(skb),
+ skb_end_pointer(skb), skb->len);
+
+ xpnet_device->stats.rx_packets++;
+ xpnet_device->stats.rx_bytes += skb->len + ETH_HLEN;
+
+ netif_rx(skb);
+ xpc_received(partid, channel, (void *)msg);
+}
+
+/*
+ * This is the handler which XPC calls during any sort of change in
+ * state or message reception on a connection.
+ */
+static void
+xpnet_connection_activity(enum xp_retval reason, short partid, int channel,
+ void *data, void *key)
+{
+ DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
+ DBUG_ON(channel != XPC_NET_CHANNEL);
+
+ switch (reason) {
+ case xpMsgReceived: /* message received */
+ DBUG_ON(data == NULL);
+
+ xpnet_receive(partid, channel, (struct xpnet_message *)data);
+ break;
+
+ case xpConnected: /* connection completed to a partition */
+ spin_lock_bh(&xpnet_broadcast_lock);
+ __set_bit(partid, xpnet_broadcast_partitions);
+ spin_unlock_bh(&xpnet_broadcast_lock);
+
+ netif_carrier_on(xpnet_device);
+
+ dev_dbg(xpnet, "%s connected to partition %d\n",
+ xpnet_device->name, partid);
+ break;
+
+ default:
+ spin_lock_bh(&xpnet_broadcast_lock);
+ __clear_bit(partid, xpnet_broadcast_partitions);
+ spin_unlock_bh(&xpnet_broadcast_lock);
+
+ if (bitmap_empty(xpnet_broadcast_partitions,
+ xp_max_npartitions)) {
+ netif_carrier_off(xpnet_device);
+ }
+
+ dev_dbg(xpnet, "%s disconnected from partition %d\n",
+ xpnet_device->name, partid);
+ break;
+ }
+}
+
+static int
+xpnet_dev_open(struct net_device *dev)
+{
+ enum xp_retval ret;
+
+ dev_dbg(xpnet, "calling xpc_connect(%d, 0x%p, NULL, %ld, %ld, %ld, "
+ "%ld)\n", XPC_NET_CHANNEL, xpnet_connection_activity,
+ (unsigned long)XPNET_MSG_SIZE,
+ (unsigned long)XPNET_MSG_NENTRIES,
+ (unsigned long)XPNET_MAX_KTHREADS,
+ (unsigned long)XPNET_MAX_IDLE_KTHREADS);
+
+ ret = xpc_connect(XPC_NET_CHANNEL, xpnet_connection_activity, NULL,
+ XPNET_MSG_SIZE, XPNET_MSG_NENTRIES,
+ XPNET_MAX_KTHREADS, XPNET_MAX_IDLE_KTHREADS);
+ if (ret != xpSuccess) {
+ dev_err(xpnet, "ifconfig up of %s failed on XPC connect, "
+ "ret=%d\n", dev->name, ret);
+
+ return -ENOMEM;
+ }
+
+ dev_dbg(xpnet, "ifconfig up of %s; XPC connected\n", dev->name);
+
+ return 0;
+}
+
+static int
+xpnet_dev_stop(struct net_device *dev)
+{
+ xpc_disconnect(XPC_NET_CHANNEL);
+
+ dev_dbg(xpnet, "ifconfig down of %s; XPC disconnected\n", dev->name);
+
+ return 0;
+}
+
+/*
+ * Notification that the other end has received the message and
+ * DMA'd the skb information. At this point, they are done with
+ * our side. When all recipients are done processing, we
+ * release the skb and then release our pending message structure.
+ */
+static void
+xpnet_send_completed(enum xp_retval reason, short partid, int channel,
+ void *__qm)
+{
+ struct xpnet_pending_msg *queued_msg = (struct xpnet_pending_msg *)__qm;
+
+ DBUG_ON(queued_msg == NULL);
+
+ dev_dbg(xpnet, "message to %d notified with reason %d\n",
+ partid, reason);
+
+ if (atomic_dec_return(&queued_msg->use_count) == 0) {
+ dev_dbg(xpnet, "all acks for skb->head=-x%p\n",
+ (void *)queued_msg->skb->head);
+
+ dev_kfree_skb_any(queued_msg->skb);
+ kfree(queued_msg);
+ }
+}
+
+static void
+xpnet_send(struct sk_buff *skb, struct xpnet_pending_msg *queued_msg,
+ u64 start_addr, u64 end_addr, u16 embedded_bytes, int dest_partid)
+{
+ u8 msg_buffer[XPNET_MSG_SIZE];
+ struct xpnet_message *msg = (struct xpnet_message *)&msg_buffer;
+ u16 msg_size = sizeof(struct xpnet_message);
+ enum xp_retval ret;
+
+ msg->embedded_bytes = embedded_bytes;
+ if (unlikely(embedded_bytes != 0)) {
+ msg->version = XPNET_VERSION_EMBED;
+ dev_dbg(xpnet, "calling memcpy(0x%p, 0x%p, 0x%lx)\n",
+ &msg->data, skb->data, (size_t)embedded_bytes);
+ skb_copy_from_linear_data(skb, &msg->data,
+ (size_t)embedded_bytes);
+ msg_size += embedded_bytes - 1;
+ } else {
+ msg->version = XPNET_VERSION;
+ }
+ msg->magic = XPNET_MAGIC;
+ msg->size = end_addr - start_addr;
+ msg->leadin_ignore = (u64)skb->data - start_addr;
+ msg->tailout_ignore = end_addr - (u64)skb_tail_pointer(skb);
+ msg->buf_pa = xp_pa((void *)start_addr);
+
+ dev_dbg(xpnet, "sending XPC message to %d:%d\n"
+ "msg->buf_pa=0x%lx, msg->size=%u, "
+ "msg->leadin_ignore=%u, msg->tailout_ignore=%u\n",
+ dest_partid, XPC_NET_CHANNEL, msg->buf_pa, msg->size,
+ msg->leadin_ignore, msg->tailout_ignore);
+
+ atomic_inc(&queued_msg->use_count);
+
+ ret = xpc_send_notify(dest_partid, XPC_NET_CHANNEL, XPC_NOWAIT, msg,
+ msg_size, xpnet_send_completed, queued_msg);
+ if (unlikely(ret != xpSuccess))
+ atomic_dec(&queued_msg->use_count);
+}
+
+/*
+ * Network layer has formatted a packet (skb) and is ready to place it
+ * "on the wire". Prepare and send an xpnet_message to all partitions
+ * which have connected with us and are targets of this packet.
+ *
+ * MAC-NOTE: For the XPNET driver, the MAC address contains the
+ * destination partid. If the destination partid octets are 0xffff,
+ * this packet is to be broadcast to all connected partitions.
+ */
+static netdev_tx_t
+xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xpnet_pending_msg *queued_msg;
+ u64 start_addr, end_addr;
+ short dest_partid;
+ u16 embedded_bytes = 0;
+
+ dev_dbg(xpnet, ">skb->head=0x%p skb->data=0x%p skb->tail=0x%p "
+ "skb->end=0x%p skb->len=%d\n", (void *)skb->head,
+ (void *)skb->data, skb_tail_pointer(skb), skb_end_pointer(skb),
+ skb->len);
+
+ if (skb->data[0] == 0x33) {
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK; /* nothing needed to be done */
+ }
+
+ /*
+ * The xpnet_pending_msg tracks how many outstanding
+ * xpc_send_notifies are relying on this skb. When none
+ * remain, release the skb.
+ */
+ queued_msg = kmalloc(sizeof(struct xpnet_pending_msg), GFP_ATOMIC);
+ if (queued_msg == NULL) {
+ dev_warn(xpnet, "failed to kmalloc %ld bytes; dropping "
+ "packet\n", sizeof(struct xpnet_pending_msg));
+
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* get the beginning of the first cacheline and end of last */
+ start_addr = ((u64)skb->data & ~(L1_CACHE_BYTES - 1));
+ end_addr = L1_CACHE_ALIGN((u64)skb_tail_pointer(skb));
+
+ /* calculate how many bytes to embed in the XPC message */
+ if (unlikely(skb->len <= XPNET_MSG_DATA_MAX)) {
+ /* skb->data does fit so embed */
+ embedded_bytes = skb->len;
+ }
+
+ /*
+ * Since the send occurs asynchronously, we set the count to one
+ * and begin sending. Any sends that happen to complete before
+ * we are done sending will not free the skb. We will be left
+ * with that task during exit. This also handles the case of
+ * a packet destined for a partition which is no longer up.
+ */
+ atomic_set(&queued_msg->use_count, 1);
+ queued_msg->skb = skb;
+
+ if (skb->data[0] == 0xff) {
+ /* we are being asked to broadcast to all partitions */
+ for_each_set_bit(dest_partid, xpnet_broadcast_partitions,
+ xp_max_npartitions) {
+
+ xpnet_send(skb, queued_msg, start_addr, end_addr,
+ embedded_bytes, dest_partid);
+ }
+ } else {
+ dest_partid = (short)skb->data[XPNET_PARTID_OCTET + 1];
+ dest_partid |= (short)skb->data[XPNET_PARTID_OCTET + 0] << 8;
+
+ if (dest_partid >= 0 &&
+ dest_partid < xp_max_npartitions &&
+ test_bit(dest_partid, xpnet_broadcast_partitions) != 0) {
+
+ xpnet_send(skb, queued_msg, start_addr, end_addr,
+ embedded_bytes, dest_partid);
+ }
+ }
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ if (atomic_dec_return(&queued_msg->use_count) == 0) {
+ dev_kfree_skb(skb);
+ kfree(queued_msg);
+ }
+
+ return NETDEV_TX_OK;
+}
+
+/*
+ * Deal with transmit timeouts coming from the network layer.
+ */
+static void
+xpnet_dev_tx_timeout(struct net_device *dev, unsigned int txqueue)
+{
+ dev->stats.tx_errors++;
+}
+
+static const struct net_device_ops xpnet_netdev_ops = {
+ .ndo_open = xpnet_dev_open,
+ .ndo_stop = xpnet_dev_stop,
+ .ndo_start_xmit = xpnet_dev_hard_start_xmit,
+ .ndo_tx_timeout = xpnet_dev_tx_timeout,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+static int __init
+xpnet_init(void)
+{
+ u8 addr[ETH_ALEN];
+ int result;
+
+ if (!is_uv_system())
+ return -ENODEV;
+
+ dev_info(xpnet, "registering network device %s\n", XPNET_DEVICE_NAME);
+
+ xpnet_broadcast_partitions = bitmap_zalloc(xp_max_npartitions,
+ GFP_KERNEL);
+ if (xpnet_broadcast_partitions == NULL)
+ return -ENOMEM;
+
+ /*
+ * use ether_setup() to init the majority of our device
+ * structure and then override the necessary pieces.
+ */
+ xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_UNKNOWN,
+ ether_setup);
+ if (xpnet_device == NULL) {
+ bitmap_free(xpnet_broadcast_partitions);
+ return -ENOMEM;
+ }
+
+ netif_carrier_off(xpnet_device);
+
+ xpnet_device->netdev_ops = &xpnet_netdev_ops;
+ xpnet_device->mtu = XPNET_DEF_MTU;
+ xpnet_device->min_mtu = XPNET_MIN_MTU;
+ xpnet_device->max_mtu = XPNET_MAX_MTU;
+
+ memset(addr, 0, sizeof(addr));
+ /*
+ * Multicast assumes the LSB of the first octet is set for multicast
+ * MAC addresses. We chose the first octet of the MAC to be unlikely
+ * to collide with any vendor's officially issued MAC.
+ */
+ addr[0] = 0x02; /* locally administered, no OUI */
+
+ addr[XPNET_PARTID_OCTET + 1] = xp_partition_id;
+ addr[XPNET_PARTID_OCTET + 0] = (xp_partition_id >> 8);
+ eth_hw_addr_set(xpnet_device, addr);
+
+ /*
+ * ether_setup() sets this to a multicast device. We are
+ * really not supporting multicast at this time.
+ */
+ xpnet_device->flags &= ~IFF_MULTICAST;
+
+ /*
+ * No need to checksum as it is a DMA transfer. The BTE will
+ * report an error if the data is not retrievable and the
+ * packet will be dropped.
+ */
+ xpnet_device->features = NETIF_F_HW_CSUM;
+
+ result = register_netdev(xpnet_device);
+ if (result != 0) {
+ free_netdev(xpnet_device);
+ bitmap_free(xpnet_broadcast_partitions);
+ }
+
+ return result;
+}
+
+module_init(xpnet_init);
+
+static void __exit
+xpnet_exit(void)
+{
+ dev_info(xpnet, "unregistering network device %s\n",
+ xpnet_device[0].name);
+
+ unregister_netdev(xpnet_device);
+ free_netdev(xpnet_device);
+ bitmap_free(xpnet_broadcast_partitions);
+}
+
+module_exit(xpnet_exit);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION("Cross Partition Network adapter (XPNET)");
+MODULE_LICENSE("GPL");