summaryrefslogtreecommitdiffstats
path: root/agents/crosslink
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--agents/crosslink/README.md44
-rwxr-xr-xagents/crosslink/fence_crosslink.py113
2 files changed, 157 insertions, 0 deletions
diff --git a/agents/crosslink/README.md b/agents/crosslink/README.md
new file mode 100644
index 0000000..990b790
--- /dev/null
+++ b/agents/crosslink/README.md
@@ -0,0 +1,44 @@
+# Two node cross-link fence agent
+
+The problem that this fence agent tries to solve is the following:
+
+Given a two-node cluster with a direct crosslink ethernet cable
+between the two nodes (in addition to the normal networking setup), we want to
+be able to maintain quorum on node (A) when node (B) loses power.
+The loss of power on node (B) in this case implies that its BMC/IPMI, which
+would normally be used for fencing, is not available either.
+
+Note: An external PDU would be preferable and would solve this
+situation more elegantly. The assumption here is that something
+like that won't be available in this environment.
+
+This works by creating a stonith topology with BMC/IPMI
+fencing at level 1 and the fence_crosslink agent at level 2.
+
+In case node (A) has lost power, then node (B) will do the following:
+1. Try to fence node (A) via IPMI, which will fail since the node has no
+power and the BMC is unavailable.
+2. Check via fence_crosslink the cross-cable interconnect. If the cross cable
+IP is not reachable, then we know for "sure" (this is a potentially broad
+assumption) that the node is really down and fence_crosslink tells pacemaker
+that the fencing was successful, so pacemaker can work with that new
+information.
+
+Here are some example configuration commands:
+~~~
+pcs stonith create crosslink-controller-1 fence_crosslink crosscableip=1.1.1.2 pcmk_host_list=controller-1 pcmk_reboot_action=off
+pcs stonith create crosslink-controller-0 fence_crosslink crosscableip=1.1.1.1 pcmk_host_list=controller-0 pcmk_reboot_action=off
+# We make sure the stonith resources do not run on the same node as their fencing target
+pcs constraint location crosslink-controller-1 avoids controller-1
+pcs constraint location crosslink-controller-0 avoids controller-0
+pcs stonith level add 2 controller-0 crosslink-controller-0
+pcs stonith level add 2 controller-1 crosslink-controller-1
+~~~
+
+Testing done:
+- Simulate power outage by turning off the controller-1 VM and its IPMI interface and leaving the crosslink intact.
+
+ * Expected Outcome:
+ We should retain quorum on controller-0 and all services should be running on controller-0. No UNCLEAN resources should be observed on controller-0.
+ * Actual Outcome:
+ Matched the expected outcome.
diff --git a/agents/crosslink/fence_crosslink.py b/agents/crosslink/fence_crosslink.py
new file mode 100755
index 0000000..7cfc90c
--- /dev/null
+++ b/agents/crosslink/fence_crosslink.py
@@ -0,0 +1,113 @@
+#!@PYTHON@ -tt
+
+# Copyright (c) 2020 Red Hat
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see
+# <http://www.gnu.org/licenses/>.
+
+import atexit
+import logging
+import sys
+sys.path.append("@FENCEAGENTSLIBDIR@")
+from fencing import (all_opt, atexit_handler, check_input, # noqa: E402
+ fence_action, process_input, run_command, run_delay,
+ show_docs)
+
+logger = logging.getLogger(__name__)
+logger.setLevel("WARNING")
+
+
+def get_power_status(conn, options):
+ logger.debug("get_power_status(): %s" % options)
+ ip = options['--crosscableip']
+ timeout = options['--timeout']
+ # This returns 'off' if not a single ICMP packet gets answered during the
+ # whole timeout window. (the ping executable will return 1 in such case and
+ # 0 if even a single packet gets replied to)
+ (status, stdout, stderr) = run_command(options, "ping -w%s -n %s" %
+ (timeout, ip))
+ logger.debug("get_power_status(): %s - Stdout: %s - Stderr: %s" %
+ (status, stdout, stderr))
+ if status == 0:
+ return "on"
+ else:
+ return "off"
+
+
+def set_power_status(conn, options):
+ logger.debug("set_power_status(): %s" % options)
+ # If we got here it means the previous call to get_power_status() returned
+ # on At this point we've been invoked but the node is still reachable over
+ # the cross connect, so we can just error out.
+ ip = options['--crosscableip']
+ if options['--action'] == 'off':
+ logger.error("We've been asked to turn off the node at %s but the "
+ "cross-cable link is up so erroring out" % ip)
+ sys.exit(1)
+ elif options['--action'] == 'on':
+ logger.error("We've been asked to turn on the node at %s but the "
+ "cross-cable link is off so erroring out" % ip)
+ sys.exit(1)
+ else:
+ logger.error("set_power_status() was called with action %s which "
+ "is not supported" % options['--action'])
+ sys.exit(1)
+
+
+def define_new_opts():
+ all_opt["crosscableip"] = {
+ "getopt": "a:",
+ "longopt": "crosscableip",
+ "help": "-a, --crosscableip=[IP] IP over the cross-cable link",
+ "required": "1",
+ "shortdesc": "Cross-cable IP",
+ "order": 1
+ }
+ all_opt["timeout"] = {
+ "getopt": "T:",
+ "longopt": "timeout",
+ "help": "-T, --timeout=[seconds] timeout in seconds",
+ "required": "0",
+ "shortdesc": "No ICMP reply in 5 seconds -> Node is considered dead",
+ "default": "5",
+ "order": 1
+ }
+
+
+def main():
+ atexit.register(atexit_handler)
+
+ device_opt = ["crosscableip", "timeout", "no_password", "no_login", "port"]
+ define_new_opts()
+
+ options = check_input(device_opt, process_input(device_opt))
+
+ docs = {}
+ docs["shortdesc"] = "Fence agent for cross-link two-node clusters"
+ docs["longdesc"] = "This agent helps two-node clusters to tackle the " \
+ "situation where one node lost power, cannot be " \
+ "fenced by telling pacemaker that if the node is not " \
+ "reachable over the crosslink cable, we can assume " \
+ "it is dead"
+ docs["vendorurl"] = ""
+ show_docs(options, docs)
+
+ run_delay(options)
+
+ result = fence_action(None, options, set_power_status, get_power_status)
+ sys.exit(result)
+
+
+if __name__ == "__main__":
+ main()