diff options
Diffstat (limited to 'doc')
-rw-r--r-- | doc/Makefile.am | 53 | ||||
-rw-r--r-- | doc/ha_logd.xml.in | 134 | ||||
-rw-r--r-- | doc/ha_logger.xml.in | 110 | ||||
-rw-r--r-- | doc/hb_report.8.txt | 478 | ||||
-rw-r--r-- | doc/meatclient.xml.in | 77 | ||||
-rw-r--r-- | doc/stonith.xml.in | 315 | ||||
-rw-r--r-- | doc/stonith/Makefile.am | 37 | ||||
-rw-r--r-- | doc/stonith/README.bladehpi | 101 | ||||
-rw-r--r-- | doc/stonith/README.cyclades | 61 | ||||
-rw-r--r-- | doc/stonith/README.drac3 | 18 | ||||
-rw-r--r-- | doc/stonith/README.dracmc | 87 | ||||
-rw-r--r-- | doc/stonith/README.external | 90 | ||||
-rw-r--r-- | doc/stonith/README.ibmrsa | 9 | ||||
-rw-r--r-- | doc/stonith/README.ibmrsa-telnet | 55 | ||||
-rw-r--r-- | doc/stonith/README.ipmilan | 131 | ||||
-rw-r--r-- | doc/stonith/README.ippower9258 | 68 | ||||
-rw-r--r-- | doc/stonith/README.meatware | 26 | ||||
-rw-r--r-- | doc/stonith/README.rackpdu | 21 | ||||
-rw-r--r-- | doc/stonith/README.rcd_serial | 186 | ||||
-rw-r--r-- | doc/stonith/README.riloe | 36 | ||||
-rw-r--r-- | doc/stonith/README.vacm | 40 | ||||
-rw-r--r-- | doc/stonith/README.vcenter | 90 | ||||
-rw-r--r-- | doc/stonith/README.wti_mpc | 85 | ||||
-rw-r--r-- | doc/stonith/README_kdumpcheck.txt | 151 |
24 files changed, 2459 insertions, 0 deletions
diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000..c8d67a8 --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,53 @@ +# +# heartbeat: Linux-HA heartbeat code +# +# Copyright (C) 2001 Michael Moerz +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +MAINTAINERCLEANFILES = Makefile.in hb_report.xml ha_logd.xml ha_logger.xml stonith.xml meatclient.xml + +CLEANFILES = $(man_MANS) + +SUBDIRS = stonith + +hanoarchdir = $(datadir)/heartbeat + +man_MANS = + +if BUILD_DOC +man_MANS += hb_report.8 ha_logd.8 ha_logger.1 stonith.8 meatclient.8 + +EXTRA_DIST = $(man_MANS) + +STYLESHEET_PREFIX ?= http://docbook.sourceforge.net/release/xsl/current +MANPAGES_STYLESHEET ?= $(STYLESHEET_PREFIX)/manpages/docbook.xsl +HTML_STYLESHEET ?= $(STYLESHEET_PREFIX)/xhtml/docbook.xsl +FO_STYLESHEET ?= $(STYLESHEET_PREFIX)/fo/docbook.xsl + +XSLTPROC_OPTIONS ?= --xinclude +XSLTPROC_MANPAGES_OPTIONS ?= $(XSLTPROC_OPTIONS) +XSLTPROC_HTML_OPTIONS ?= $(XSLTPROC_OPTIONS) +XSLTPROC_FO_OPTIONS ?= $(XSLTPROC_OPTIONS) + +%.5 %.8 %.1: %.xml + $(XSLTPROC) \ + $(XSLTPROC_MANPAGES_OPTIONS) \ + $(MANPAGES_STYLESHEET) $< + +hb_report.8: hb_report.8.txt + a2x -f manpage $< + +endif diff --git a/doc/ha_logd.xml.in b/doc/ha_logd.xml.in new file mode 100644 index 0000000..368f06d --- /dev/null +++ b/doc/ha_logd.xml.in @@ -0,0 +1,134 
@@ +<?xml version="1.0"?> +<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN" "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"> +<refentry id="re-ha_logd"> + <refentryinfo> + <date>December 8, 2009</date> + <productname>@PACKAGE_NAME@</productname> + <productnumber>@VERSION@</productnumber> + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Robertson</surname> + <contrib>ha_logd</contrib> + <email>alanr@unix.sh</email> + </author> + <author> + <surname>Shi</surname> + <firstname>Guochun</firstname> + <contrib>ha_logd</contrib> + <email>gshi@ncsa.uiuc.edu</email> + </author> + <author> + <surname>Lars</surname> + <firstname>Marowsky-Bree</firstname> + <contrib>ha_logd</contrib> + <email>lmb@suse.de</email> + </author> + <author> + <firstname>Florian</firstname> + <surname>Haas</surname> + <contrib>man page</contrib> + <email>florian.haas@linbit.com</email> + </author> + </authorgroup> + </refentryinfo> + <refmeta> + <refentrytitle>ha_logd</refentrytitle> + <manvolnum>8</manvolnum> + <refmiscinfo class="manual">System administration utilities</refmiscinfo> + </refmeta> + <refnamediv> + <refname>ha_logd</refname> + <refpurpose>Logging Daemon for High-Availability Linux</refpurpose> + </refnamediv> + <refsynopsisdiv> + <cmdsynopsis> + <command>ha_logd</command> + <arg choice="opt"><option>-s</option></arg> + <arg choice="opt"><option>-k</option></arg> + <arg choice="opt"><option>-d</option></arg> + <arg choice="opt"><option>-h</option></arg> + <arg choice="opt"><option>-c</option> <replaceable>file</replaceable></arg> + </cmdsynopsis> + </refsynopsisdiv> + <refsection id="rs-ha_logd-description"> + <title>Description</title> + <para><command>ha_logd</command> is a logging daemon for + Linux-HA. It receives messages from a local domain socket + <filename>@HA_LOGDAEMON_IPC@</filename>, and writes them to + appropriate files and syslog if enabled. 
The reason for utilizing + this logging daemon is that occasionally Heartbeat suffers from + disk I/O delays. By sending log messages to a logging daemon, + heartbeat can avoid such I/O delays.</para> + </refsection> + <refsection id="rs-ha_logd-options"> + <title>Options</title> + <para>The following options are supported:</para> + <variablelist> + <varlistentry> + <term> + <option>-s</option> + </term> + <listitem> + <para>Show <command>ha_logd</command> status (either + <token>running</token> or <token>stopped</token>)</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-k</option> + </term> + <listitem> + <para>Stop (kill) the daemon</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-d</option> + </term> + <listitem> + <para>Daemonize (without this option, + <command>ha_logd</command> will run in the + foreground)</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-h</option> + </term> + <listitem> + <para>Show a brief usage message</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-c</option> <replaceable>file</replaceable> + </term> + <listitem> + <para>Configuration file. You may configure a regular log + file, debug log file, log facility, and entity. 
For details, + see the example <filename>ha_logd.cf</filename> file found + in the documentation.</para> + </listitem> + </varlistentry> + </variablelist> + </refsection> + <refsection id="rs-ha_logd-files"> + <title>Files</title> + <itemizedlist> + <listitem> + <para><filename>@GLUE_STATE_DIR@/ha_logd.pid</filename> – PID file</para> + </listitem> + <listitem> + <para><filename>ha_logd.cf</filename> – example configuration file</para> + </listitem> + </itemizedlist> + </refsection> + <refsection id="rs-ha_logd-seealso"> + <title>See also</title> + <para> + <citerefentry><refentrytitle>heartbeat</refentrytitle><manvolnum>8</manvolnum></citerefentry>, + <citerefentry><refentrytitle>ha_logger</refentrytitle><manvolnum>1</manvolnum></citerefentry> + </para> + </refsection> +</refentry> diff --git a/doc/ha_logger.xml.in b/doc/ha_logger.xml.in new file mode 100644 index 0000000..dce7fe2 --- /dev/null +++ b/doc/ha_logger.xml.in @@ -0,0 +1,110 @@ +<?xml version="1.0"?> +<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN" "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"> +<refentry id="re-ha_logger"> + <refentryinfo> + <date>December 8, 2009</date> + <productname>@PACKAGE_NAME@</productname> + <productnumber>@VERSION@</productnumber> + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Robertson</surname> + <contrib>ha_logd</contrib> + <email>alanr@unix.sh</email> + </author> + <author> + <surname>Shi</surname> + <firstname>Guochun</firstname> + <contrib>ha_logd</contrib> + <email>gshi@ncsa.uiuc.edu</email> + </author> + <author> + <surname>Lars</surname> + <firstname>Marowsky-Bree</firstname> + <contrib>ha_logd</contrib> + <email>lmb@suse.de</email> + </author> + <author> + <firstname>Florian</firstname> + <surname>Haas</surname> + <contrib>man page</contrib> + <email>florian.haas@linbit.com</email> + </author> + </authorgroup> + </refentryinfo> + <refmeta> + <refentrytitle>ha_logger</refentrytitle> + <manvolnum>1</manvolnum> + 
<refmiscinfo class="manual">User commands</refmiscinfo> + </refmeta> + <refnamediv> + <refname>ha_logger</refname> + <refpurpose>Log a message to files and/or syslog through the HA + Logging Daemon</refpurpose> + </refnamediv> + <refsynopsisdiv> + <cmdsynopsis> + <command>ha_logger</command> + <arg choice="opt"> + <option>-D</option> + <group choice="plain"> + <arg>ha-log</arg> + <arg>ha-debug</arg> + </group> + </arg> + <arg choice="opt"> + <option>-t</option> + <replaceable>tag</replaceable> + </arg> + <arg choice="plain" rep="repeat"> + <replaceable>message</replaceable> + </arg> + </cmdsynopsis> + </refsynopsisdiv> + <refsection id="rs-ha_logger-description"> + <title>Description</title> + <para><command>ha_logger</command> is used to log a message to + files/syslog through the HA Logging Daemon.</para> + </refsection> + <refsection id="rs-ha_logger-options"> + <title>Options</title> + <para>The following options are supported:</para> + <variablelist> + <varlistentry> + <term> + <option>-D</option> <token>ha-log</token>|<token>ha-debug</token> + </term> + <listitem> + <para>Log the message to different + files. 
<token>ha-log</token> will log the message to the log + file and the debug file, while <token>ha-debug</token> will + log the message to the debug file only.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-t</option> <replaceable>tag</replaceable> + </term> + <listitem> + <para>Mark every line in the log with the specified + <replaceable>tag</replaceable>.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <replaceable>message</replaceable> + </term> + <listitem> + <para>The message that should be logged.</para> + </listitem> + </varlistentry> + </variablelist> + </refsection> + <refsection id="rs-ha_logger-seealso"> + <title>See also</title> + <para> + <citerefentry><refentrytitle>heartbeat</refentrytitle><manvolnum>8</manvolnum></citerefentry>, + <citerefentry><refentrytitle>ha_logd</refentrytitle><manvolnum>8</manvolnum></citerefentry> + </para> + </refsection> +</refentry> diff --git a/doc/hb_report.8.txt b/doc/hb_report.8.txt new file mode 100644 index 0000000..5efbc32 --- /dev/null +++ b/doc/hb_report.8.txt @@ -0,0 +1,478 @@ +:man source: hb_report +:man version: 1.2 +:man manual: Pacemaker documentation + +hb_report(8) +============ + + +NAME +---- +hb_report - create report for CRM based clusters (Pacemaker) + + +SYNOPSIS +-------- +*hb_report* -f {time|"cts:"testnum} [-t time] [-u user] [-l file] + [-n nodes] [-E files] [-p patt] [-L patt] [-e prog] + [-MSDCZAQVsvhd] [dest] + + +DESCRIPTION +----------- +The hb_report(1) is a utility to collect all information (logs, +configuration files, system information, etc) relevant to +Pacemaker (CRM) over the given period of time. + + +OPTIONS +------- +dest:: + The report name. It can also contain a path where to put the + report tarball. If left out, the tarball is created in the + current directory named "hb_report-current_date", for instance + hb_report-Wed-03-Mar-2010. + +*-d*:: + Don't create the compressed tar, but leave the result in a + directory. 
+ +*-f* { time | "cts:"testnum }:: + The start time from which to collect logs. The time is in the + format as used by the Date::Parse perl module. For cts tests, + specify the "cts:" string followed by the test number. This + option is required. + +*-t* time:: + The end time to which to collect logs. Defaults to now. + +*-n* nodes:: + A list of space separated hostnames (cluster members). + hb_report may try to find out the set of nodes by itself, but + if it runs on the loghost which, as it is usually the case, + does not belong to the cluster, that may be difficult. Also, + OpenAIS doesn't contain a list of nodes and if Pacemaker is + not running, there is no way to find it out automatically. + This option is cumulative (i.e. use -n "a b" or -n a -n b). + +*-l* file:: + Log file location. If, for whatever reason, hb_report cannot + find the log files, you can specify its absolute path. + +*-E* files:: + Extra log files to collect. This option is cumulative. By + default, /var/log/messages are collected along with the + cluster logs. + +*-M*:: + Don't collect extra log files, but only the file containing + messages from the cluster subsystems. + +*-L* patt:: + A list of regular expressions to match in log files for + analysis. This option is additive (default: "CRIT: ERROR:"). + +*-p* patt:: + Additional patterns to match parameter name which contain + sensitive information. This option is additive (default: "passw.*"). + +*-Q*:: + Quick run. Gathering some system information can be expensive. + With this option, such operations are skipped and thus + information collecting sped up. The operations considered + I/O or CPU intensive: verifying installed packages content, + sanitizing files for sensitive information, and producing dot + files from PE inputs. + +*-A*:: + This is an OpenAIS cluster. hb_report has some heuristics to + find the cluster stack, but that is not always reliable. + By default, hb_report assumes that it is run on a Heartbeat + cluster. 
+ +*-u* user:: + The ssh user. hb_report will try to login to other nodes + without specifying a user, then as "root", and finally as + "hacluster". If you have another user for administration over + ssh, please use this option. + +*-X* ssh-options:: + Extra ssh options. These will be added to every ssh + invocation. Alternatively, use `$HOME/.ssh/config` to setup + desired ssh connection options. + +*-S*:: + Single node operation. Run hb_report only on this node and + don't try to start slave collectors on other members of the + cluster. Under normal circumstances this option is not + needed. Use if ssh(1) does not work to other nodes. + +*-Z*:: + If the destination directory exist, remove it instead of + exiting (this is default for CTS). + +*-V*:: + Print the version including the last repository changeset. + +*-v*:: + Increase verbosity. Normally used to debug unexpected + behaviour. + +*-h*:: + Show usage and some examples. + +*-D* (obsolete):: + Don't invoke editor to fill the description text file. + +*-e* prog (obsolete):: + Your favourite text editor. Defaults to $EDITOR, vim, vi, + emacs, or nano, whichever is found first. + +*-C* (obsolete):: + Remove the destination directory once the report has been put + in a tarball. + +EXAMPLES +-------- +Last night during the backup there were several warnings +encountered (logserver is the log host): + + logserver# hb_report -f 3:00 -t 4:00 -n "node1 node2" report + +collects everything from all nodes from 3am to 4am last night. +The files are compressed to a tarball report.tar.bz2. 
+ +Just found a problem during testing: + + # note the current time + node1# date + Fri Sep 11 18:51:40 CEST 2009 + node1# /etc/init.d/heartbeat start + node1# nasty-command-that-breaks-things + node1# sleep 120 #wait for the cluster to settle + node1# hb_report -f 18:51 hb1 + + # if hb_report can't figure out that this is corosync + node1# hb_report -f 18:51 -A hb1 + + # if hb_report can't figure out the cluster members + node1# hb_report -f 18:51 -n "node1 node2" hb1 + +The files are compressed to a tarball hb1.tar.bz2. + +INTERPRETING RESULTS +-------------------- +The compressed tar archive is the final product of hb_report. +This is one example of its content, for a CTS test case on a +three node OpenAIS cluster: + + $ ls -RF 001-Restart + + 001-Restart: + analysis.txt events.txt logd.cf s390vm13/ s390vm16/ + description.txt ha-log.txt openais.conf s390vm14/ + + 001-Restart/s390vm13: + STOPPED crm_verify.txt hb_uuid.txt openais.conf@ sysinfo.txt + cib.txt dlm_dump.txt logd.cf@ pengine/ sysstats.txt + cib.xml events.txt messages permissions.txt + + 001-Restart/s390vm13/pengine: + pe-input-738.bz2 pe-input-740.bz2 pe-warn-450.bz2 + pe-input-739.bz2 pe-warn-449.bz2 pe-warn-451.bz2 + + 001-Restart/s390vm14: + STOPPED crm_verify.txt hb_uuid.txt openais.conf@ sysstats.txt + cib.txt dlm_dump.txt logd.cf@ permissions.txt + cib.xml events.txt messages sysinfo.txt + + 001-Restart/s390vm16: + STOPPED crm_verify.txt hb_uuid.txt messages sysinfo.txt + cib.txt dlm_dump.txt hostcache openais.conf@ sysstats.txt + cib.xml events.txt logd.cf@ permissions.txt + +The top directory contains information which pertains to the +cluster or event as a whole. Files with exactly the same content +on all nodes will also be at the top, with per-node links created +(as it is in this example the case with openais.conf and logd.cf). + +The cluster log files are named ha-log.txt regardless of the +actual log file name on the system. 
If it is found on the +loghost, then it is placed in the top directory. If not, the top +directory ha-log.txt contains all nodes logs merged and sorted by +time. Files named messages are excerpts of /var/log/messages from +nodes. + +Most files are copied verbatim or they contain output of a +command. For instance, cib.xml is a copy of the CIB found in +/var/lib/heartbeat/crm/cib.xml. crm_verify.txt is output of the +crm_verify(8) program. + +Some files are result of a more involved processing: + + *analysis.txt*:: + A set of log messages matching user defined patterns (may be + provided with the -L option). + + *events.txt*:: + A set of log messages matching event patterns. It should + provide information about major cluster motions without + unnecessary details. These patterns are devised by the + cluster experts. Currently, the patterns cover membership + and quorum changes, resource starts and stops, fencing + (stonith) actions, and cluster starts and stops. events.txt + is always generated for each node. In case the central + cluster log was found, also combined for all nodes. + + *permissions.txt*:: + One of the more common problem causes are file and directory + permissions. hb_report looks for a set of predefined + directories and checks their permissions. Any issues are + reported here. + + *backtraces.txt*:: + gdb generated backtrace information for cores dumped + within the specified period. + + *sysinfo.txt*:: + Various release information about the platform, kernel, + operating system, packages, and anything else deemed to be + relevant. The static part of the system. + + *sysstats.txt*:: + Output of various system commands such as ps(1), uptime(1), + netstat(8), and ifconfig(8). The dynamic part of the system. + +description.txt should contain a user supplied description of the +problem, but since it is very seldom used, it will be dropped +from the future releases. 
+ +PREREQUISITES +------------- + +ssh:: + It is not strictly required, but you won't regret having a + password-less ssh. It is not too difficult to setup and will save + you a lot of time. If you can't have it, for example because your + security policy does not allow such a thing, or you just prefer + menial work, then you will have to resort to the semi-manual + semi-automated report generation. See below for instructions. + + + If you need to supply a password for your passphrase/login, then + always use the `-u` option. + + + For extra ssh(1) options, if you're too lazy to setup + $HOME/.ssh/config, use the `-X` option. Do not forget to put + the options in quotes. + +sudo:: + If the ssh user (as specified with the `-u` option) is other + than `root`, then `hb_report` uses `sudo` to collect the + information which is readable only by the `root` user. In that + case it is required to setup the `sudoers` file properly. The + user (or group to which the user belongs) should have the + following line: + + + <user> ALL = NOPASSWD: /usr/sbin/hb_report + + + See the `sudoers(5)` man page for more details. + +Times:: + In order to find files and messages in the given period and to + parse the `-f` and `-t` options, `hb_report` uses perl and one of the + `Date::Parse` or `Date::Manip` perl modules. Note that you need + only one of these. Furthermore, on nodes which have no logs and + where you don't run `hb_report` directly, no date parsing is + necessary. In other words, if you run this on a loghost then you + don't need these perl modules on the cluster nodes. + + + On rpm based distributions, you can find `Date::Parse` in + `perl-TimeDate` and on Debian and its derivatives in + `libtimedate-perl`. + +Core dumps:: + To backtrace core dumps gdb is needed and the packages with + the debugging info. The debug info packages may be installed + at the time the report is created. Let's hope that you will + need this really seldom. 
+ +TIMES +----- + +Specifying times can at times be a nuisance. That is why we have +chosen to use one of the perl modules--they do allow certain +freedom when talking dates. You can either read the instructions +at the +http://search.cpan.org/dist/TimeDate/lib/Date/Parse.pm#EXAMPLE_DATES[Date::Parse +examples page]. +or just rely on common sense and try stuff like: + + 3:00 (today at 3am) + 15:00 (today at 3pm) + 2007/9/1 2pm (September 1st at 2pm) + Tue Sep 15 20:46:27 CEST 2009 (September 15th etc) + +`hb_report` will (probably) complain if it can't figure out what do +you mean. + +Try to delimit the event as close as possible in order to reduce +the size of the report, but still leaving a minute or two around +for good measure. + +`-f` is not optional. And don't forget to quote dates when they +contain spaces. + + +Should I send all this to the rest of Internet? +----------------------------------------------- + +By default, the sensitive data in CIB and PE files is not mangled +by `hb_report` because that makes PE input files mostly useless. +If you still have no other option but to send the report to a +public mailing list and do not want the sensitive data to be +included, use the `-s` option. Without this option, `hb_report` +will issue a warning if it finds information which should not be +exposed. By default, parameters matching 'passw.*' are considered +sensitive. Use the `-p` option to specify additional regular +expressions to match variable names which may contain information +you don't want to leak. For example: + + # hb_report -f 18:00 -p "user.*" -p "secret.*" /var/tmp/report + +Heartbeat's ha.cf is always sanitized. Logs and other files are +not filtered. + +LOGS +---- + +It may be tricky to find syslog logs. The scheme used is to log a +unique message on all nodes and then look it up in the usual +syslog locations. This procedure is not foolproof, in particular +if the syslog files are in a non-standard directory. 
We look in +/var/log /var/logs /var/syslog /var/adm /var/log/ha +/var/log/cluster. In case we can't find the logs, please supply +their location: + + # hb_report -f 5pm -l /var/log/cluster1/ha-log -S /tmp/report_node1 + +If you have different log locations on different nodes, well, +perhaps you'd like to make them the same and make life easier for +everybody. + +Files starting with "ha-" are preferred. In case syslog sends +messages to more than one file, if one of them is named ha-log or +ha-debug those will be favoured over syslog or messages. + +hb_report supports also archived logs in case the period +specified extends that far in the past. The archives must reside +in the same directory as the current log and their names must +be prefixed with the name of the current log (syslog-1.gz or +messages-20090105.bz2). + +If there is no separate log for the cluster, possibly unrelated +messages from other programs are included. We don't filter logs, +but just pick a segment for the period you specified. + +MANUAL REPORT COLLECTION +------------------------ + +So, your ssh doesn't work. In that case, you will have to run +this procedure on all nodes. Use `-S` so that `hb_report` doesn't +bother with ssh: + + # hb_report -f 5:20pm -t 5:30pm -S /tmp/report_node1 + +If you also have a log host which is not in the cluster, then +you'll have to copy the log to one of the nodes and tell us where +it is: + + # hb_report -f 5:20pm -t 5:30pm -l /var/tmp/ha-log -S /tmp/report_node1 + +OPERATION +--------- +hb_report collects files and other information in a fairly +straightforward way. The most complex tasks are discovering the +log file locations (if syslog is used which is the most common +case) and coordinating the operation on multiple nodes. + +The instance of hb_report running on the host where it was +invoked is the master instance. Instances running on other nodes +are slave instances. The master instance communicates with slave +instances by ssh. 
There are multiple ssh invocations per run, so +it is essential that the ssh works without password, i.e. with +the public key authentication and authorized_keys. + +The operation consists of three phases. Each phase must finish +on all nodes before the next one can commence. The first phase +consists of logging unique messages through syslog on all nodes. +This is the shortest of all phases. + +The second phase is the most involved. During this phase all +local information is collected, which includes: + +- logs (both current and archived if the start time is far in the past) +- various configuration files (corosync, heartbeat, logd) +- the CIB (both as xml and as represented by the crm shell) +- pengine inputs (if this node was the DC at any point in + time over the given period) +- system information and status +- package information and status +- dlm lock information +- backtraces (if there were core dumps) + +The third phase is collecting information from all nodes and +analyzing it. The analyzis consists of the following tasks: + +- identify files equal on all nodes which may then be moved to + the top directory +- save log messages matching user defined patterns + (defaults to ERRORs and CRITical conditions) +- report if there were coredumps and by whom +- report crm_verify(8) results +- save log messages matching major events to events.txt +- in case logging is configured without loghost, node logs and + events files are combined using a perl utility + + +BUGS +---- +Finding logs may at times be extremely difficult, depending on +how weird the syslog configuration. It would be nice to ask +syslog-ng developers to provide a way to find out the log +destination based on facility and priority. + +If you think you found a bug, please rerun with the -v option and +attach the output to bugzilla. + +hb_report can function in a satisfactory way only if ssh works to +all nodes using authorized_keys (without password). + +There are way too many options. 
+ + +AUTHOR +------ +Written by Dejan Muhamedagic, <dejan@suse.de> + + +RESOURCES +--------- +Pacemaker: <http://clusterlabs.org/> + +Heartbeat and other Linux HA resources: <http://linux-ha.org/wiki> + +OpenAIS: <http://www.openais.org/> + +Corosync: <http://www.corosync.org/> + + +SEE ALSO +-------- +Date::Parse(3) + + +COPYING +------- +Copyright \(C) 2007-2009 Dejan Muhamedagic. Free use of this +software is granted under the terms of the GNU General Public License (GPL). + diff --git a/doc/meatclient.xml.in b/doc/meatclient.xml.in new file mode 100644 index 0000000..778a57c --- /dev/null +++ b/doc/meatclient.xml.in @@ -0,0 +1,77 @@ +<?xml version="1.0"?> +<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN" "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"> +<refentry id="re-meatclient"> + <refentryinfo> + <date>December 4, 2009</date> + <productname>Cluster Glue</productname> + <productnumber>@VERSION@</productnumber> + <authorgroup> + <author> + <firstname>Gregor</firstname> + <surname>Binder</surname> + <contrib>meatclient</contrib> + <email>gbinder@sysfive.com</email> + </author> + <author> + <firstname>Michael</firstname> + <surname>Mörz</surname> + <contrib>man page</contrib> + <email>mimem@debian.org</email> + </author> + <author> + <firstname>Simon</firstname> + <surname>Horman</surname> + <contrib>man page</contrib> + <email>horms@vergenet.net</email> + </author> + <author> + <firstname>Florian</firstname> + <surname>Haas</surname> + <contrib>man page</contrib> + <email>florian.haas@linbit.com</email> + </author> + </authorgroup> + </refentryinfo> + <refmeta> + <refentrytitle>meatclient</refentrytitle> + <manvolnum>8</manvolnum> + <refmiscinfo class="manual">System administration utilities</refmiscinfo> + </refmeta> + <refnamediv> + <refname>meatclient</refname> + <refpurpose>Manually confirm that a node has been removed from the + cluster</refpurpose> + </refnamediv> + <refsynopsisdiv> + <para><command>meatclient</command> 
<option>-c</option> <replaceable>nodename</replaceable></para> + </refsynopsisdiv> + <refsection id="rs-meatclient-description"> + <title>Description</title> + <para><command>meatclient</command> confirms that a node has been + manually removed from the cluster. It instructs the cluster + manager, via the meatware STONITH plugin, that it is safe to + continue cluster operations.</para> + </refsection> + <refsection id="rs-meatclient-options"> + <title>Options</title> + <para>The following options are supported:</para> + <variablelist> + <varlistentry> + <term> + <option>-c</option> <replaceable>nodename</replaceable> + </term> + <listitem> + <para><replaceable>nodename</replaceable> is the name of the + cluster node that has been fenced.</para> + </listitem> + </varlistentry> + </variablelist> + </refsection> + <refsection id="rs-meatclient-seealso"> + <title>See also</title> + <para> + <citerefentry><refentrytitle>heartbeat</refentrytitle><manvolnum>8</manvolnum></citerefentry>, + <citerefentry><refentrytitle>stonith</refentrytitle><manvolnum>8</manvolnum></citerefentry> + </para> + </refsection> +</refentry> diff --git a/doc/stonith.xml.in b/doc/stonith.xml.in new file mode 100644 index 0000000..575c339 --- /dev/null +++ b/doc/stonith.xml.in @@ -0,0 +1,315 @@ +<?xml version="1.0"?> +<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN" "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"> +<refentry id="re-stonith"> + <refentryinfo> + <date>December 7, 2009</date> + <productname>@PACKAGE_NAME@</productname> + <productnumber>@VERSION@</productnumber> + <authorgroup> + <author> + <firstname>Alan</firstname> + <surname>Robertson</surname> + <contrib>stonith</contrib> + <email>alanr@unix.sh</email> + </author> + <author> + <firstname>Simon</firstname> + <surname>Horman</surname> + <contrib>man page</contrib> + <email>horms@vergenet.net</email> + </author> + <author> + <firstname>Florian</firstname> + <surname>Haas</surname> + <contrib>man page</contrib> 
+ <email>florian.haas@linbit.com</email> + </author> + </authorgroup> + </refentryinfo> + <refmeta> + <refentrytitle>stonith</refentrytitle> + <manvolnum>8</manvolnum> + <refmiscinfo class="manual">System administration utilities</refmiscinfo> + </refmeta> + <refnamediv> + <refname>stonith</refname> + <refpurpose>extensible interface for remotely powering down a node + in the cluster</refpurpose> + </refnamediv> + <refsynopsisdiv> + <cmdsynopsis> + <command>stonith</command> + <arg choice="plain"><option>-h</option></arg> + </cmdsynopsis> + <cmdsynopsis> + <command>stonith</command> + <arg choice="opt"><option>-s</option></arg> + <arg choice="opt"><option>-h</option></arg> + <arg choice="plain"><option>-L</option></arg> + </cmdsynopsis> + <cmdsynopsis> + <command>stonith</command> + <arg choice="opt"><option>-s</option></arg> + <arg choice="opt"><option>-h</option></arg> + <arg choice="plain"><option>-t</option> <replaceable>stonith-device-type</replaceable></arg> + <arg choice="plain"><option>-n</option></arg> + </cmdsynopsis> + <cmdsynopsis> + <command>stonith</command> + <arg choice="opt"><option>-s</option></arg> + <arg choice="opt"><option>-h</option></arg> + <arg choice="plain"><option>-t</option> <replaceable>stonith-device-type</replaceable></arg> + <group choice="req" rep="norepeat"> + <group choice="plain" rep="repeat"> + <arg choice="plain"><replaceable>name</replaceable>=<replaceable>value</replaceable></arg> + </group> + <arg choice="plain"><option>-p</option> <replaceable>stonith-device-parameters</replaceable></arg> + <arg choice="plain"><option>-F</option> <replaceable>stonith-device-parameters-file</replaceable></arg> + </group> + <arg choice="opt"><option>-c</option> <replaceable>count</replaceable></arg> + <arg choice="opt"><option>-l</option></arg> + <arg choice="opt"><option>-S</option></arg> + </cmdsynopsis> + <cmdsynopsis> + <command>stonith</command> + <arg choice="opt"><option>-s</option></arg> + <arg choice="opt"><option>-h</option></arg> 
+ <arg choice="plain"><option>-t</option> <replaceable>stonith-device-type</replaceable></arg> + <group choice="req" rep="norepeat"> + <group choice="plain" rep="repeat"> + <arg choice="plain"><replaceable>name</replaceable>=<replaceable>value</replaceable></arg> + </group> + <arg choice="plain"><option>-p</option> <replaceable>stonith-device-parameters</replaceable></arg> + <arg choice="plain"><option>-F</option> <replaceable>stonith-device-parameters-file</replaceable></arg> + </group> + <arg choice="opt"><option>-c</option> <replaceable>count</replaceable></arg> + <arg choice="opt"><option>-T</option> + <group choice="req"> + <arg choice="plain">reset</arg> + <arg choice="plain">on</arg> + <arg choice="plain">off</arg> + </group> + </arg> + <arg><replaceable>nodename</replaceable></arg> + </cmdsynopsis> + </refsynopsisdiv> + <refsection id="rs-stonith-description"> + <title>Description</title> + <para>The STONITH module provides an extensible interface for + remotely powering down a node in the cluster (STONITH = Shoot The + Other Node In The Head). The idea is quite simple: when the + software running on one machine wants to make sure another machine + in the cluster is not using a resource, pull the plug on the other + machine. It's simple and reliable, albeit admittedly + brutal.</para> + </refsection> + <refsection id="rs-stonith-options"> + <title>Options</title> + <para>The following options are supported:</para> + <variablelist> + <varlistentry> + <term> + <option>-c</option> <replaceable>count</replaceable> + </term> + <listitem> + <para>Perform any actions identified by the + <option>-l</option>, <option>-S</option> and + <option>-T</option> options <replaceable>count</replaceable> + times.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-F</option> <replaceable>stonith-device-parameters-file</replaceable> + </term> + <listitem> + <para>Path of file specifying parameters for a stonith + device. 
To determine the syntax of the parameters file for a + given device type run:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t stonith-device-type -n</userinput></screen> + <para>All of the listed parameters need to appear in order + on a single line in the parameters file and be delimited by + whitespace.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-h</option> + </term> + <listitem> + <para>Display detailed information about a stonith device + including description, configuration information, parameters + and any other related information. When specified without a + stonith-device-type, detailed information on all stonith + devices is displayed.</para> + <para>If you don't yet own a stonith device and want to know + more about the ones we support, this information is likely + to be helpful.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-L</option> + </term> + <listitem> + <para>List the valid stonith device types, suitable for + passing as an argument to the <option>-t</option> + option.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-l</option> + </term> + <listitem> + <para>List the hosts controlled by the stonith device.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-n</option> + </term> + <listitem> + <para>Output the parameter names of the stonith device.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <replaceable>name</replaceable>=<replaceable>value</replaceable> + </term> + <listitem> + <para>Parameter, in the form of a name/value pair, to pass + directly to the stonith device. 
To determine the syntax of + the parameters for a given device type run:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t stonith-device-type -n</userinput></screen> + <para>All of the listed parameter names need to be passed + with their corresponding values.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-p</option> <replaceable>stonith-device-parameters</replaceable> + </term> + <listitem> + <para>Parameters to pass directly to the stonith device. To + determine the syntax of the parameters for a given device + type run:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t stonith-device-type -n</userinput></screen> + <para>All of the listed parameter names need to appear in + order and be delimited by whitespace.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-S</option> + </term> + <listitem> + <para>Show the status of the stonith device.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-s</option> + </term> + <listitem> + <para>Silent operation. Suppress logging of error messages to standard error.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-T</option> <replaceable>action</replaceable> + </term> + <listitem> + <para>The stonith action to perform on the node identified + by nodename. Chosen from <token>reset</token>, + <token>on</token>, and <token>off</token>.</para> + <note> + <para>If a nodename is specified without the + <option>-T</option> option, the stonith action defaults to + <token>reset</token>.</para> + </note> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-t</option> <replaceable>stonith-device-type</replaceable> + </term> + <listitem> + <para>The type of the stonith device to be used to effect + stonith. 
A list of supported devices for an installation may + be obtained using the <option>-L</option> option.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-v</option> + </term> + <listitem> + <para>Ignored.</para> + </listitem> + </varlistentry> + </variablelist> + </refsection> + <refsection id="rs-stonith-examples"> + <title>Examples</title> + <para>To determine which stonith devices are available on your installation, use the <option>-L</option> option:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -L</userinput></screen> + <para>All of the supported devices will be displayed one per line. + Choose one from this list that is best for your environment - + let's use <code>wti_nps</code> for the rest of this example. To get detailed + information about this device, use the <option>-h</option> option:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps -h</userinput></screen> + <para>Included in the output is the list of valid parameter names + for <code>wti_nps</code>. To get <emphasis>just</emphasis> the + list of valid parameter names, use the <option>-n</option> option + instead:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps -n</userinput></screen> + <para>All of the required parameter names will be displayed one + per line. For <code>wti_nps</code> the output is:</para> + <screen><computeroutput>ipaddr</computeroutput> +<computeroutput>password</computeroutput></screen> + <para>There are three ways to pass these parameters to the device. 
+ The first (and preferred) way is by passing name/value pairs on + the <command>stonith</command> command line:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps ipaddr=my-dev-ip password=my-dev-pw ...</userinput></screen> + <para>The second way, which is maintained only for backward + compatibility with legacy clusters, is passing the values + <emphasis>in order</emphasis> on the <command>stonith</command> + command line with the <option>-p</option> option:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps -p "my-dev-ip my-dev-pw" ...</userinput></screen> + <para>The third way, which is also maintained only for backward + compatibility with legacy clusters, is placing the values <emphasis>in order</emphasis> + on a single line in a config file:</para> + <programlisting>my-dev-ip my-dev-pw</programlisting> + <para>... and passing the name of the file on the stonith command + line with the <option>-F</option> option:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps -F ~/my-wtinps-config ...</userinput></screen> + <para>To make sure you have the configuration set up correctly and + that the device is available for stonith operations, use the + <option>-S</option> option:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps ipaddr=my-dev-ip password=my-dev-pw -S</userinput></screen> + <para>If all is well at this point, you should see something similar to:</para> + <screen><computeroutput>stonith: wti_nps device OK.</computeroutput></screen> + <para>If you don't, some debugging may be necessary to determine + if the config info is correct, the device is powered on, etc. 
The + <option>-d</option> option can come in handy here - you can add it + to any <command>stonith</command> command to cause it to generate + debug output.</para> + <para>To get the list of hosts controlled by the device, use the + <option>-l</option> option:</para> + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps ipaddr=my-dev-ip password=my-dev-pw -l</userinput></screen> + <para>All of the hosts controlled by the device will be displayed one per line. For <code>wti_nps</code> the output could be:</para> + <screen><computeroutput>node1</computeroutput> + <computeroutput>node2</computeroutput> + <computeroutput>node3</computeroutput></screen> + <para>To power off one of these hosts, use the <option>-T</option> option: + <screen><computeroutput># </computeroutput><userinput>stonith -t wti_nps ipaddr=my-dev-ip password=my-dev-pw -T off <replaceable>node</replaceable></userinput></screen></para> + </refsection> + <refsection id="rs-stonith-seealso"> + <title>See also</title> + <para> + <citerefentry><refentrytitle>heartbeat</refentrytitle><manvolnum>8</manvolnum></citerefentry>, + <citerefentry><refentrytitle>meatclient</refentrytitle><manvolnum>8</manvolnum></citerefentry> + </para> + </refsection> +</refentry> diff --git a/doc/stonith/Makefile.am b/doc/stonith/Makefile.am new file mode 100644 index 0000000..4c9b76f --- /dev/null +++ b/doc/stonith/Makefile.am @@ -0,0 +1,37 @@ +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +MAINTAINERCLEANFILES = Makefile.in + +stdocdir = $(docdir)/stonith + +stdoc_DATA = README.bladehpi \ + README.cyclades \ + README.drac3 \ + README.dracmc \ + README.external \ + README.ibmrsa \ + README.ibmrsa-telnet \ + README.meatware \ + README.rackpdu \ + README.rcd_serial \ + README.riloe \ + README.vacm \ + README.wti_mpc \ + README_kdumpcheck.txt \ + README.vcenter + +if IPMILAN_BUILD +stdoc_DATA += README.ipmilan +endif diff --git a/doc/stonith/README.bladehpi b/doc/stonith/README.bladehpi new file mode 100644 index 0000000..3119ef7 --- /dev/null +++ b/doc/stonith/README.bladehpi @@ -0,0 +1,101 @@ + +STONITH module for IBM BladeCenter via OpenHPI +---------------------------------------------- + +Requirements: + Linux-HA bladehpi STONITH plugin requires OpenHPI 2.6+ + OpenHPI requires Net-SNMP 5.0+ + OpenHPI requires BladeCenter Management Module 1.08+ + +This STONITH module talks to IBM BladeCenters via SNMP through use of +the OpenHPI BladeCenter plugin (snmp_bc). For more information about +installing OpenHPI, setting up the BladeCenter SNMP agent, etc. please +visit http://www.openhpi.org/. Once OpenHPI is installed properly, +the STONITH plugin will automatically be built the next time Linux-HA +is built. + +Use the OpenHPI configuration file (i.e. /etc/openhpi/openhpi.conf) +to configure the BladeCenters of interest to STONITH. For example, +the following excerpt: + + plugin libsnmp_bc + + handler libsnmp_bc { + entity_root = "{SYSTEM_CHASSIS,1}" # Required + host = "9.254.253.252" # Required + community = "community" # Version 1 Required. + version = "3" # Required. SNMP protocol version (1|3) + security_name = "userid" # Version 3 Required. + passphrase = "userpass" # Version 3. Required if security_level is authNoPriv or authPriv. 
+ auth_type = "MD5" # Version 3. Passphrase encoding (MD5|SHA) + security_level = "authNoPriv" # Version 3. (noAuthNoPriv|authNoPriv|authPriv) + } + +defines how to access the BladeCenter at 9.254.253.252 using SNMPV3 +with an ID/password of userid/userpass. The entity_root must be +passed to the STONITH bladehpi plugin as its single required parameter. +For example, to query the list of blades present in the BladeCenter +configured above, run: + + stonith -t bladehpi -p "{SYSTEM_CHASSIS,1}" -l + +which is the same as: + + stonith -t bladehpi "entity_root={SYSTEM_CHASSIS,1}" -l + +Use the BladeCenter Management Module web interface to set the Blade +Information to match "uname -n" for each blade in the cluster. For +example, with the BladeCenter configured above use a browser to access +http://9.254.253.252, login with userid/userpass, and then go to +Blade Tasks -> Configuration -> Blade Information, enter the proper +names, and select Save. Be aware that heartbeat must be restarted +before these changes take effect or, if using the OpenHPI daemon, +the daemon must be restarted. + +More than one BladeCenter can be placed in the OpenHPI configuration +file by using different numbers with the entity_root. For example, + + plugin libsnmp_bc + + handler libsnmp_bc { + entity_root = "{SYSTEM_CHASSIS,1}" # Required + host = "9.254.253.252" # Required + : + } + handler libsnmp_bc { + entity_root = "{SYSTEM_CHASSIS,2}" # Required + host = "9.254.253.251" # Required + : + } + +There is an optional parameter, soft_reset, that is true|1 if bladehpi +should use soft reset (power cycle) to reset nodes or false|0 if it +should use hard reset (power off, wait, power on); the default is +false. 
As an example, to override the default value the above stonith +command would become: + + stonith -t bladehpi -p "{SYSTEM_CHASSIS,1} true" -l + +which is the same as: + + stonith -t bladehpi "entity_root={SYSTEM_CHASSIS,1} soft_reset=true" -l + +The difference between the two is that a soft reset is much quicker +but may return before the node has been reset because bladehpi relies +on BladeCenter firmware to cycle the node's power, while a hard reset +is slower but guaranteed not to return until the node is dead because +bladehpi powers off the node, waits until it is off, then powers it +on again. + +NOTE: Set the OPENHPI_CONF environment variable to contain the +fully-qualified path of the OpenHPI configuration file, for example: + + export OPENHPI_CONF=/etc/openhpi/openhpi.conf + +NOTE: If OpenHPI is not configured with --disable-daemon before being +built and installed, make sure that the OpenHPI daemon is running +before using the bladehpi plugin. + +NOTE: If debugging of the environment is needed, configure OpenHPI +with --enable-debuggable and rebuild/reinstall, export +OPENHPI_DEBUG=YES, and run stonith commands with the -d option. diff --git a/doc/stonith/README.cyclades b/doc/stonith/README.cyclades new file mode 100644 index 0000000..3ccf9db --- /dev/null +++ b/doc/stonith/README.cyclades @@ -0,0 +1,61 @@ +STONITH module for Cyclades AlterPath PM +---------------------------------------- + +This STONITH module talks to Cyclades AlterPath PM series of power managers +via TS, ACS or KVM equipment. + +Access to the frontend device (TS, ACS or KVM) is done via root user with +passwordless ssh. + +For that, it is necessary to create a public/private key pair with _empty_ +passphrase on _each_ machine which is part of the cluster. + +Small HOWTO follows: + +# ssh-keygen -t rsa +Generating public/private rsa key pair. +Enter file in which to save the key (/root/.ssh/id_rsa): +Created directory '/home/root/.ssh'. 
+Enter passphrase (empty for no passphrase): +Enter same passphrase again: +Your identification has been saved in /root/.ssh/id_rsa. +Your public key has been saved in /root/.ssh/id_rsa.pub. +The key fingerprint is: +dc:e0:71:55:fd:2a:b0:19:d6:3c:48:e5:45:db:b4:be root@hostname.network + +Next step is to append the public key (/root/.ssh/id_rsa.pub) +to the authorized_keys file on the TS/ACS/KVM box. The authorized +keys file location is set at the SSH daemon configuration file. +The default location is /etc/ssh/authorized_keys, so: + +[root@clusterhost]# scp /root/.ssh/id_rsa.pub root@alterpath:/tmp + +login to the TS/ACS/KVM box normally and append the public key. + +# ssh root@alterpath +Password: .... + +[root@CAS root]# cat /tmp/id_rsa.pub >> /etc/ssh/authorized_keys + +The following entries must be present on /etc/ssh/sshd_config for the +passwordless scheme to work properly: + +RSAAuthentication yes +PubkeyAuthentication yes +AuthorizedKeysFile /etc/ssh/authorized_keys + +Next step is to test if the configuration has been done successfully: + +[root@clusterhost root]# ssh root@alterpath +[root@CAS root]# + +If it logins automatically without asking for a password, then everything +has been done correctly! + +Note that such configuration procedure (including generation of the key pair) +has to be done for each machine in the cluster which intends to use the +AlterPath PM as a STONITH device. + +------ +Any questions please contact Cyclades support at <support@cyclades.com> +or <marcelo.tosatti@cyclades.com> diff --git a/doc/stonith/README.drac3 b/doc/stonith/README.drac3 new file mode 100644 index 0000000..e3c071b --- /dev/null +++ b/doc/stonith/README.drac3 @@ -0,0 +1,18 @@ +Stonith module for Dell DRACIII remote access card +-------------------------------------------------- + +This module uses the Dell DRACIII PCI card as a stonith device. +It sends the XML commands over HTTPS to the DRACIII web server. 
+ +The card firmware must be version 2.0 at least, with support for SSL based +service and many bug fixes over 1.x versions. + +This module uses libcurl, libxml2 (gnome xml libs) and libssl. + +Any hints, bug reports, improvements, etc. will be appreciated. + +--- +Roberto Moreda <moreda@alfa21.com> http://www.alfa21.com +Alfa21 A Coruña (Spain) +UNIX, Linux & TCP/IP Services - High Availability Solutions + diff --git a/doc/stonith/README.dracmc b/doc/stonith/README.dracmc new file mode 100644 index 0000000..761f5ad --- /dev/null +++ b/doc/stonith/README.dracmc @@ -0,0 +1,87 @@ +dracmc-telnet - External stonith plugin for HAv2 (http://linux-ha.org/wiki) + Connects to Dell Drac/MC Blade Enclosure via a Cyclades + terminal server with telnet and switches power of named + blade servers appropriately. + +Description: + Dell offers the Drac/MC in their blade enclosures. The +Drac/MC can be accessed in different ways. One way to interface to it +is to connect the blade enclosure's Drac/MC serial port to a Cyclades +terminal server. You can then access the Drac/MC via telnet via the +Cyclades. Once logged in, you can use 'help' to show all available +commands. With the 'serveraction' command, you can control both +hard and soft resets as well as power to a particular blade. The +blades are named 'Server-X', where 'X' is a number which corresponds +to the blade number in the enclosure. This plugin allows using the +Drac/MC with stonith. It uses python's standard 'telnetlib' library +to log in and issue commands. The code is very similar to the original +ibmrsa-telnet plugin released by Andreas and was quite easy to +modify for this application. + One complication is that the Cyclades only allows one active +connection. If someone or something has a connection active, the +terminal server closes the new attempted connection. 
Since this +situation can be common, for example if trying to stonith two blades +or when the plugin is started by multiple cluster nodes, there is a +built in retry mechanism for login. On 10 retries, the code gives up +and throws. + When running this resource, it is best to not run it as a clone, +rather as a normal, single-instance resource. Make sure to create a +location constraint that excludes the node that is to be fenced. + +Required parameters: + nodename: The name of the server you want to touch on your network + cyclades_ip: The IP address of the cyclades terminal server + cyclades_port: The port for telnet to access on the cyclades (i.e. 7032) + servername: The DRAC/MC server name of the blade (i.e. Server-7) + username: The login user name for the DRAC/MC + password: The login password for the DRAC/MC + +Example configuration + +These are examples: you should adjust parameters, scores and +timeout values to fit your environment. + +crm shell: + + primitive fence_node1 stonith:external/dracmc-telnet \ + nodename=node1 cyclades_ip=10.0.0.1 cyclades_port=7001 \ + servername=Server-1 username=USERID password=PASSWORD \ + op monitor interval="200m" timeout="60s" + location loc-fence_node1 fence_node1 -inf: node1 + +XML: + +<?xml version="1.0" ?> +<cib> + <configuration> + <resources> + <primitive id="r_stonith-node01" class="stonith" type="external/dracmc-telnet" provider="heartbeat" resource_stickiness="0"> + <operations> + <op name="monitor" interval="200m" timeout="60s" prereq="nothing" id="r_stonith-node01-mon"/> + <op name="start" timeout="180" id="r_stonith-node01-start"/> + <op name="stop" timeout="180" id="r_stonith-node01-stop"/> + </operations> + <instance_attributes id="r_stonith-node01"> + <attributes> + <nvpair id="r_stonith-node01-nodename" name="nodename" value="node01"/> + <nvpair id="r_stonith-node01-cyclades_ip" name="cyclades_ip" value="192.168.0.1"/> + <nvpair id="r_stonith-node01-cyclades_port" name="cyclades_port" value="7032"/> + 
<nvpair id="r_stonith-node01-servername" name="servername" value="Server-7"/> + <nvpair id="r_stonith-node01-username" name="username" value="USERID"/> + <nvpair id="r_stonith-node01-password" name="password" value="PASSWORD"/> + <nvpair id="r_stonith-node01-type" name="type" value="dellblade"/> + </attributes> + </instance_attributes> + </primitive> + </resources> + <constraints> + <rsc_location id="r_stonith-node01_prefer_node02" rsc="r_stonith-node01"> + <rule id="r_stonith-node01_prefer_node02_rule" score="50"> + <expression attribute="#uname" id="r_stonith-node01_prefer_node02_expr" operation="eq" value="node02"/> + </rule> + </rsc_location> + </constraints> + + </configuration> +</cib> + diff --git a/doc/stonith/README.external b/doc/stonith/README.external new file mode 100644 index 0000000..a70ccde --- /dev/null +++ b/doc/stonith/README.external @@ -0,0 +1,90 @@ +EXTERNAL module for Linux-HA STONITH + + +This stonith plugin runs an external command written in your favorite +language to shutdown the given host. The external command should return +a zero exit status after a successful shutdown, or non-zero exit status +for a shutdown failure. Failures notifications will be sent to syslog. + +To create your own external plugin, write a script that supports the +following actions: + + reset + on (optional) + off (optional) + gethosts + status + getconfignames + getinfo-devid + getinfo-devname + getinfo-devdescr + getinfo-devurl + getinfo-xml + +and place it in the /usr/lib/stonith/plugins/external directory - the +script must be a regular executable file that is NOT writable by group +or others in order to be recognized as an external plugin. If the +action requires information to be returned, such as the list of hosts +or config names or any of the getinfo calls, simply write the +information to stdout. When complete, return zero to indicate the +action succeeded or non-zero to indicate the action failed. 
You can +use the ssh (sh) and riloe (python) scripts already in that directory +as working examples. + +To make sure that your external plugin is recognized, run "stonith -L" +and look for its name in the output, something along the lines of: + + external/yourplugin + +To configure the plugin on an R1 (legacy) cluster, add a line similar +to the following to /etc/ha.d/ha.cf: + + stonith external/yourplugin /etc/ha.d/yourplugin.cfg + +where /etc/ha.d/yourplugin.cfg contains a single line with all of your +plugin's parameters: + + parm1-value parm2-value ... + +Another way to configure the plugin on a legacy cluster is to add a line +similar to the following to /etc/ha.d/ha.cf instead: + + stonith_host * external/yourplugin parm1-value parm2-value ... + +where all of your plugin's parameters are placed at the end of the line. + +Please note that all parameters come in to the plugin in name/value +(environment variable) form, but in R1 configurations, they appear as a +list of parameters. They are ordered in the config file or on the +stonith_host line according to the ordering specified in the output of +the getconfignames operation. + +To configure the plugin on an R2 cluster, place lines similar to the +following into the <resources> section of your CIB, which is contained +in /var/lib/heartbeat/crm/cib.xml: + + <clone id="DoFencing"> + <instance_attributes> + <nvpair name="clone_max" value="2"/> + <nvpair name="clone_node_max" value="1"/> + </instance_attributes> + <primitive id="child_DoFencing" class="stonith" type="external/yourplugin" provider="heartbeat"> + <operations> + <op name="monitor" interval="5s" timeout="20s" requires="nothing"/> + <op name="start" timeout="20s" requires="nothing"/> + </operations> + <instance_attributes> + <nvpair name="parm1-name" value="parm1-value"/> + <nvpair name="parm2-name" value="parm2-value"/> + <!-- ... 
--> + </instance_attributes> + </primitive> + </clone> + +Whatever <nvpair> parameters specified in the <attributes> section of +the CIB are passed to the script as environment variables. For the +example above, the parameters are passed as parm1-name=parm1-value, +parm2-name=parm2-value and so on. + +Additional information can be found at +http://linux-ha.org/wiki/ExternalStonithPlugins. diff --git a/doc/stonith/README.ibmrsa b/doc/stonith/README.ibmrsa new file mode 100644 index 0000000..b34031b --- /dev/null +++ b/doc/stonith/README.ibmrsa @@ -0,0 +1,9 @@ +See + +ftp://ftp.software.ibm.com/systems/support/system_x_pdf/d3basmst.pdf +ftp://ftp.software.ibm.com/systems/support/system_x_pdf/88p9248.pdf +http://www.redbooks.ibm.com/abstracts/sg246495.html + +for documentation about IBM management processors and the +IBMmpcli utility. + diff --git a/doc/stonith/README.ibmrsa-telnet b/doc/stonith/README.ibmrsa-telnet new file mode 100644 index 0000000..109bdd9 --- /dev/null +++ b/doc/stonith/README.ibmrsa-telnet @@ -0,0 +1,55 @@ +ibmrsa-telnet - External stonith plugin for HAv2 (http://linux-ha.org/wiki) + Connects to IBM RSA Board via telnet and switches power + of server appropriately. + +Description: + + IBM offers Remote Supervisor Adapters II for several + servers. These RSA boards can be accessed in different ways. + One of that is via telnet. Once logged in you can use 'help' to + show all available commands. With 'power' you can reset, power on and + off the controlled server. This command is used in combination + with python's standard library 'telnetlib' to do it automatically. + +Code snippet for cib + + It's useful to give a location preference so that the stonith agent + is run on the/an other node. This is not necessary as one node can kill + itself via RSA Board. But: If this node becomes crazy my experiences + showed that the node is not able to shoot itself anymore properly. 
+ + You have to adjust parameters, scores and timeout values to fit your + HA environment. + +<?xml version="1.0" ?> +<cib> + <configuration> + <resources> + <primitive id="r_stonith-node01" class="stonith" type="external/ibmrsa" provider="heartbeat" resource_stickiness="0"> + <operations> + <op name="monitor" interval="60" timeout="300" prereq="nothing" id="r_stonith-node01-mon"/> + <op name="start" timeout="180" id="r_stonith-node01-start"/> + <op name="stop" timeout="180" id="r_stonith-node01-stop"/> + </operations> + <instance_attributes id="r_stonith-node01"> + <attributes> + <nvpair id="r_stonith-node01-nodename" name="nodename" value="node01"/> + <nvpair id="r_stonith-node01-ipaddr" name="ipaddr" value="192.168.0.1"/> + <nvpair id="r_stonith-node01-userid" name="userid" value="userid"/> + <nvpair id="r_stonith-node01-passwd" name="passwd" value="password"/> + <nvpair id="r_stonith-node01-type" name="type" value="ibm"/> + </attributes> + </instance_attributes> + </primitive> + </resources> + <constraints> + <rsc_location id="r_stonith-node01_not_on_node01" rsc="r_stonith-node01"> + <rule id="r_stonith-node01_not_on_node01_rule" score="-INFINITY"> + <expression attribute="#uname" id="r_stonith-node01_not_on_node01_expr" operation="eq" value="node01"/> + </rule> + </rsc_location> + </constraints> + + </configuration> +</cib> + diff --git a/doc/stonith/README.ipmilan b/doc/stonith/README.ipmilan new file mode 100644 index 0000000..eef86cf --- /dev/null +++ b/doc/stonith/README.ipmilan @@ -0,0 +1,131 @@ + IPMILAN STONITH Module + Copyright (c) 2003 Intel Corp. + yixiong.zou@intel.com + +1. Intro + +IPMILAN STONITH module works by sending a node an IPMI message, in particular, +a 'chassis control' command. Currently the message is sent over the LAN. + +2. Hardware Requirement + +In order to use this module, the node has to be IPMI v1.5 compliant and +also supports IPMI over LAN. For example, the Intel Langley platform. 
+ +Note: IPMI over LAN is an optional feature defined by IPMI v1.5 spec. +So even if a system is IPMI compliant/compatible, it might still not +support IPMI over LAN. If you are sure this is your case and you still +want to try this plugin, read section 6, IPMI v1.5 without IPMI over +LAN Support. + +3. Software Requirement + +This module needs OpenIPMI (http://openipmi.sf.net) to compile. +Version 1.4.x or 2.0.x is supported. + +4. Hardware Configuration + +How to configure the node so it accepts IPMI lan packets is beyond the +scope of this document. Consult your product manual for this. + +5. STONITH Configuration + +Each node in the cluster has to be configured individually. So normally there +would be at least two entries, unless you want to use a different STONITH +device for the other nodes in the cluster. ;) + +The configuration file syntax looks like this: + + <node1> <ip> <port> <auth> <priv> <user> <pass> <reset_method> + <node2> <ip> <port> <auth> <priv> <user> <pass> <reset_method> + ... + + node: the hostname. + + ip: the IP address of the node. If a node has more than one IP address, + this is the IP address of the interface which accepts IPMI messages. :) + + port: the port number to send the IPMI message to. The default is 623. + But it could be different or even configurable. + + auth: the authorization type of the IPMI session. Valid choices are + "none", "straight", "md2", and "md5". + + priv: the privilege level of the user. Valid choices are "operator" + or "admin". These are the privilege levels required to run the + 'chassis control' command. + + user: the username. use "" if it is empty. Cannot exceed 16 characters. + + pass: the password. use "" if it is empty. Cannot exceed 16 characters. + + reset_method: (optional) which IPMI chassis control to send + to reset the host. Possible values are power_cycle (default) + and hard_reset. + +Each line is white-space delimited and lines beginning with '#' are ignored. + +6. 
IPMI v1.5 without IPMI over LAN Support + +If somehow your computer has a BMC but without LAN support, you might +still be able to use this module. + + 0) Make sure OpenIPMI is installed. OpenIPMI 1.0.3 should work. + + 1) Create a /etc/ipmi_lan.conf file. + + Here's a sample of how this file should look like + + addr 172.16.1.249 999 + PEF_alerting on + per_msg_auth off + priv_limit admin + allowed_auths_admin none md2 md5 + user 20 on "" "" admin 5 md2 md5 none + + If you do not understand what each line means, do a man on ipmilan. + + 2) run ipmilan as root. + + 3) Try sending yourself an IPMI packet over the network using ipmicmd to see + if it works. + + ipmicmd -k "0f 00 06 01" lan 172.16.1.249 999 none admin "" "" + + The result should be something like: + + Connection 0 to the BMC is up0f 07 00 01 00 01 80 01 19 01 8f 77 00 00 4b 02 + + 4) Configure your system so every time it boots up, the ipmi device + drivers are all loaded and ipmilan is run. This is all OS dependent + so I can't tell you what to do. + + The major drawback of this is that you will not be able to power it up + once it's powered down, which for a real IPMI, you could. + + +7. Bugs + +Some IPMI devices do not return 0x0, success, to the host who issued the reset +command. A timeout, 0xc3, could be returned instead. So I am counting that +also as a "successful reset". + +Note: This behavior is not fully IPMI v1.5 compliant. Based on the IPMI v1.5 +spec, the IPMI device should return the appropriate return code. And it is +even allowed to return the appropriate return code before performing the +action. + + +8. TODO + +1) Right now the timeout on each host is hard coded to be 10 seconds. It will + be nice to be able to set this value for individual host. + +2) A better way of detecting the success of the reset operation will be good. A + lot of times the host which carried out the reset does not return a success. 
+ +3) The os_handler should be contributed back to the OpenIPMI project so that + we do not need to maintain it here. It does not make sense for every little + app like this to write its own os_handler. A generic one like in this + program should be sufficient. + diff --git a/doc/stonith/README.ippower9258 b/doc/stonith/README.ippower9258 new file mode 100644 index 0000000..6873efd --- /dev/null +++ b/doc/stonith/README.ippower9258 @@ -0,0 +1,68 @@ +IP Power 9258 as external stonith device. +========================================= + +Device Information +================== + + Warning: + ======== + + Aviosys provides different types and versions of IP Power 9258. + The device is currently available with four or eight power outlets. + This script was tested with firmware version: V1.55 2009/12/22 + + Especially "IP Power 9258 HP" uses a different http command interface. + ====================================================================== + + Resources for device documentation: + + Manufacturer URL: http://www.aviosys.com/ippower9258.htm + Manual URL: http://www.aviosys.com/manual.htm + Manual current version URL: + http://www.aviosys.com/images/9258_manual_20081104.pdf + +The documentation of the http command interface defines three +supported commands: + + GetPower - useful for testing status of the device and of each port + SetPower - used to control status of each power outlet + SetSchedule+Power - useless for stonith + +Common documented structure of these three commands is + + http://username:password@a.b.c.d/Set.cmd?CMD=command[+param=value...] + param is one or more of P60 to P67 and value is 0 or 1 + expected response for GetPower is of the format + <html>P60=1,P61=0,P62=1,P63=1,P64=0,P65=0,P66=0,P67=0</html> + SetPower does respond with the same format but restricts the list + to the modified ports. + P60 to P67 represent the status of the power outlet 1 to 8: 0 <=> + power off; 1 <=> power on. 
 +
+IP Power 9258 allows you to assign port names (pw1Name to pw8Name) to each
+port. These names can be used with the web interface (web form with
+post-method).
+
+Script specific notes
+=====================
+
+There is no documented http command to retrieve port names via the
+http command interface. We try to get the hostlist via the web
+interface.
+
+This script assumes a one-to-one mapping between names of hostlist and
+port attributes of power outlet:
+
+	1st hostname in hostlist connected to 1st power outlet with port
+	status P60 and port name pw1Name.
+	...
+	8th hostname in hostlist connected to 8th power outlet with port
+	status P67 and port name pw8Name.
+
+If the hostlist parameter is not defined, then all assigned outlets
+are inserted into the hostlist. Unused outlets should have empty
+names. The node names obviously have to match the corresponding outlet
+names. A reserved hostname is "*not-defined*". This is a
+script-internal placeholder for unused outlets. It does not appear in
+the hostlist.
+
diff --git a/doc/stonith/README.meatware b/doc/stonith/README.meatware
new file mode 100644
index 0000000..0b9b15d
--- /dev/null
+++ b/doc/stonith/README.meatware
@@ -0,0 +1,26 @@
+
+MEATWARE Module for Linux-HA STONITH
+
+ABOUT:
+
+	This is a port of the "meatware" stomith method found in the GFS
+	distribution (see http://globalfilesystem.org/) to the Linux-HA
+	project. It notifies operators if a node needs to be reset and
+	waits for confirmation.
+
+USAGE:
+
+	The module can be used like any other stonith module. It will
+	syslog a message at CRIT level if it needs an operator to power-cycle
+	a node on its behalf.
+	To confirm that a manual reset has been done, execute
+
+		"meatclient -c <host>".
+
+	If you abort the confirmation, the module will report that the reset
+	has failed. 
+ +AUTHOR: + + Gregor Binder <gbinder@sysfive.com> + diff --git a/doc/stonith/README.rackpdu b/doc/stonith/README.rackpdu new file mode 100644 index 0000000..69a0f44 --- /dev/null +++ b/doc/stonith/README.rackpdu @@ -0,0 +1,21 @@ +APC Rack PDU + +The product information pages: + +http://www.apcc.com/products/family/index.cfm?id=70 + +The User's Guide: + +http://www.apcmedia.com/salestools/ASTE-6Z6KAV_R1_EN.pdf + +Apparently, an existing http or telnet session will make the +plugin fail. + +In case your nodes are equipped with multiple power supplies, the +PDU supports synchronous operation on multiple outlets on up to +four Switched Rack PDUs. See the User's Guide for more +information on how to setup outlet groups. + +NB: There has been a report by one user that in case a link +between two PDUs in the chain is broken, the PDU returns success +even though it failed. This needs to be verified. diff --git a/doc/stonith/README.rcd_serial b/doc/stonith/README.rcd_serial new file mode 100644 index 0000000..8b4abb4 --- /dev/null +++ b/doc/stonith/README.rcd_serial @@ -0,0 +1,186 @@ +rcd_serial - RC Delayed Serial +------------------------------ + +This stonith plugin uses one (or both) of the control lines of a serial +device (on the stonith host) to reboot another host (the stonith'ed host) +by closing its reset switch. A simple idea with one major problem - any +glitch which occurs on the serial line of the stonith host can potentially +cause a reset of the stonith'ed host. Such "glitches" can occur when the +stonith host is powered up or reset, during BIOS detection of the serial +ports, when the kernel loads up the serial port driver, etc. + +To fix this, you need to introduce a delay between the assertion of the +control signal on the serial port and the closing of the reset switch. +Then any glitches will be dissipated. 
When you really want to do the +business, you hold the control signal high for a "long time" rather than +just tickling it "glitch-fashion" by, e.g., using the rcd_serial plugin. + +As the name of the plugin suggests, one way to achieve the required delay is +to use a simple RC circuit and an npn transistor: + + + . . + RTS . . ----------- +5V + or ---------- . | + DTR . | . Rl reset + . | T1 . | |\logic + . Rt | ------RWL--------| -------> + . | b| /c . |/ + . |---Rb---|/ . + . | |\ . (m/b wiring typical + . C | \e . only - YMMV!) + . | | . + . | | . + SG ---------------------------RWG----------- 0V + . . + . . stonith'ed host + stonith host --->.<----- RC circuit ----->.<---- RWL = reset wire live + (serial port) . . RWG = reset wire ground + + +The characteristic delay (in seconds) is given by the product of Rt (in ohms) +and C (in Farads). Suitable values for the 4 components of the RC circuit +above are: + +Rt = 20k +C = 47uF +Rb = 360k +T1 = BC108 + +which gives a delay of 20 x 10e3 x 47 x 10e-6 = 0.94s. In practice the +actual delay achieved will depend on the pull-up load resistor Rl if Rl is +small: for Rl greater than 3k there is no significant dependence but lower +than this and the delay will increase - to about 1.4s at 1k and 1.9s at 0.5k. + +This circuit will work but it is a bit dangerous for the following reasons: + +1) If by mistake you open the serial port with minicom (or virtually any +other piece of software) you will cause a stonith reset ;-(. This is +because opening the port will by default cause the assertion of both DTR +and RTS, and a program like minicom will hold them high thenceforth (unless +and until a receive buffer overflow pulls RTS down). + +2) Some motherboards have the property that when held in the reset state, +all serial outputs are driven high. 
Thus, if you have the circuit above +attached to a serial port on such a motherboard, if you were to press the +(manual) reset switch and hold it in for more than a second or so, you will +cause a stonith reset of the attached system ;-(. + +This problem can be solved by adding a second npn transistor to act as a +shorting switch across the capacitor, driven by the other serial output: + + + . . + . . ----------- +5V + RTS ----------------- . | + . | . Rl reset + . | T1 . | |\logic + . Rt | ------RWL--------| -------> + . | b| /c . |/ + . T2 --|---Rb---|/ . + . | / | |\ . (m/b wiring typical + . b| /c | | \e . only - YMMV!) + DTR ------Rb--|/ C | . + . |\ | | . + . | \e | | . + . | | | . + SG ----------------------------------RWG------------- 0V + . . + . . stonith'ed host +stonith->.<--------- RC circuit ------->.<---- RWL = reset wire live + host . . RWG = reset wire ground + + +Now when RTS goes high it can only charge up C and cause a reset if DTR is +simultaneously kept low - if DTR goes high, T2 will switch on and discharge +the capacitor. Only a very unusual piece of software e.g. the rcd_serial +plugin, is going to achieve this (rather bizarre) combination of signals +(the "meaning" of which is something along the lines of "you are clear to +send but I'm not ready"!). T2 can be another BC108 and with Rb the same. + +RS232 signal levels are typically +-8V to +-12V so a 16V rating or greater +for the capacitor is sufficient BUT NOTE that a _polarised_ electrolytic should +not be used because the voltage switches around as the capacitor charges. +Nitai make a range of non-polar aluminium electrolytic capacitors. A 16V 47uF +radial capacitor measures 6mm diameter by 11mm long and along with the 3 +resistors (1/8W are fine) and the transistors, the whole circuit can be built +in the back of a DB9 serial "plug" so that all that emerges from the plug are +the 2 reset wires to go to the stonith'ed host's m/b reset pins. 
+ +NOTE that with these circuits the reset wires are now POLARISED and hence +they are labelled RWG and RWL above. You cannot connect to the reset pins +either way round as you can when connecting a manual reset switch! You'll +soon enough know if you've got it the wrong way round because your machine +will be in permanent reset state ;-( + + +How to find out if your motherboard can be reset by these circuits +------------------------------------------------------------------ + +You can either build it first and then suck it and see, or, you need a +multimeter. The 0V rail of your system is available in either +of the 2 black wires in the middle of a spare power connector (one of +those horrible 4-way plugs which you push with difficulty into the back +of hard disks, etc. Curse IBM for ever specifying such a monstrosity!). +Likewise, the +5V rail is the red wire. (The yellow one is +12V, ignore +this.) + +First, with the system powered down and the meter set to read ohms: + + check that one of the reset pins is connected to 0V - this then + is the RWG pin; + + check that the other pin (RWL) has a high resistance wrt 0V + (probably > 2M) and has a small resistance wrt to +5V - between + 0.5k and 10k (or higher, doesn't really matter) will be fine. + +Second, with the system powered up and the meter set to read Volts: + + check that RWG is indeed that i.e. there should be 0V between it + and the 0V rail; + + check that RWL is around +5V wrt the 0V rail. + +If all this checks out, you are _probably_ OK. However, I've got one +system which checks out fine but actually won't work. The reason is that +when you short the reset pins, the actual current drain is much higher than +one would expect. Why, I don't know, but there is a final test you can do +to detect this kind of system. + +With the system powered up and the meter set to read milliamps: + + short the reset pins with the meter i.e. 
reset the system, and + note how much current is actually drained when the system is in + the reset state. + +Mostly you will find that the reset current is 1mA or less and this is +fine. On the system I mention above, it is 80mA! If the current is +greater than 20mA or so, you have probably had it with the simple circuits +above, although reducing the base bias resistor will get you a bit further. +Otherwise, you have to use an analog switch (like the 4066 - I had to use 4 +of these in parallel to reset my 80mA system) which is tedious because then +you need a +5V supply rail to the circuit so you can no longer just build it +in the back of a serial plug. Mail me if you want the details. + +With the circuit built and the rcd_serial plugin compiled, you can use: + +stonith -t rcd_serial -p "testhost /dev/ttyS0 rts XXX" testhost + +to test it. XXX is the duration in millisecs so just keep increasing this +until you get a reset - but wait a few secs between each attempt because +the capacitor takes time to discharge. Once you've found the minimum value +required to cause a reset, add say 200ms for safety and use this value +henceforth. + +Finally, of course, all the usual disclaimers apply. If you follow my +advice and destroy your system, sorry. But it's highly unlikely: serial +port outputs are internally protected against short circuits, and reset pins +are designed to be short circuited! The only circumstance in which I can +see a possibility of damaging something by incorrect wiring would be if the +2 systems concerned were not at the same earth potential. Provided both +systems are plugged into the same mains system (i.e. are not miles apart +and connected only by a very long reset wire ;-) this shouldn't arise. 
+ +John Sutton +john@scl.co.uk +October 2002 diff --git a/doc/stonith/README.riloe b/doc/stonith/README.riloe new file mode 100644 index 0000000..4befe95 --- /dev/null +++ b/doc/stonith/README.riloe @@ -0,0 +1,36 @@ +Note for iLO 3 users + +This plugin doesn't support the iLO version 3. Please use ipmilan +or external/ipmi, iLO3 should support IPMI. + +Alain St-Denis wrote the riloe plugin. Here is short usage: + +primitive st0 stonith:external/riloe \ + hostlist=target-node \ + ilo_hostname=ilo-ip-address \ + ilo_user=admin ilo_password=secret ilo_protocol=2.0 + +The following additional parameters are available: + +ilo_can_reset: + Set to "1" if the ilo is capable of rebooting the host. + Defaults to '0'. + +ilo_protocol: + Defaults to 1.2. Set to the protocol version ilo supports. + +ilo_powerdown_method: + "button" or "power", the former simulates pressing the + button, the latter pulling the power plug. Defaults to + "power". The "button" method is easier on the host, but + requires ACPI. "power" should be more reliable, but not to + be used excessively for testing. + +ilo_proxyhost (string): Proxy hostname + proxy hostname if required to access ILO board + +ilo_proxyport (string, [3128]): Proxy port + proxy port if required to access ILO board + parameter will be ignored if proxy hostname is not set + + diff --git a/doc/stonith/README.vacm b/doc/stonith/README.vacm new file mode 100644 index 0000000..c9083ee --- /dev/null +++ b/doc/stonith/README.vacm @@ -0,0 +1,40 @@ +20 December 2000 + +I (rather poorly) integrated this contributed stonith driver into the +linux-ha-stonith release. There is a problem that needs to be +resolved by autoconf in that the driver will not compile unless +libvacmclient is installed on the system. + +For now, what I've done is included a line in stonith/Makefile that you can +uncomment if you want to compile the vacm stonith module. 
Look in the +Makefile in this directory for the following lines and do like it says + + +# If you want the VA Linux Cluster stonith module installed, +# uncomment the following line. You must have the vacmclient library +#VACM_STONITH = vacm_stonith.so + +Please direct questions about the operation of the stonith module to +Mike Tilstra (see the announcement to the linux-ha-dev mailing list +attached below.) + + +-Eric. +eric.ayers@compgen.com + +------------------------------------------------------------------------------ + +From: Mike Tilstra <conrad@sistina.com> +Sender: linux-ha-dev-admin@lists.tummy.com +To: linux-ha-dev@lists.tummy.com +Subject: [Linux-ha-dev] stonith module for VACM +Date: Tue, 19 Dec 2000 16:41:38 -0600 + +This was in need for some testing I'm doing, so I hacked this up quick. It +works for me, but I'm willing to bet there's atleast one bug in it. + +Figured others might like it. + +... +-- +Mike Tilstra conrad@sistina.com
\ No newline at end of file diff --git a/doc/stonith/README.vcenter b/doc/stonith/README.vcenter new file mode 100644 index 0000000..e6cc9a5 --- /dev/null +++ b/doc/stonith/README.vcenter @@ -0,0 +1,90 @@ +VMware vCenter/ESX STONITH Module +================================= + +1. Intro +-------- + +VMware vCenter/ESX STONITH Module is intended to provide STONITH support to +clusters in VMware Virtual Infrastructures. It is able to deal with virtual +machines running on physically different HostSystems (e.g. ESX/ESXi) by using +VMware vSphere Web Services SDK http://www.vmware.com/support/developer/vc-sdk/ +and connecting directly on each HostSystem or through a VMware vCenter: in this +last case the module locates the specified virtual machine in the Virtual +Infrastructure and performs actions required by cluster policies. + +2. Software requirements +------------------------ + +VMware vSphere CLI, which includes both CLI tools and Perl SDK +http://www.vmware.com/support/developer/vcli/ . The plugin has been tested with +version 4.1 http://www.vmware.com/download/download.do?downloadGroup=VCLI41 + + +3. vCenter/ESX authentication settings +-------------------------------------- + +Create the credentials file with credstore_admin.pl: + +/usr/lib/vmware-vcli/apps/general/credstore_admin.pl \ + -s 10.1.1.1 -u myuser -p mypass + +This should create $HOME/.vmware/credstore/vicredentials.xml +Copy it to a system folder, e.g. /etc + +cp -p $HOME/.vmware/credstore/vicredentials.xml /etc + + +4. 
Testing +---------- + +The plugin can be invoked directly to perform a very first connection test +(replace all the provided sample values): + +VI_SERVER=10.1.1.1 \ + VI_CREDSTORE=/etc/vicredentials.xml \ + HOSTLIST="hostname1=vmname1;hostname2=vmname2" \ + RESETPOWERON=0 \ + /usr/lib/stonith/plugins/external/vcenter gethosts + +If everything works correctly you should get: + +hostname1 +hostname2 + +When invoked in this way, the plugin connects to VI_SERVER, authenticates with +credentials stored in VI_CREDSTORE and tries to retrieve the list of virtual +machines (case insensitive) matching vmname1 and vmname2 (and any other listed). +When finished, it reports the list back by mapping virtual machine names to +hostnames as provided in HOSTLIST. If you see the full list of hostnames as a +result, then everything is going well. If otherwise you are having a partial or +empty list, you have to check parameters. + +You can even test "reset", "off" and "on" commands, to test (carefully!) the +full chain. E.g. + +VI_SERVER=10.1.1.1 \ + VI_CREDSTORE=/etc/vicredentials.xml \ + HOSTLIST="hostname1=vmname1;hostname2=vmname2" \ + RESETPOWERON=0 \ + /usr/lib/stonith/plugins/external/vcenter reset hostname2 + +In the above examples the referring infrastructure is a vCenter with several +ESXi nodes. Server IP and credentials are referred to vCenter. + +5. 
CRM configuration +-------------------- + +The following is a sample procedure to setup STONITH for an HA 2-node cluster +(replace all the provided sample values): + +crm configure primitive vfencing stonith::external/vcenter params \ + VI_SERVER="10.1.1.1" VI_CREDSTORE="/etc/vicredentials.xml" \ + HOSTLIST="hostname1=vmname1;hostname2=vmname2" RESETPOWERON="0" \ + op monitor interval="60s" + +crm configure clone Fencing vfencing + +crm configure property stonith-enabled="true" + + + diff --git a/doc/stonith/README.wti_mpc b/doc/stonith/README.wti_mpc new file mode 100644 index 0000000..050953d --- /dev/null +++ b/doc/stonith/README.wti_mpc @@ -0,0 +1,85 @@ +STONITH module for WTI MPC +-------------------------- + + +****Introduction. + +wti_mpc module uses snmp for controlling the MPC power distribution unit. It has +been tested with MPC-8H and MPC-18H and should be compatible with the whole +MPC series: + * MPC-20* + * MPC-16* + * MPC-18* + * MPC-8* + +****Unit configuration. + +wti_mpc STONITH modules uses SNMP v1, therefore it should be configured on the +device side. To do so, you should login to device, go to "Network +configuration" (/N), select "SNMP access" (25) and turn it on (enable/1). At the +SNMP access screen set "Version" (2) to "V1/V2 Only", set "Read only" (3) to +"No and set any "Community" (10) you want. You may also set other options as +you need. You may check your setup by issuing the following command: + + snmpwalk -v1 -c <community> <host> .1.3.6.1.2.1.1.1.0 + +and result should be something like this: + + SNMPv2-MIB::sysDescr.0 = STRING: Linux 85.195.135.236 2.4.18_mvl30-cllf #1991 Sun Mar 16 14:39:29 PST 2008 ppc + + +****Plugin configuration. + + Plugin declares the following configuration variables: + + *ipaddr - ip address or hostname of a MPC unit. + *port - ip port, should be 161, as MPC listens for incoming SNMP + packets on that port. It is made for future use actually. 
+	*community - Community that you've specified on previous step.
+	*mib_version - Should be 3 for MPC devices with firmware version 1.62
+	       and later. 1 is for firmware version 1.44 and below.
+	       2 is unused right now, if you have a device with mib V2
+	       feel free to contact me and I'll add it.
+
+****MIB version issue
+
+	WTI guys have several times changed OIDs, used by MPC devices. I own two
+types of the devices:
+	*With firmware v 1.44 which is compatible with MIB version 1
+	*With firmware v 1.62 which is compatible with MIB version 3
+
+I suppose there exists a MIB v2, but I cannot find it and I'd not be able
+to test it.
+Anyway, this plugin supports both V1 and V3 versions, and the correct version
+is selected by the "mib-version" configuration parameter. Default value is "1",
+so if you do not specify this parameter or assign an unsupported value to it,
+it will fall back to mib version 1.
+
+****Outlets and groups
+
+	MPC devices force unique names of the outlets. This is a big problem
+for the STONITH plugin, because it uses nodes' unames as outlet names, so in case
+you have a node with several power plugs, you should have set the node uname
+as the name of all the plugs. The MPC device simply doesn't allow this.
+	So, this plugin works with GROUPS instead of PLUGS. You may give
+any unique names for your physical outlets on the MPC, but you MUST create
+a plug group, name it using the node's uname and include plugs, corresponding to
+that particular node to this group. It should be done even for a node with a
+single power supply. An example:
+
+	Let's pretend you have a node "atest", with two power cords, connected
+to plugs A1 and B1. You have to create a group ("Plug grouping parameters" (/G)
+-> Add Plug Group to directory (2)), name it "atest" ("Plug Group Name" (1)) and
+assign plugs A1 and B1 to that group ("Plug access" (2)). 
Now save your +configuration and try to retrieve host list: + + stonith -t wti_mpc ipaddr=<host> port=161 community=<community> mib-version=<version> -l + +result should be: + + atest + + +------------------ +(C) Denis Chapligin <chollya@satgate.net>, SatGate, 2009 + diff --git a/doc/stonith/README_kdumpcheck.txt b/doc/stonith/README_kdumpcheck.txt new file mode 100644 index 0000000..cc8787c --- /dev/null +++ b/doc/stonith/README_kdumpcheck.txt @@ -0,0 +1,151 @@ + Kdump check STONITH plugin "kdumpcheck" +1. Introduction + This plugin's purpose is to avoid STONITH for a node which is doing kdump. + It confirms whether the node is doing kdump or not when STONITH reset or + off operation is executed. + If the target node is doing kdump, this plugin considers that STONITH + succeeded. If not, it considers that STONITH failed. + + NOTE: This plugin has no ability to shutdown or startup a node. + So it has to be used with other STONITH plugin. + Then, when this plugin failed, the next plugin which can kill a node + is executed. + NOTE: This plugin works only on Linux. + +2. The way to check + When STONITH reset or off is executed, kdumpcheck connects to the target + node, and checks the size of /proc/vmcore. + It judges that the target node is _not_ doing kdump when the size of + /proc/vmcore on the node is zero, or the file doesn't exist. + Then kdumpcheck returns "STONITH failed" to stonithd, and the next plugin + is executed. + +3. Expanding mkdumprd + This plugin requires non-root user and ssh connection even on 2nd kernel. + So, you need to apply mkdumprd_for_kdumpcheck.patch to /sbin/mkdumprd. + This patch is tested with mkdumprd version 5.0.39. + The patch adds the following functions: + i) Start udevd with specified .rules files. + ii) Bring the specified network interface up. + iii) Start sshd. + iv) Add the specified user to the 2nd kernel. + The user is to check whether the node is doing kdump or not. + v) Execute sync command after dumping. 
+ + NOTE: i) to iv) expandings are only for the case that filesystem partition + is specified as the location where the vmcore should be dumped. + +4. Parameters + kdumpcheck's parameters are the following. + hostlist : The list of hosts that the STONITH device controls. + delimiter is "," or " ". + indispensable setting. (default:none) + identity_file: a full-path of the private key file for the user + who checks doing kdump. + (default: $HOME/.ssh/id_rsa, $HOME/.ssh/id_dsa and + $HOME/.ssh/identity) + + NOTE: To execute this plugin first, set the highest priority to this plugin + in all STONITH resources. + +5. How to Use + To use this tool, do the following steps at all nodes in the cluster. + 1) Add an user to check doing kdump. + ex.) + # useradd kdumpchecker + # passwd kdumpchecker + 2) Allow passwordless login from the node which will do STONITH to all + target nodes for the user added at step 1). + ex.) + $ cd + $ mkdir .ssh + $ chmod 700 .ssh + $ cd .ssh + $ ssh-keygen (generate authentication keys with empty passphrase) + $ scp id_rsa.pub kdumpchecker@target_node:"~/.ssh/." + $ ssh kdumpchecker@target_node + $ cd ~/.ssh + $ cat id_rsa.pub >> authorized_keys + $ chmod 600 autorized_keys + $ rm id_rsa.pub + 3) Limit the command that the user can execute. + Describe the following commands in a line at the head of the user's + public key in target node's authorized_keys file. + [command="test -s /proc/vmcore"] + And describe some options (like no-pty, no-port-forwarding and so on) + according to your security policy. + ex.) + $ vi ~/.ssh/authorized_keys + command="test -s /proc/vmcore",no-port-forwarding,no-X11-forwarding, + no-agent-forwarding,no-pty ssh-rsa AAA..snip..== kdumpchecker@node1 + 4) Add settings in /etc/kdump.conf. + network_device : network interface name to check doing kdump. + indispensable setting. (default: none) + kdump_check_user : user name to check doing kdump. + specify non-root user. 
+ (default: "kdumpchecker") + udev_rules : .rules files' names. + specify if you use udev for mapping devices. + specified files have to be in /etc/udev/rules.d/. + you can specify two or more files. + delimiter is "," or " ". (default: none) + ex.) + # vi /etc/kdump.conf + ext3 /dev/sda1 + network_device eth0 + kdump_check_user kdumpchecker + udev_rules 10-if.rules + 5) Apply the patch to /sbin/mkdumprd. + # cd /sbin + # patch -p 1 < mkdumprd_for_kdumpcheck.patch + 6) Restart kdump service. + # service kdump restart + 7) Describe cib.xml to set STONITH plugin. + (See "2. Parameters" and "6. Appendix") + +6. Appendix + A sample cib.xml. + <clone id="clnStonith"> + <instance_attributes id="instance_attributes.id238245a"> + <nvpair id="clone0_clone_max" name="clone_max" value="2"/> + <nvpair id="clone0_clone_node_max" name="clone_node_max" value="1"/> + </instance_attributes> + <group id="grpStonith"> + <instance_attributes id="instance_attributes.id2382455"/> + <primitive id="grpStonith-kdumpcheck" class="stonith" type="external/kd + umpcheck"> + <instance_attributes id="instance_attributes.id238240a"> + <nvpair id="nvpair.id238240b" name="hostlist" value="node1,node2"/> + <nvpair id="nvpair.id238240c" name="priority" value="1"/> + <nvpair id="nvpair.id2382408b" name="stonith-timeout" value="30s"/> + </instance_attributes> + <operations> + <op id="grpStonith-kdumpcheck-start" name="start" interval="0" tim + eout="300" on-fail="restart"/> + <op id="grpStonith-kdumpcheck-monitor" name="monitor" interval="10" + timeout="60" on-fail="restart"/> + <op id="grpStonith-kdumpcheck-stop" name="stop" interval="0" timeou + t="300" on-fail="block"/> + </operations> + <meta_attributes id="primitive-grpStonith-kdump-check.meta"/> + </primitive> + <primitive id="grpStonith-ssh" class="stonith" type="external/ssh"> + <instance_attributes id="instance_attributes.id2382402a"> + <nvpair id="nvpair.id2382408a" name="hostlist" value="node1,node2"/ + > + <nvpair id="nvpair.id238066b" 
name="priority" value="2"/> + <nvpair id="nvpair.id2382408c" name="stonith-timeout" value="60s"/> + </instance_attributes> + <operations> + <op id="grpStonith-ssh-start" name="start" interval="0" timeout="30 + 0" on-fail="restart"/> + <op id="grpStonith-ssh-monitor" name="monitor" interval="10" timeou + t="60" on-fail="restart"/> + <op id="grpStonith-ssh-stop" name="stop" interval="0" timeout="300" + on-fail="block"/> + </operations> + <meta_attributes id="primitive-grpStonith-ssh.meta"/> + </primitive> + </group> + </clone> + |