summaryrefslogtreecommitdiffstats
path: root/ctdb/tools/ctdb_diagnostics
blob: d16a71c8f30d07170848376b91ea2d9ddda89929 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#!/bin/sh
# a script to test the basic setup of a CTDB/Samba install 
# tridge@samba.org September 2007
# martin@meltin.net August 2010

# Print the command-line synopsis on stderr and terminate the script
# with exit status 1.  This function never returns.
usage ()
{
    printf '%s\n' \
	"Usage: ctdb_diagnostics [OPTION] ..." \
	"  options:" \
	"    -n <nodes>  Comma separated list of nodes to operate on" \
	"    -c          Ignore comment lines (starting with '#') in file comparisons" \
	"    -w          Ignore whitespace in file comparisons" \
	"    --no-ads    Do not use commands that assume an Active Directory Server" >&2
    exit 1
}

# Defaults for the values that the command line can change:
#   bad_nodes  - nodes that could not be contacted (none yet)
#   diff_opts  - extra options handed to diff in file comparisons
#   no_ads     - "true" once --no-ads has been given
#   nodes      - second '|'-separated field of every line printed by
#                "ctdb listnodes -X"
bad_nodes=""
diff_opts=""
no_ads="false"
nodes=$(ctdb listnodes -X | cut -d '|' -f 2)

# Parse the command line and update the option globals.
#
# Arguments: the script's command-line arguments ("$@")
# Globals written:
#   nodes      - replaced by the -n list, with commas turned into spaces
#   diff_opts  - gains " -I ^#.*" for -c and " -w" for -w
#   no_ads     - set to "true" by --no-ads
# Calls usage (which exits with status 1) for -h/--help, when getopt
# rejects the command line, and when a non-option argument is left over.
# Returns 0 when parsing succeeds.
parse_options ()
{
    temp=$(getopt -n "ctdb_diagnostics" -o "n:cwh" -l no-ads,help -- "$@")

    # No! Checking the exit code afterwards is actually clearer...
    # shellcheck disable=SC2181
    [ $? -eq 0 ] || usage

    # Replace the positional parameters with getopt's normalised list
    eval set -- "$temp"

    while true ; do
	case "$1" in
	    -n) nodes=$(echo "$2" | sed -e 's@,@ @g') ; shift 2 ;;
	    -c) diff_opts="${diff_opts} -I ^#.*" ; shift ;;
	    -w) diff_opts="${diff_opts} -w" ; shift ;;
	    --no-ads) no_ads=true ; shift ;;
	    --) shift ; break ;;
	    -h|--help|*) usage ;;
	esac
    done

    # Use "if" rather than "[ ... ] && usage": with the latter a
    # successful parse made the function return the failed test's
    # status (1) instead of 0.
    if [ $# -ne 0 ] ; then
	usage
    fi
}

parse_options "$@"

# Default to a 5 second ssh connect timeout unless EXTRA_SSH_OPTS
# already mentions ConnectTimeout.  Stripping the shortest prefix that
# ends in "ConnectTimeout=" leaves the value unchanged only when that
# text does not occur in it.
if [ "${EXTRA_SSH_OPTS#*ConnectTimeout=}" = "$EXTRA_SSH_OPTS" ] ; then
    export EXTRA_SSH_OPTS="${EXTRA_SSH_OPTS} -o ConnectTimeout=5"
fi

# Probe every node in $nodes by running "true" on it via onnode.
# Nodes that answer stay in $nodes (space separated); nodes that do
# not are collected, comma separated, in $bad_nodes.
_nodes=""
for _i in $nodes ; do
    if ! onnode "$_i" true >/dev/null 2>&1 ; then
	bad_nodes="${bad_nodes:+${bad_nodes},}${_i}"
	continue
    fi
    _nodes="${_nodes:+${_nodes} }${_i}"
done
nodes="$_nodes"

# The same list with every whitespace character replaced by a comma
nodes_comma=$(echo "$nodes" | sed -e 's/[[:space:]]/,/g')

PATH="${PATH}:/sbin:/usr/sbin:/usr/lpp/mmfs/bin"

# The ctdb and nfs settings files are looked for under /etc/sysconfig
# when that directory exists on this machine, otherwise under
# /etc/default.
if [ -d /etc/sysconfig ] ; then
    _sysconf_dir="/etc/sysconfig"
else
    _sysconf_dir="/etc/default"
fi

# Config files that must exist and that are checked for being
# identical on the nodes.  (The trailing backslashes only continue
# the line; the value is one space-separated list.)
CONFIG_FILES_MUST="/etc/krb5.conf /etc/hosts /usr/local/etc/ctdb/nodes \
${_sysconf_dir}/ctdb /etc/resolv.conf /etc/nsswitch.conf /etc/sysctl.conf \
/etc/samba/smb.conf /etc/fstab /etc/multipath.conf /etc/pam.d/system-auth \
${_sysconf_dir}/nfs /etc/exports /etc/vsftpd/vsftpd.conf"

# Config files that may exist; when they do they are checked for
# being identical on the nodes.
CONFIG_FILES_MAY="/usr/local/etc/ctdb/public_addresses \
/usr/local/etc/ctdb/static-routes"

# From here on stderr is merged into stdout, so that the report is a
# single stream.
exec 2>&1

# Opening banner, followed by the time the run started
printf '%s\n' \
    "--------------------------------------------------------------------" \
    "ctdb_diagnostics starting. This script will gather information about" \
    "your ctdb cluster. You should send the output of this script along" \
    "with any ctdb or clustered Samba bug reports." \
    "--------------------------------------------------------------------"

date

# Record one problem found by the diagnostics.
#   $1 - description of the problem
# Bumps the NUM_ERRORS counter, prints "ERROR: <description>" on stdout
# and appends a numbered copy of the message to the file named by
# $ERRORS.
error() {
    msg="$1"
    NUM_ERRORS=$((NUM_ERRORS + 1))
    {
	echo "ERROR: $msg"
	echo " ERROR[$NUM_ERRORS]: $msg" >> "$ERRORS"
    }
}

# Print a file from the local machine into the report.
#   $1 - path of the file
# Output, indented by two spaces: a separator, the file name, the
# "ls -l" details, the file contents and a closing separator.  Error
# messages from ls and sed end up in the output as well.
show_file() {
    fname="$1"
    _fdetails=$(ls -l "$fname" 2>&1)
    {
	echo "  ================================"
	echo "  File: $fname"
	echo "  $_fdetails"
	sed -e 's/^/  /' "$fname"
	echo "  ================================"
    } 2>&1
}

# Run a shell command line on the nodes listed in $nodes_comma and
# print the result.
#   $@ - the command line.  It is normally passed as one string; when
#        several arguments are given they are joined with single spaces,
#        so that words after the first are not silently dropped.
# Globals read: nodes_comma
# The command sent through onnode prints the host name and date, then
# the command's stdout and stderr indented by two spaces.
show_all() {
    echo "running $* on nodes $nodes_comma"
    onnode "$nodes_comma" "hostname; date; $* 2>&1 | sed 's/^/  /'" 2>&1
}

# For each given file: print the copy found on the first node of
# $nodes, then fetch the file from every other node and report an error
# when diff (run with $diff_opts) finds it different from that first
# copy.
#
#   $1    - printf format for the error raised when the file is not
#           readable on the first node; it is given the file name and
#           then the node
#   $2... - the files to check
#
# Globals read: nodes, tmpdir, diff_opts.  Uses onnode and error.
# A file that is not readable on the first node is reported once and
# is not looked at on the remaining nodes.
show_and_compare_files () {
    fmt="$1"
    shift

    for f in "$@" ; do
	_bf=$(basename "$f")
	first=true

	for n in $nodes ; do

	    if ! $first ; then
		# Any later node: fetch its copy and compare it with
		# the copy kept from the first node.
		echo "Testing for same config file $f on node $n"
		tmpf="${tmpdir}/${_bf}.node${n}"
		onnode "$n" cat "$f" >"$tmpf" 2>&1
		# Intentional multi-word splitting on diff_opts
		# shellcheck disable=SC2086
		if ! diff $diff_opts "$fstf" "$tmpf" >/dev/null 2>&1 ; then
		    error "File $f is different on node $n"
		    # shellcheck disable=SC2086
		    diff -u $diff_opts "$fstf" "$tmpf"
		fi
		rm -f "$tmpf"
		continue
	    fi

	    # First node: the file has to be readable there, otherwise
	    # move straight on to the next file.
	    if ! onnode "$n" [ -r "$f" ] ; then
		# This function takes a format string
		# shellcheck disable=SC2059
		msg=$(printf "$fmt" "$f" "$n")
		error "$msg"
		continue 2
	    fi

	    # Keep the first node's copy as the reference and show it
	    fstf="${tmpdir}/${_bf}.node${n}"
	    onnode "$n" cat "$f" >"$fstf" 2>&1
	    _fdetails=$(onnode "$n" ls -l "$f" 2>&1)

	    echo "  ================================"
	    echo "  File (on node $n): $f"
	    echo "  $_fdetails"
	    sed 's/^/  /' "$fstf"
	    echo "  ================================"
	    first=false
	done

	rm -f "$fstf"
    done
}

# Create a private scratch directory; it holds the per-node file
# copies and the error log.
if ! tmpdir=$(mktemp -d) ; then
    echo "Unable to create a temporary directory"
    exit 1
fi

# Remove the scratch directory on every exit path, not only when the
# script runs to its last line.  The EXIT trap does not call exit, so
# the script's exit status is left as it was.  The signal traps turn
# HUP/INT/TERM into a normal exit (with the conventional 128+signal
# status) so that the EXIT trap also runs when the script is
# interrupted.
trap 'rm -rf "$tmpdir"' EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# File that collects the error messages, and the running error count
ERRORS="${tmpdir}/diag_err"
NUM_ERRORS=0

# Say which nodes are being examined
printf '%s\n' \
    "Diagnosis started on these nodes:" \
    "$nodes_comma"

# ... and which ones were left out because they could not be contacted
if [ -n "$bad_nodes" ] ; then
    printf '%s\n' \
	"" \
	"NOT RUNNING DIAGNOSTICS on these uncontactable nodes:" \
	"$bad_nodes"

fi

printf '%s\n' \
    "" \
    "For reference, here is the nodes file on the current node..."

show_file /usr/local/etc/ctdb/nodes

cat <<EOF
--------------------------------------------------------------------
Comparing critical config files on nodes $nodes_comma
EOF

# The first argument of show_and_compare_files is the printf format of
# the error raised when a file cannot be read on the first node; it
# receives the file name and then the node.  The node is formatted with
# %s rather than %d so that a node given in a non-numeric form is
# printed as-is instead of making printf fail.

# Files that have to be present.
# Intentional multi-word splitting on CONFIG_FILES_MUST
# shellcheck disable=SC2086
show_and_compare_files \
    "%s is missing on node %s" \
    $CONFIG_FILES_MUST

# Files that are optional.
# Intentional multi-word splitting on CONFIG_FILES_MAY
# shellcheck disable=SC2086
show_and_compare_files \
    "Optional file %s is not present on node %s" \
    $CONFIG_FILES_MAY

cat <<EOF
--------------------------------------------------------------------
Checking for clock drift
EOF
# Compare the clock of each node with the local clock and report a
# difference of more than 30 seconds in either direction.  The local
# time is sampled again for every node: a single sample taken before
# the loop would count the time spent contacting earlier nodes as
# drift.
for i in $nodes; do
    t=$(date +%s)
    t2=$(onnode "$i" date +%s)
    # Anything but a plain number (no output, an error message) cannot
    # be used in the arithmetic below.
    case "$t2" in
	''|*[!0-9]*)
	    error "unable to read the time on node $i"
	    continue
	    ;;
    esac
    d=$((t2 - t))
    if [ "$d" -gt 30 ] || [ "$d" -lt -30 ]; then
	error "time on node $i differs by $d seconds"
    fi
done

printf '%s\n' \
    "--------------------------------------------------------------------" \
    "Showing software versions"
show_all "uname -a"

# Package queries: which package tool to ask is decided by what is
# installed on the machine running this script.
if [ -x /bin/rpm ] ; then
    show_all "rpm -qa | grep -E 'samba|ctdb|gpfs'"
fi
if [ -x /usr/bin/dpkg-query ] ; then
    # 'gpfs' is deliberately not queried here
    for _pkg in ctdb samba ; do
	show_all "/usr/bin/dpkg-query --show '${_pkg}'"
    done
fi


printf '%s\n' \
    "--------------------------------------------------------------------" \
    "Showing ctdb status and recent log entries"

# ctdb's own view of the cluster.  The last command runs
# "ctdb dbstatistics" once for each value in the third '|'-separated
# field of the "ctdb -X getdbmap" output, skipping its first line.
for _cmd in \
    "ctdb status; ctdb ip" \
    "ctdb statistics" \
    "ctdb uptime" \
    "ctdb listvars" \
    "ctdb getdbmap" \
    "ctdb -X getdbmap | awk -F'|' 'NR > 1 {print \$3}' | sort | xargs -n 1 ctdb dbstatistics"
do
    show_all "$_cmd"
done

echo "Showing log.ctdb"
show_all "test -f /usr/local/var/log/log.ctdb && tail -100 /usr/local/var/log/log.ctdb"

# System log tail and the contents of ctdb's state and config
# directories
for _cmd in \
    "tail -200 /var/log/messages" \
    "ls -lRs /usr/local/var/lib/ctdb" \
    "ls -lRs /usr/local/etc/ctdb"
do
    show_all "$_cmd"
done


cat <<EOF
--------------------------------------------------------------------
Showing system and process status
EOF
# General system state, gathered from every node.  Each command is
# run once.
show_all "df"
show_all "df -i"
show_all "mount"
show_all "w"
show_all "ps axfwu"
show_all "dmesg"
show_all "/sbin/lspci"
show_all "dmidecode"
show_all "cat /proc/partitions"
show_all "cat /proc/cpuinfo"
show_all "cat /proc/scsi/scsi"
show_all "/sbin/ifconfig -a"
show_all "cat /proc/net/dev"
show_all "/sbin/ip addr list"
show_all "/sbin/route -n"
show_all "ss -s"
show_all "free"
show_all "crontab -l"
show_all "sysctl -a"
show_all "iptables -L -n"
show_all "iptables -L -n -t nat"
show_all "/usr/sbin/rpcinfo -p"
show_all "/usr/sbin/showmount -a"
show_all "/usr/sbin/showmount -e"
show_all "/usr/sbin/nfsstat -v"

# The remaining commands are only run when the corresponding tool or
# directory exists on the machine running this script.
if [ -x /sbin/multipath ] ; then
    show_all "/sbin/multipath -ll"
fi
if [ -x /sbin/chkconfig ] ; then
    show_all "/sbin/chkconfig --list"
fi
if [ -x /usr/sbin/getenforce ] ; then
    show_all "/usr/sbin/getenforce"
fi
if [ -d /proc/net/bonding ] ; then
    # One listing per entry found in the local /proc/net/bonding
    for f in /proc/net/bonding/*; do
	show_all "cat $f"
    done
fi

cat <<EOF
--------------------------------------------------------------------
Showing Samba status
EOF
show_all "smbstatus -n -B"
# The "net ads" commands are skipped when --no-ads was given
if $no_ads ; then
    echo
    echo "Skipping \"net ads testjoin\" as requested"
    echo
else
    show_all "net ads testjoin"
fi
show_all "net conf list"
show_all "lsof -n | grep smbd"
show_all "lsof -n | grep ctdbd"
show_all "netstat -tan"
if $no_ads ; then
    echo
    echo "Skipping \"net ads info\" as requested"
    echo
else
    show_all "net ads info"
fi
show_all "date"
show_all "smbclient -U% -L 127.0.0.1"
# Workgroup as reported by testparm on this machine; testparm's stderr
# is discarded.
WORKGROUP=$(testparm -s --parameter-name=WORKGROUP 2> /dev/null)
# Pass the command line to show_all as one string; the inner single
# quotes keep the account name together as one argument to id.
show_all "id '$WORKGROUP/Administrator'"
show_all "wbinfo -p"
show_all "wbinfo --online-status"
show_all "smbd -b"

date
echo "Diagnostics finished with $NUM_ERRORS errors"

# Repeat the recorded errors as a summary at the end of the report
if [ -r "$ERRORS" ] ; then
    cat "$ERRORS"
    rm -f "$ERRORS"
fi

rm -rf "$tmpdir"

# The exit status is the number of errors.  An exit status only holds
# the values 0-255 and larger numbers wrap around (256 errors would be
# reported as status 0, i.e. success), so clamp it at 255.
if [ "${NUM_ERRORS:-0}" -gt 255 ] ; then
    exit 255
fi
exit "${NUM_ERRORS:-0}"