summaryrefslogtreecommitdiffstats
path: root/scrub/e2scrub.in
blob: 7ed57f2d3027694140c4c3f68d666022d5e02661 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/bin/bash

#  Copyright (C) 2018 Oracle.  All Rights Reserved.
#
#  Author: Darrick J. Wong <darrick.wong@oracle.com>
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License
#  as published by the Free Software Foundation; either version 2
#  of the License, or (at your option) any later version.
#
#  This program is distributed in the hope that it would be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write the Free Software Foundation,
#  Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.

# Automatically check an LVM-managed filesystem online.
# We use lvm snapshots to do this, which means that we can only
# check filesystems in VGs that have at least 256MB (or so) of
# free space.

# Use a fixed search path for every external tool invoked by this script.
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# Refuse to run unless the effective user id is 0.
if (( $EUID != 0 )); then
    echo "e2scrub must be run as root"
    exit 1
fi

# Built-in defaults.  They are assigned before the configuration file is
# sourced, so the file can override any of them.
#   snap_size_mb - size in MB passed to lvcreate for the snapshot
#   fstrim       - 1 to run fstrim after a successful scrub (also set by -t)
#   reap         - 1 to only remove the snapshot and exit (also set by -r)
#   e2fsck_opts  - extra options passed to both e2fsck invocations
snap_size_mb=256
fstrim=0
reap=0
e2fsck_opts=""
conffile="@root_sysconfdir@/e2scrub.conf"

# The configuration file is optional; source it only if it is a regular file.
test -f "${conffile}" && . "${conffile}"

# Write the usage summary and the list of supported options to stdout.
print_help() {
	local -a usage_lines=(
		"Usage: $0 [OPTIONS] mountpoint | device"
		""
		"mountpoint must be on an LVM-managed block device"
		"-n: Show what commands e2scrub would execute."
		"-r: Remove e2scrub snapshot and exit, do not check anything."
		"-t: Run fstrim if successful."
		"-V: Print version information and exit."
	)

	# One line of output per array element, in order.
	printf '%s\n' "${usage_lines[@]}"
}

# Write the program name, version and release date (filled in when the
# @...@ placeholders are substituted) to stdout.
print_version() {
	printf '%s\n' "e2scrub @E2FSPROGS_VERSION@ (@E2FSPROGS_DATE@)"
}

# Terminate the script with the given status.
#
# Arguments: $1 - numeric exit status requested by the caller
# Globals:   SERVICE_MODE (read) - non-empty when run as a service
#
# In service mode every non-zero status is flattened to 1, the LSB
# "generic or unspecified error" (LSB 5.0 section 22.2); the admin is
# expected to read the log to learn what actually happened.
exitcode() {
	local status="$1"

	if [ -n "${SERVICE_MODE}" ] && [ "${status}" -ne 0 ]; then
		status=1

		# journald uses our pid to tie log messages to the systemd
		# service, and the failure-report service gathers messages by
		# service name.  Linger for 2 seconds so that all of our
		# messages are captured before the process disappears.
		sleep 2
	fi

	exit "${status}"
}

# Parse command-line options.
#   -n  dry run: DBG becomes a prefix that echoes commands instead of
#       running them
#   -r  only remove the snapshot
#   -t  run fstrim after a successful scrub
#   -V  print the version and exit
# Anything else prints the usage text and exits with status 2.
while getopts "nrtV" opt; do
    case "${opt}" in
	"n") DBG="echo Would execute: " ;;
	"r") reap=1;;
	"t") fstrim=1;;
	"V") print_version; exitcode 0;;
	*) print_help; exitcode 2;;
	esac
done
# Drop the parsed options so that $1 is the first non-option argument.
shift "$((OPTIND - 1))"

# Exactly one mountpoint or device argument is consumed; it is mandatory.
arg="$1"
if [ -z "${arg}" ]; then
	print_help
	exitcode 1
fi

# lsblk is needed to map between devices, filesystem types and mountpoints.
if ! type lsblk >& /dev/null ; then
    echo "e2scrub: can't find lsblk --- is util-linux installed?"
    exitcode 1
fi

# lvcreate is needed to create the snapshot that gets checked.
if ! type lvcreate >& /dev/null ; then
    echo "e2scrub: can't find lvcreate --- is lvm2 installed?"
    exitcode 1
fi

# close file descriptor 3 (from cron) since it causes lvm to kvetch
exec 3<&-

# Find the device for a given mountpoint.
#
# Arguments: $1 - path that is expected to be a mountpoint
# Outputs:   the lsblk NAME (full device path, because of -p) of the first
#            ext2/ext3/ext4 filesystem whose MOUNTPOINT equals the
#            canonicalized $1
# Returns:   0 if such a device was printed, 1 otherwise
dev_from_mount() {
	local mountpt vars
	# Keep the variables assigned by the eval below out of global scope.
	local NAME FSTYPE MOUNTPOINT

	# Bail out if the path cannot be canonicalized.  An empty mountpt
	# would otherwise compare equal to the empty MOUNTPOINT of every
	# unmounted filesystem and select an unrelated device.
	mountpt="$(realpath "$1")" || return 1
	[ -n "${mountpt}" ] || return 1

	# Read lsblk through process substitution rather than a pipeline so
	# the loop runs in this shell and "return 0" really returns from the
	# function; in a pipeline it would only leave the loop's subshell.
	while read -r vars; do
		# NOTE(review): each line is NAME="..." FSTYPE="..."
		# MOUNTPOINT="..." as produced by "lsblk -P"; eval trusts
		# lsblk to quote these values safely.
		eval "${vars}"
		if [ "${mountpt}" != "${MOUNTPOINT}" ]; then
			continue
		fi
		case "${FSTYPE}" in
		ext[234])
			echo "${NAME}"
			return 0
			;;
		esac
	done < <(lsblk -o NAME,FSTYPE,MOUNTPOINT -p -P -n 2> /dev/null)
	return 1
}

# Validate a device argument.
#
# Arguments: $1 - device path
# Outputs:   $1 echoed back when lsblk reports exactly ext2, ext3 or ext4
#            as its filesystem type
# Returns:   0 when the device was echoed, 1 otherwise
dev_from_arg() {
	local blockdev="$1"
	local detected_fs

	detected_fs="$(lsblk -o FSTYPE -n "${blockdev}" 2> /dev/null)"

	# The right-hand side is deliberately unquoted: it is a glob pattern.
	if [[ "${detected_fs}" == ext[234] ]]; then
		echo "${blockdev}"
		return 0
	fi
	return 1
}

# Print what lsblk reports as the mountpoint of a device.
#
# Arguments: $1 - device path; may be empty
# Outputs:   the MOUNTPOINT column from lsblk, or nothing when $1 is empty
# Returns:   0 when $1 is empty, otherwise the exit status of lsblk
mnt_from_dev() {
	local blockdev="$1"

	# Nothing to look up without a device.
	[ -z "${blockdev}" ] && return 0

	lsblk -o MOUNTPOINT -n "${blockdev}"
}

# Construct block device path and mountpoint from argument
# A block-device argument is validated and its mountpoint looked up; any
# other argument is treated as a mountpoint and mapped to its device.
if [ -b "${arg}" ]; then
	dev="$(dev_from_arg "${arg}")"
	mnt="$(mnt_from_dev "${dev}")"
else
	dev="$(dev_from_mount "${arg}")"
	mnt="${arg}"
fi
# Both lookup helpers print nothing on failure, which leaves dev empty here.
if [ ! -e "${dev}" ]; then
	echo "${arg}: Not an ext[234] filesystem."
	print_help
	exitcode 16
fi

# Make sure this is an LVM device we can snapshot
# The lvs output is eval'ed; it is expected to assign LVM2_LV_NAME,
# LVM2_VG_NAME and LVM2_LV_ROLE, which are tested right below.
lvm_vars="$(lvs --nameprefixes -o name,vgname,lv_role --noheadings "${dev}" 2> /dev/null)"
eval "${lvm_vars}"
# Reject devices for which lvs reported no VG or LV name, and devices
# whose LV role mentions "snapshot".
if [ -z "${LVM2_VG_NAME}" ] || [ -z "${LVM2_LV_NAME}" ] ||
   echo "${LVM2_LV_ROLE}" | grep -q "snapshot"; then
	echo "${arg}: Not connected to an LVM logical volume."
	print_help
	exitcode 16
fi
# start_time is later passed to "tune2fs -T" when the scrub succeeds.
start_time="$(date +'%Y%m%d%H%M%S')"
# The snapshot is named after the origin LV with an ".e2scrub" suffix.
snap="${LVM2_LV_NAME}.e2scrub"
snap_dev="/dev/${LVM2_VG_NAME}/${snap}"

# Remove the e2scrub snapshot, retrying while removal keeps failing.
#
# Globals: DBG, LVM2_VG_NAME, snap, snap_dev (all read)
#
# lvremove is retried every half second for as long as the snapshot device
# node still exists and lvremove exits with status 5.  Unlike setup, no
# deadline is applied here.
teardown() {
	local rc

	# Remove and wait for removal to succeed.
	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
	# Save the status right away: inside the loop condition "$?" would
	# already hold the result of the "[ -e ... ]" test, not of lvremove.
	rc=$?
	while [ -e "${snap_dev}" ] && [ "${rc}" -eq 5 ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
		rc=$?
	done
}

# Run the two-stage e2fsck check against the snapshot device.
#
# Globals: DBG, e2fsck_opts, snap_dev (read); E2FSCK_FIXES_ONLY (exported)
# Returns: the status of the journal-recovery pass if it is non-zero,
#          otherwise the status of the full forced check.  Any non-zero
#          status (errors fixed or remaining) means the filesystem is bad.
check() {
	local fsck_bin="@root_sbindir@/e2fsck"
	local journal_rc

	# Exported for e2fsck so that, as intended by this check, only
	# non-optimization repairs are attempted.
	export E2FSCK_FIXES_ONLY=1

	# Stage 1: recover the journal only.
	# e2fsck_opts is left unquoted on purpose so it splits into words.
	${DBG} "${fsck_bin}" -E journal_only -p ${e2fsck_opts} "${snap_dev}"
	journal_rc=$?
	if [ "${journal_rc}" -ne 0 ]; then
		return "${journal_rc}"
	fi

	# Stage 2: forced full check; its status becomes ours.
	${DBG} "${fsck_bin}" -f -y ${e2fsck_opts} "${snap_dev}"
}

# Record a successful scrub on the real device: tune2fs is asked to set
# the mount count to 0 and the last-checked time to start_time.
#
# Globals: DBG, start_time, dev (read)
mark_clean() {
	local -a tune_cmd=(
		"@root_sbindir@/tune2fs"
		-C 0
		-T "${start_time}"
		"${dev}"
	)

	${DBG} "${tune_cmd[@]}"
}

# Flag the real device via tune2fs "-E force_fsck" after a failed scrub.
#
# Globals: DBG, dev (read)
mark_corrupt() {
	local -a tune_cmd=(
		"@root_sbindir@/tune2fs"
		-E force_fsck
		"${dev}"
	)

	${DBG} "${tune_cmd[@]}"
}

# Remove any leftover e2scrub snapshot and create a fresh one.
#
# Globals: DBG, LVM2_VG_NAME, LVM2_LV_NAME, snap, snap_dev, snap_size_mb,
#          arg (all read)
# Outputs: an explanatory message on stdout when the snapshot cannot be
#          removed or created
# Returns: 0 when the new snapshot was created, 1 otherwise
setup() {
	local lvremove_deadline rc

	# Try to remove snapshot for 30s, bail out if we can't remove it.
	lvremove_deadline="$(( $(date "+%s") + 30 ))"
	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 2>/dev/null
	# Save the status right away: inside the loop condition "$?" would
	# already hold the result of the "[ -e ... ]" test, not of lvremove,
	# and the retry below would never run.
	rc=$?
	while [ -e "${snap_dev}" ] && [ "${rc}" -eq 5 ] &&
	      [ "$(date "+%s")" -lt "${lvremove_deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
		rc=$?
	done
	if [ -e "${snap_dev}" ]; then
		echo "${arg}: e2scrub snapshot is in use, cannot check!"
		return 1
	fi

	# Create the snapshot, wait for device to appear.
	if ! ${DBG} lvcreate -s -L "${snap_size_mb}m" -n "${snap}" "${LVM2_VG_NAME}/${LVM2_LV_NAME}"; then
		echo "${arg}: e2scrub snapshot FAILED, will not check!"
		return 1
	fi
	${DBG} udevadm settle 2> /dev/null
	return 0
}

# Reap mode (-r): remove an existing e2scrub snapshot, if any, and exit
# successfully without checking anything.
if [ "${reap}" -gt 0 ]; then
	if [ -e "${snap_dev}" ]; then
		teardown 2> /dev/null
	fi
	exit 0
fi
# Create the snapshot; give up with status 8 if that is not possible.
if ! setup; then
	exitcode 8
fi
# From here on the snapshot exists: remove it on any exit path and on
# SIGINT, SIGQUIT or SIGTERM.  Note that the handler exits with status 1.
trap "teardown; exit 1" EXIT INT QUIT TERM

# Check and react
# The case statement dispatches on the exit status of check.  Every branch
# removes the snapshot itself and then disarms the EXIT handler, so the
# handler's "exit 1" does not replace the status chosen in ret.
check
case "$?" in
"0")
	# Clean check!
	echo "${arg}: Scrub succeeded."
	mark_clean
	teardown
	trap '' EXIT

	# Trim the free space, which requires the snapshot be deleted.
	# Only done with -t (or fstrim=1), when the mountpoint is a
	# directory and the fstrim command is available.
	if [ "${fstrim}" -eq 1 ] && [ -d "${mnt}" ] && type fstrim > /dev/null 2>&1; then
		echo "${arg}: Trimming free space."
		fstrim -v "${mnt}"
	fi

	ret=0
	;;
"8")
	# Operational error, what now?
	echo "${arg}: e2fsck operational error."
	teardown
	trap '' EXIT
	ret=8
	;;
*)
	# fsck failed.  Check if the snapshot is invalid; if so, make a
	# note of that at the end of the log.  This isn't necessarily a
	# failure because the mounted fs could have overflowed the
	# snapshot with regular disk writes /or/ our repair process
	# could have done it by repairing too much.
	#
	# If it's really corrupt we ought to fsck at next boot.
	#
	# Any non-empty lv_snapshot_invalid value from lvs is treated as
	# "the snapshot is invalid".
	is_invalid="$(lvs -o lv_snapshot_invalid --noheadings "${snap_dev}" | awk '{print $1}')"
	if [ -n "${is_invalid}" ]; then
		echo "${arg}: Scrub FAILED due to invalid snapshot."
		ret=8
	else
		echo "${arg}: Scrub FAILED due to corruption!  Unmount and run e2fsck -y."
		mark_corrupt
		ret=6
	fi
	teardown
	trap '' EXIT
	;;
esac

# Exit with the chosen status (flattened to 1 in service mode by exitcode).
exitcode "${ret}"