summaryrefslogtreecommitdiffstats
path: root/qa/standalone/scrub/osd-recovery-scrub.sh
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /qa/standalone/scrub/osd-recovery-scrub.sh
parentInitial commit. (diff)
downloadceph-upstream.tar.xz
ceph-upstream.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'qa/standalone/scrub/osd-recovery-scrub.sh')
-rwxr-xr-xqa/standalone/scrub/osd-recovery-scrub.sh372
1 files changed, 372 insertions, 0 deletions
diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
new file mode 100755
index 000000000..d778b0314
--- /dev/null
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -0,0 +1,372 @@
+#! /usr/bin/env bash
+#
+# Copyright (C) 2017 Red Hat <contact@redhat.com>
+#
+# Author: David Zafman <dzafman@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+ local dir=$1
+ shift
+
+ export CEPH_MON="127.0.0.1:7124" # git grep '\<7124\>' : there must be only one
+ export CEPH_ARGS
+ CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+ CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+ export -n CEPH_CLI_TEST_DUP_COMMAND
+ local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+ for func in $funcs ; do
+ $func $dir || return 1
+ done
+}
+
+# Simple test for "not scheduling scrubs due to active recovery"
+# OSD::sched_scrub() called on all OSDs during ticks
+function TEST_recovery_scrub_1() {
+ local dir=$1
+ local poolname=test
+
+ TESTDATA="testdata.$$"
+ OSDS=4
+ PGS=1
+ OBJECTS=100
+ ERRORS=0
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
+ --osd_scrub_interval_randomize_ratio=0.0 || return 1
+ run_mgr $dir x || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd --osd_scrub_during_recovery=false || return 1
+ done
+
+ # Create a pool with $PGS pgs
+ create_pool $poolname $PGS $PGS
+ wait_for_clean || return 1
+ poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+ ceph pg dump pgs
+
+ dd if=/dev/urandom of=$TESTDATA bs=1M count=50
+ for i in $(seq 1 $OBJECTS)
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+ rm -f $TESTDATA
+
+ ceph osd pool set $poolname size 4
+
+ # Wait for recovery to start
+ set -o pipefail
+ count=0
+ while(true)
+ do
+ if ceph --format json pg dump pgs |
+ jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
+ then
+ break
+ fi
+ sleep 2
+ if test "$count" -eq "10"
+ then
+ echo "Recovery never started"
+ return 1
+ fi
+ count=$(expr $count + 1)
+ done
+ set +o pipefail
+ ceph pg dump pgs
+
+ sleep 10
+ # Work around for http://tracker.ceph.com/issues/38195
+ kill_daemons $dir #|| return 1
+
+ declare -a err_strings
+ err_strings[0]="not scheduling scrubs due to active recovery"
+
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ grep "not scheduling scrubs" $dir/osd.${osd}.log
+ done
+ for err_string in "${err_strings[@]}"
+ do
+ found=false
+ count=0
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ if grep -q "$err_string" $dir/osd.${osd}.log
+ then
+ found=true
+ count=$(expr $count + 1)
+ fi
+ done
+ if [ "$found" = "false" ]; then
+ echo "Missing log message '$err_string'"
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+ [ $count -eq $OSDS ] || return 1
+ done
+
+ teardown $dir || return 1
+
+ if [ $ERRORS != "0" ];
+ then
+ echo "TEST FAILED WITH $ERRORS ERRORS"
+ return 1
+ fi
+
+ echo "TEST PASSED"
+ return 0
+}
+
+##
+# a modified version of wait_for_scrub(), which terminates if the Primary
+# of the to-be-scrubbed PG changes
+#
+# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
+# will fail if scrub does not complete within $TIMEOUT seconds. The
+# repair is complete whenever the **get_last_scrub_stamp** function
+# reports a timestamp different from the one given in argument.
+#
+# @param pgid the id of the PG
+# @param the primary OSD when started
+# @param last_scrub timestamp of the last scrub for *pgid*
+# @return 0 on success, 1 on error
+#
+function wait_for_scrub_mod() {
+ local pgid=$1
+ local orig_primary=$2
+ local last_scrub="$3"
+ local sname=${4:-last_scrub_stamp}
+
+ for ((i=0; i < $TIMEOUT; i++)); do
+ sleep 0.2
+ if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
+ return 0
+ fi
+ sleep 1
+ # are we still the primary?
+ local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+ if [ $orig_primary != $current_primary ]; then
+ echo $orig_primary no longer primary for $pgid
+ return 0
+ fi
+ done
+ return 1
+}
+
+##
+# A modified version of pg_scrub()
+#
+# Run scrub on **pgid** and wait until it completes. The pg_scrub
+# function will fail if repair does not complete within $TIMEOUT
+# seconds. The pg_scrub is complete whenever the
+# **get_last_scrub_stamp** function reports a timestamp different from
+# the one stored before starting the scrub, or whenever the Primary
+# changes.
+#
+# @param pgid the id of the PG
+# @return 0 on success, 1 on error
+#
+function pg_scrub_mod() {
+ local pgid=$1
+ local last_scrub=$(get_last_scrub_stamp $pgid)
+ # locate the primary
+ local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' `
+ ceph pg scrub $pgid
+ wait_for_scrub_mod $pgid $my_primary "$last_scrub"
+}
+
+# update a map of 'log filename' -> 'current line count'
+#
+# @param (pos. 1) logfiles directory
+# @param (pos. 2) the map to update. An associative array of starting line
+# numbers (indexed by filename)
+function mark_logs_linecount() {
+ local odir=$1
+ local -n wca=$2
+ for f in $odir/osd.*.log ;
+ do
+ local W=`wc -l $f | gawk ' {print($1);}' `
+ wca["$f"]=$W
+ done
+}
+
+##
+# search a (log) file for a string, starting the search from a specific line
+#
+# @param (pos. 1) associative array of starting line numbers (indexed by filename)
+# @param (pos. 2) the file to search
+# @param (pos. 3) the text string to search for
+# @returns 0 if found
+function grep_log_after_linecount() {
+ local -n lwca=$1
+ local logfile=$2
+ local from_line=${lwca[$logfile]}
+ local text=$3
+ from_line=`expr $from_line + 1`
+
+ tail --lines=+$from_line $logfile | grep -q -e $text
+ return $?
+}
+
+##
+# search all osd logs for a string, starting the search from a specific line
+#
+# @param (pos. 1) logfiles directory
+# @param (pos. 2) associative array of starting line numbers (indexed by filename)
+# @param (pos. 3) the text string to search for
+# @returns 0 if found in any of the files
+function grep_all_after_linecount() {
+ local dir=$1
+ local -n wca=$2
+ local text=$3
+
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ grep_log_after_linecount wca $dir/osd.$osd.log $text && return 0
+ done
+ return 1
+}
+
+# osd_scrub_during_recovery=true make sure scrub happens
+function TEST_recovery_scrub_2() {
+ local dir=$1
+ local poolname=test
+ declare -A logwc # an associative array: log -> line number
+
+ TESTDATA="testdata.$$"
+ OSDS=8
+ PGS=32
+ OBJECTS=4
+
+ setup $dir || return 1
+ run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
+ --osd_scrub_interval_randomize_ratio=0.0 || return 1
+ run_mgr $dir x || return 1
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ run_osd $dir $osd --osd_scrub_during_recovery=true || return 1
+ done
+
+ # Create a pool with $PGS pgs
+ create_pool $poolname $PGS $PGS
+ wait_for_clean || return 1
+ poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+ dd if=/dev/urandom of=$TESTDATA bs=1M count=50
+ for i in $(seq 1 $OBJECTS)
+ do
+ rados -p $poolname put obj${i} $TESTDATA
+ done
+ rm -f $TESTDATA
+
+ flush_pg_stats
+ date --rfc-3339=ns
+ mark_logs_linecount $dir logwc
+ ceph osd pool set $poolname size 3
+
+ ceph pg dump pgs
+
+ # Wait for recovery to start
+ set -o pipefail
+ count=0
+ while(true)
+ do
+ grep_all_after_linecount $dir logwc recovering && break
+ if ceph --format json pg dump pgs |
+ jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
+ then
+ break
+ fi
+ flush_pg_stats
+ sleep 2
+ if test "$count" -eq "10"
+ then
+ echo "Recovery never started"
+ return 1
+ fi
+ count=$(expr $count + 1)
+ done
+ flush_pg_stats
+ sleep 2
+ set +o pipefail
+ ceph pg dump pgs
+
+ pids=""
+ for pg in $(seq 0 $(expr $PGS - 1))
+ do
+ run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
+ done
+ ceph pg dump pgs
+ wait_background pids
+ return_code=$?
+ if [ $return_code -ne 0 ]; then return $return_code; fi
+
+ ERRORS=0
+ pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
+ pid=$(cat $pidfile)
+ if ! kill -0 $pid
+ then
+ echo "OSD crash occurred"
+ #tail -100 $dir/osd.0.log
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+
+ # Work around for http://tracker.ceph.com/issues/38195
+ kill_daemons $dir #|| return 1
+
+ declare -a err_strings
+ err_strings[0]="not scheduling scrubs due to active recovery"
+
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ grep "not scheduling scrubs" $dir/osd.${osd}.log
+ done
+ for err_string in "${err_strings[@]}"
+ do
+ found=false
+ for osd in $(seq 0 $(expr $OSDS - 1))
+ do
+ if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
+ then
+ found=true
+ fi
+ done
+ if [ "$found" = "true" ]; then
+ echo "Found log message not expected '$err_string'"
+ ERRORS=$(expr $ERRORS + 1)
+ fi
+ done
+
+ teardown $dir || return 1
+
+ if [ $ERRORS != "0" ];
+ then
+ echo "TEST FAILED WITH $ERRORS ERRORS"
+ return 1
+ fi
+
+ echo "TEST PASSED"
+ return 0
+}
+
+main osd-recovery-scrub "$@"
+
+# Local Variables:
+# compile-command: "cd build ; make -j4 && \
+# ../qa/run-standalone.sh osd-recovery-scrub.sh"
+# End: