diff options
Diffstat (limited to 'qa/standalone/scrub/osd-recovery-scrub.sh')
-rwxr-xr-x | qa/standalone/scrub/osd-recovery-scrub.sh | 352 |
1 files changed, 352 insertions, 0 deletions
#! /usr/bin/env bash
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#
source "$CEPH_ROOT/qa/standalone/ceph-helpers.sh"

##
# Configure the cluster environment (CEPH_MON, CEPH_ARGS) and run the
# requested tests.
#
# @param dir the test directory, passed to every test function
# @param ... optional names of the test functions to run; when none are
#            given every shell function named TEST_* is run
# @return 0 on success, 1 as soon as one test function fails
#
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7124" # git grep '\<7124\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    export -n CEPH_CLI_TEST_DUP_COMMAND
    # Default to every function whose name starts with TEST_
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    for func in $funcs ; do
        $func $dir || return 1
    done
}

# Simple test for "not scheduling scrubs due to active recovery"
# OSD::sched_scrub() called on all OSDs during ticks
#
# OSDs are started with osd_scrub_during_recovery=false; after recovery has
# been triggered (pool size 1 -> 4) every OSD log must contain the message.
#
# @param dir the test directory
# @return 0 on success, 1 on error
function TEST_recovery_scrub_1() {
    local dir=$1
    local poolname=test

    TESTDATA="testdata.$$"
    OSDS=4
    PGS=1
    OBJECTS=100
    ERRORS=0

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
                   --osd_scrub_interval_randomize_ratio=0.0 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $((OSDS - 1)))
    do
        run_osd $dir $osd --osd_scrub_during_recovery=false || return 1
    done

    # Create a pool with $PGS pgs
    create_pool $poolname $PGS $PGS
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')

    ceph pg dump pgs

    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
    for i in $(seq 1 $OBJECTS)
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # Growing the replica count is what triggers recovery
    ceph osd pool set $poolname size 4

    # Wait for recovery to start
    # pipefail: a failing "ceph pg dump" must not be masked by jq/grep
    set -o pipefail
    count=0
    while true
    do
        if ceph --format json pg dump pgs |
            jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
        then
            break
        fi
        sleep 2
        if test "$count" -eq "10"
        then
            echo "Recovery never started"
            # Do not leak pipefail into the caller on the failure path
            set +o pipefail
            return 1
        fi
        count=$((count + 1))
    done
    set +o pipefail
    ceph pg dump pgs

    sleep 10
    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1

    declare -a err_strings
    err_strings[0]="not scheduling scrubs due to active recovery"

    # Dump the matching log lines for the test output
    for osd in $(seq 0 $((OSDS - 1)))
    do
        grep "not scheduling scrubs" $dir/osd.${osd}.log
    done
    for err_string in "${err_strings[@]}"
    do
        found=false
        count=0
        for osd in $(seq 0 $((OSDS - 1)))
        do
            if grep -q "$err_string" $dir/osd.${osd}.log
            then
                found=true
                count=$((count + 1))
            fi
        done
        if [ "$found" = "false" ]; then
            echo "Missing log message '$err_string'"
            ERRORS=$((ERRORS + 1))
        fi
        # The message must show up in the log of every OSD
        [ $count -eq $OSDS ] || return 1
    done

    teardown $dir || return 1

    if [ "$ERRORS" != "0" ];
    then
        echo "TEST FAILED WITH $ERRORS ERRORS"
        return 1
    fi

    echo "TEST PASSED"
    return 0
}

##
# a modified version of wait_for_scrub(), which terminates if the Primary
# of the to-be-scrubbed PG changes
#
# Given the *last_scrub*, wait for scrub to happen on **pgid**.  It
# will fail if scrub does not complete within $TIMEOUT iterations (each
# iteration sleeps 1.2 seconds).  The scrub is complete whenever the
# **get_last_scrub_stamp** function reports a timestamp later than the
# one given in argument.
#
# @param pgid the id of the PG
# @param orig_primary the primary OSD when started
# @param last_scrub timestamp of the last scrub for *pgid*
# @param sname (optional) name of the stamp to poll, default last_scrub_stamp
# @return 0 on success or when the primary changed, 1 on timeout
#
function wait_for_scrub_mod() {
    local pgid=$1
    local orig_primary=$2
    local last_scrub="$3"
    local sname=${4:-last_scrub_stamp}
    local current_primary
    local i

    for ((i=0; i < $TIMEOUT; i++)); do
        sleep 0.2
        if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
            return 0
        fi
        sleep 1
        # are we still the primary?
        current_primary=$(ceph pg $pgid query | jq '.acting[0]')
        # An empty answer means the query failed: that is "unknown", not
        # "changed", so keep polling instead of reporting a new primary.
        if [ -n "$current_primary" ] && [ "$orig_primary" != "$current_primary" ]; then
            echo $orig_primary no longer primary for $pgid
            return 0
        fi
    done
    return 1
}

##
# A modified version of pg_scrub()
#
# Run scrub on **pgid** and wait until it completes. The pg_scrub_mod
# function will fail if the scrub does not complete within the limit
# enforced by wait_for_scrub_mod. The scrub is complete whenever the
# **get_last_scrub_stamp** function reports a timestamp later than
# the one stored before starting the scrub, or whenever the Primary
# changes.
#
# @param pgid the id of the PG
# @return 0 on success, 1 on error, 2 on success when the PG was seen
#         in a "recovering" state right after the scrub was requested
#
function pg_scrub_mod() {
    local pgid=$1
    local last_scrub=$(get_last_scrub_stamp $pgid)
    # locate the primary
    local my_primary=$(ceph pg $pgid query | jq '.acting[0]')
    local recovery=false
    ceph pg scrub $pgid
    #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state"
    if ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" | grep -q recovering
    then
        recovery=true
    fi
    # Quoted so that an empty primary cannot shift the remaining arguments
    wait_for_scrub_mod $pgid "$my_primary" "$last_scrub" || return 1
    if test $recovery = "true"
    then
        return 2
    fi
}

# Same as wait_background() except that it checks for exit code 2 and bumps recov_scrub_count
#
# @param name of the variable holding the whitespace separated pids to wait
#        for; that variable is reset to the empty string before returning
# @return 0 when every process exited with 0 or 2, 1 otherwise
#
# recov_scrub_count is deliberately NOT local: it belongs to the caller.
function wait_background_check() {
    # We extract the PIDS from the variable name
    # (locals are prefixed so they cannot shadow the caller's variable that
    # the eval below has to reset)
    local _wbc_pids=${!1}
    local _wbc_pid
    local _wbc_retcode
    local _wbc_status=0

    for _wbc_pid in $_wbc_pids; do
        wait $_wbc_pid
        _wbc_retcode=$?
        if test $_wbc_retcode -eq 2
        then
            recov_scrub_count=$((recov_scrub_count + 1))
        elif test $_wbc_retcode -ne 0
        then
            # If one process failed then return 1
            _wbc_status=1
        fi
    done

    # We empty the variable reporting that all process ended
    eval "$1=''"

    return $_wbc_status
}

# osd_scrub_during_recovery=true make sure scrub happens
#
# Scrubs every PG of the pool while recovery is slowed down with
# osd_recovery_sleep, and expects at least one scrub to have been requested
# on a recovering PG, no dead daemon, and no "not scheduling scrubs due to
# active recovery" message in any OSD log.
#
# @param dir the test directory
# @return 0 on success, 1 on error
function TEST_recovery_scrub_2() {
    local dir=$1
    local poolname=test

    TESTDATA="testdata.$$"
    OSDS=8
    PGS=32
    OBJECTS=40

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
                   --osd_scrub_interval_randomize_ratio=0.0 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $((OSDS - 1)))
    do
        run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 || return 1
    done

    # Create a pool with $PGS pgs
    create_pool $poolname $PGS $PGS
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
    for i in $(seq 1 $OBJECTS)
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # Growing the replica count is what triggers recovery
    ceph osd pool set $poolname size 3

    ceph pg dump pgs

    # Wait for recovery to start
    count=0
    while true
    do
        #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
        # At least two PGs must be recovering at the same time
        if test $(ceph --format json pg dump pgs |
                  jq '.pg_stats | [.[].state]' | grep -c recovering) -ge 2
        then
            break
        fi
        sleep 2
        if test "$count" -eq "10"
        then
            echo "Not enough recovery started simultaneously"
            return 1
        fi
        count=$((count + 1))
    done
    ceph pg dump pgs

    # Scrub all PGs in parallel; wait_background_check bumps recov_scrub_count
    # for each scrub that was requested on a recovering PG.
    pids=""
    recov_scrub_count=0
    for pg in $(seq 0 $((PGS - 1)))
    do
        run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
    done
    wait_background_check pids
    return_code=$?
    if [ $return_code -ne 0 ]; then return $return_code; fi

    ERRORS=0
    if test $recov_scrub_count -eq 0
    then
        echo "No scrubs occurred while PG recovering"
        ERRORS=$((ERRORS + 1))
    fi

    # NOTE(review): name_prefix is not set anywhere in this file; when it is
    # empty the pattern matches every *.pid file below $dir, so all pids found
    # there are checked - confirm this is the intent.
    pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
    # Guard against an empty list: a bare "cat" would read from stdin
    pid=""
    if [ -n "$pidfile" ]
    then
        pid=$(cat $pidfile)
    fi
    if [ -z "$pid" ] || ! kill -0 $pid
    then
        echo "OSD crash occurred"
        #tail -100 $dir/osd.0.log
        ERRORS=$((ERRORS + 1))
    fi

    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1

    declare -a err_strings
    err_strings[0]="not scheduling scrubs due to active recovery"

    # Dump the matching log lines for the test output
    for osd in $(seq 0 $((OSDS - 1)))
    do
        grep "not scheduling scrubs" $dir/osd.${osd}.log
    done
    for err_string in "${err_strings[@]}"
    do
        found=false
        for osd in $(seq 0 $((OSDS - 1)))
        do
            if grep -q "$err_string" $dir/osd.${osd}.log
            then
                found=true
            fi
        done
        # Here the message is NOT expected in any log
        if [ "$found" = "true" ]; then
            echo "Found log message not expected '$err_string'"
            ERRORS=$((ERRORS + 1))
        fi
    done

    teardown $dir || return 1

    if [ "$ERRORS" != "0" ];
    then
        echo "TEST FAILED WITH $ERRORS ERRORS"
        return 1
    fi

    echo "TEST PASSED"
    return 0
}

main osd-recovery-scrub "$@"

# Local Variables:
# compile-command: "cd build ; make -j4 && \
#    ../qa/run-standalone.sh osd-recovery-scrub.sh"
# End: