#!/usr/bin/env bash # # Copyright (C) 2019 Red Hat # # Author: David Zafman # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU Library Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Library Public License for more details. # source $CEPH_ROOT/qa/standalone/ceph-helpers.sh function run() { local dir=$1 shift # Fix port???? export CEPH_MON="127.0.0.1:7114" # git grep '\<7114\>' : there must be only one export CEPH_ARGS CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " CEPH_ARGS+="--mon-host=$CEPH_MON --osd_max_backfills=1 --debug_reserver=20 " # Set osd op queue = wpq for the tests. Recovery priority is not # considered by mclock_scheduler leading to unexpected results. CEPH_ARGS+="--osd-op-queue=wpq " export objects=200 export poolprefix=test export FORCE_PRIO="255" # See OSD_RECOVERY_PRIORITY_FORCED export NORMAL_PRIO="190" # See OSD_RECOVERY_PRIORITY_BASE + 10 local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} for func in $funcs ; do setup $dir || return 1 $func $dir || return 1 teardown $dir || return 1 done } function TEST_recovery_priority() { local dir=$1 local pools=10 local OSDS=5 local max_tries=10 run_mon $dir a || return 1 run_mgr $dir x || return 1 export CEPH_ARGS for osd in $(seq 0 $(expr $OSDS - 1)) do run_osd $dir $osd || return 1 done for p in $(seq 1 $pools) do create_pool "${poolprefix}$p" 1 1 ceph osd pool set "${poolprefix}$p" size 2 done sleep 5 wait_for_clean || return 1 ceph pg dump pgs # Find 3 pools with a pg with the same primaries but second # replica on another osd. local PG1 local POOLNUM1 local pool1 local chk_osd1_1 local chk_osd1_2 local PG2 local POOLNUM2 local pool2 local chk_osd2 local PG3 local POOLNUM3 local pool3 for p in $(seq 1 $pools) do ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting local test_osd1=$(head -1 $dir/acting) local test_osd2=$(tail -1 $dir/acting) if [ -z "$PG1" ]; then PG1="${p}.0" POOLNUM1=$p pool1="${poolprefix}$p" chk_osd1_1=$test_osd1 chk_osd1_2=$test_osd2 elif [ -z "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 ]; then PG2="${p}.0" POOLNUM2=$p pool2="${poolprefix}$p" chk_osd2=$test_osd2 elif [ -n "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 -a "$chk_osd2" != $test_osd2 ]; then PG3="${p}.0" POOLNUM3=$p pool3="${poolprefix}$p" break fi done rm -f $dir/acting if [ "$pool2" = "" -o "pool3" = "" ]; then echo "Failure to find appropirate PGs" return 1 fi for p in $(seq 1 $pools) do if [ $p != $POOLNUM1 -a $p != $POOLNUM2 -a $p != $POOLNUM3 ]; then delete_pool ${poolprefix}$p fi done ceph osd pool set $pool2 size 1 --yes-i-really-mean-it ceph osd pool set $pool3 size 1 --yes-i-really-mean-it wait_for_clean || return 1 dd if=/dev/urandom of=$dir/data bs=1M count=10 p=1 for pname in $pool1 $pool2 $pool3 do for i in $(seq 1 $objects) do rados -p ${pname} put obj${i}-p${p} $dir/data done p=$(expr $p + 1) done local otherosd=$(get_not_primary $pool1 obj1-p1) ceph pg dump pgs ERRORS=0 ceph osd set norecover ceph osd set noout # Get a pg to want to recover and quickly force it # to be preempted. ceph osd pool set $pool3 size 2 sleep 2 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations || return 1 # 3. Item is in progress, adjust priority with no higher priority waiting for i in $(seq 1 $max_tries) do if ! ceph pg force-recovery $PG3 2>&1 | grep -q "doesn't require recovery"; then break fi if [ "$i" = "$max_tries" ]; then echo "ERROR: Didn't appear to be able to force-recovery" ERRORS=$(expr $ERRORS + 1) fi sleep 2 done flush_pg_stats || return 1 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations || return 1 ceph osd out osd.$chk_osd1_2 sleep 2 flush_pg_stats || return 1 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations || return 1 ceph pg dump pgs ceph osd pool set $pool2 size 2 sleep 2 flush_pg_stats || return 1 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations > $dir/out || return 1 cat $dir/out ceph pg dump pgs PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG1}\")).prio") if [ "$PRIO" != "$NORMAL_PRIO" ]; then echo "The normal PG ${PG1} doesn't have prio $NORMAL_PRIO queued waiting" ERRORS=$(expr $ERRORS + 1) fi # Using eval will strip double-quotes from item eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') if [ "$ITEM" != ${PG3} ]; then echo "The first force-recovery PG $PG3 didn't become the in progress item" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') if [ "$PRIO" != $FORCE_PRIO ]; then echo "The first force-recovery PG ${PG3} doesn't have prio $FORCE_PRIO" ERRORS=$(expr $ERRORS + 1) fi fi # 1. Item is queued, re-queue with new priority for i in $(seq 1 $max_tries) do if ! ceph pg force-recovery $PG2 2>&1 | grep -q "doesn't require recovery"; then break fi if [ "$i" = "$max_tries" ]; then echo "ERROR: Didn't appear to be able to force-recovery" ERRORS=$(expr $ERRORS + 1) fi sleep 2 done sleep 2 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations > $dir/out || return 1 cat $dir/out PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") if [ "$PRIO" != "$FORCE_PRIO" ]; then echo "The second force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" ERRORS=$(expr $ERRORS + 1) fi flush_pg_stats || return 1 # 4. Item is in progress, if higher priority items waiting prempt item #ceph osd unset norecover ceph pg cancel-force-recovery $PG3 || return 1 sleep 2 #ceph osd set norecover CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations > $dir/out || return 1 cat $dir/out PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG3}\")).prio") if [ "$PRIO" != "$NORMAL_PRIO" ]; then echo "After cancel-recovery PG ${PG3} doesn't have prio $NORMAL_PRIO" ERRORS=$(expr $ERRORS + 1) fi eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') if [ "$ITEM" != ${PG2} ]; then echo "The force-recovery PG $PG2 didn't become the in progress item" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') if [ "$PRIO" != $FORCE_PRIO ]; then echo "The first force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" ERRORS=$(expr $ERRORS + 1) fi fi ceph pg cancel-force-recovery $PG2 || return 1 sleep 5 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations || return 1 # 2. Item is queued, re-queue and preempt because new priority higher than an in progress item flush_pg_stats || return 1 ceph pg force-recovery $PG3 || return 1 sleep 2 CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations > $dir/out || return 1 cat $dir/out PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") if [ "$PRIO" != "$NORMAL_PRIO" ]; then echo "After cancel-force-recovery PG ${PG3} doesn't have prio $NORMAL_PRIO" ERRORS=$(expr $ERRORS + 1) fi eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') if [ "$ITEM" != ${PG3} ]; then echo "The force-recovery PG $PG3 didn't get promoted to an in progress item" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') if [ "$PRIO" != $FORCE_PRIO ]; then echo "The force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" ERRORS=$(expr $ERRORS + 1) fi fi ceph osd unset noout ceph osd unset norecover wait_for_clean "CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations" || return 1 ceph pg dump pgs CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_pgstate_history if [ $ERRORS != "0" ]; then echo "$ERRORS error(s) found" else echo TEST PASSED fi delete_pool $pool1 delete_pool $pool2 delete_pool $pool3 kill_daemons $dir || return 1 return $ERRORS } # # Show that pool recovery_priority is added to recovery priority # # Create 2 pools with 2 OSDs with different primarys # pool 1 with recovery_priority 1 # pool 2 with recovery_priority 2 # # Start recovery by changing the pool sizes from 1 to 2 # Use dump_recovery_reservations to verify priorities function TEST_recovery_pool_priority() { local dir=$1 local pools=3 # Don't assume the first 2 pools are exact what we want local OSDS=2 run_mon $dir a || return 1 run_mgr $dir x || return 1 export CEPH_ARGS for osd in $(seq 0 $(expr $OSDS - 1)) do run_osd $dir $osd || return 1 done for p in $(seq 1 $pools) do create_pool "${poolprefix}$p" 1 1 ceph osd pool set "${poolprefix}$p" size 2 done sleep 5 wait_for_clean || return 1 ceph pg dump pgs # Find 2 pools with different primaries which # means the replica must be on another osd. local PG1 local POOLNUM1 local pool1 local chk_osd1_1 local chk_osd1_2 local PG2 local POOLNUM2 local pool2 local chk_osd2_1 local chk_osd2_2 for p in $(seq 1 $pools) do ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting local test_osd1=$(head -1 $dir/acting) local test_osd2=$(tail -1 $dir/acting) if [ -z "$PG1" ]; then PG1="${p}.0" POOLNUM1=$p pool1="${poolprefix}$p" chk_osd1_1=$test_osd1 chk_osd1_2=$test_osd2 elif [ $chk_osd1_1 != $test_osd1 ]; then PG2="${p}.0" POOLNUM2=$p pool2="${poolprefix}$p" chk_osd2_1=$test_osd1 chk_osd2_2=$test_osd2 break fi done rm -f $dir/acting if [ "$pool2" = "" ]; then echo "Failure to find appropirate PGs" return 1 fi for p in $(seq 1 $pools) do if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ]; then delete_pool ${poolprefix}$p fi done pool1_extra_prio=1 pool2_extra_prio=2 pool1_prio=$(expr $NORMAL_PRIO + $pool1_extra_prio) pool2_prio=$(expr $NORMAL_PRIO + $pool2_extra_prio) ceph osd pool set $pool1 size 1 --yes-i-really-mean-it ceph osd pool set $pool1 recovery_priority $pool1_extra_prio ceph osd pool set $pool2 size 1 --yes-i-really-mean-it ceph osd pool set $pool2 recovery_priority $pool2_extra_prio wait_for_clean || return 1 dd if=/dev/urandom of=$dir/data bs=1M count=10 p=1 for pname in $pool1 $pool2 do for i in $(seq 1 $objects) do rados -p ${pname} put obj${i}-p${p} $dir/data done p=$(expr $p + 1) done local otherosd=$(get_not_primary $pool1 obj1-p1) ceph pg dump pgs ERRORS=0 ceph osd pool set $pool1 size 2 ceph osd pool set $pool2 size 2 # Wait for both PGs to be in recovering state ceph pg dump pgs # Wait for recovery to start set -o pipefail count=0 while(true) do if test $(ceph --format json pg dump pgs | jq '.pg_stats | .[] | .state | contains("recovering")' | grep -c true) == "2" then break fi sleep 2 if test "$count" -eq "10" then echo "Recovery never started on both PGs" return 1 fi count=$(expr $count + 1) done set +o pipefail ceph pg dump pgs CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_recovery_reservations > $dir/dump.${chk_osd1_1}.out echo osd.${chk_osd1_1} cat $dir/dump.${chk_osd1_1}.out CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_2}) dump_recovery_reservations > $dir/dump.${chk_osd1_2}.out echo osd.${chk_osd1_2} cat $dir/dump.${chk_osd1_2}.out # Using eval will strip double-quotes from item eval ITEM=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].item') if [ "$ITEM" != ${PG1} ]; then echo "The primary PG for $pool1 didn't become the in progress item" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].prio') if [ "$PRIO" != $pool1_prio ]; then echo "The primary PG ${PG1} doesn't have prio $pool1_prio" ERRORS=$(expr $ERRORS + 1) fi fi # Using eval will strip double-quotes from item eval ITEM=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].item') if [ "$ITEM" != ${PG1} ]; then echo "The primary PG for $pool1 didn't become the in progress item on remote" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].prio') if [ "$PRIO" != $pool1_prio ]; then echo "The primary PG ${PG1} doesn't have prio $pool1_prio on remote" ERRORS=$(expr $ERRORS + 1) fi fi # Using eval will strip double-quotes from item eval ITEM=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].item') if [ "$ITEM" != ${PG2} ]; then echo "The primary PG for $pool2 didn't become the in progress item" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].prio') if [ "$PRIO" != $pool2_prio ]; then echo "The primary PG ${PG2} doesn't have prio $pool2_prio" ERRORS=$(expr $ERRORS + 1) fi fi # Using eval will strip double-quotes from item eval ITEM=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].item') if [ "$ITEM" != ${PG2} ]; then echo "The primary PG $PG2 didn't become the in progress item on remote" ERRORS=$(expr $ERRORS + 1) else PRIO=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].prio') if [ "$PRIO" != $pool2_prio ]; then echo "The primary PG ${PG2} doesn't have prio $pool2_prio on remote" ERRORS=$(expr $ERRORS + 1) fi fi wait_for_clean || return 1 if [ $ERRORS != "0" ]; then echo "$ERRORS error(s) found" else echo TEST PASSED fi delete_pool $pool1 delete_pool $pool2 kill_daemons $dir || return 1 return $ERRORS } main osd-recovery-prio "$@" # Local Variables: # compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-prio.sh" # End: