#!/usr/bin/env bash
#
# Copyright (C) 2018 Red Hat
#
# Author: David Zafman
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7180" # git grep '\<7180\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    CEPH_ARGS+="--osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10 "
    CEPH_ARGS+="--fake_statfs_for_testing=3686400 "
    CEPH_ARGS+="--osd_max_backfills=10 "
    CEPH_ARGS+="--osd_mclock_override_recovery_settings=true "
    export objects=600
    export poolprefix=test

    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}

function get_num_in_state() {
    local state=$1
    local expression
    expression+="select(contains(\"${state}\"))"

    ceph --format json pg dump pgs 2>/dev/null | \
        jq ".pg_stats | [.[] | .state | $expression] | length"
}

function wait_for_not_state() {
    local state=$1
    local num_in_state=-1
    local cur_in_state
    local -a delays=($(get_timeout_delays $2 5))
    local -i loop=0

    flush_pg_stats || return 1
    while test $(get_num_pgs) == 0 ; do
        sleep 1
    done

    while true ; do
        cur_in_state=$(get_num_in_state ${state})
        test $cur_in_state = "0" && break
        if test $cur_in_state != $num_in_state ; then
            loop=0
            num_in_state=$cur_in_state
        elif (( $loop >= ${#delays[*]} )) ; then
            ceph pg dump pgs
            return 1
        fi
        sleep ${delays[$loop]}
        loop+=1
    done
    return 0
}

function wait_for_not_backfilling() {
    local timeout=$1
    wait_for_not_state backfilling $timeout
}

function wait_for_not_activating() {
    local timeout=$1
    wait_for_not_state activating $timeout
}

# All tests are created in an environment which has a fake total space
# of 3600K (3686400) which can hold 600 6K replicated objects or
# 200 18K shards of erasure coded objects.  For a k=3, m=2 EC pool
# we have a theoretical 54K object, but with the chunk size of 4K
# and a rounding of 4K to account for the chunks, the maximum object
# is 36K, which is ((36K / 3) + 4K) * 200 = 3200K, or 88% of
# 3600K for a shard.

# Create 2 pools with size 1
# Write enough data that only 1 pool pg can fit per osd
# Increase the pool size to 2
# On 3 OSDs this should result in 1 OSD with overlapping replicas,
# so both pools can't fit.  We assume pgid 1.0 and 2.0 won't
# map to the same 2 OSDs.
# At least 1 pool shouldn't have room to backfill
# All other pools should go active+clean
function TEST_backfill_test_simple() {
    local dir=$1
    local pools=2
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
      create_pool "${poolprefix}$p" 1 1
      ceph osd pool set "${poolprefix}$p" size 1 --yes-i-really-mean-it
    done

    wait_for_clean || return 1

    # This won't work if the 2 pools' primary and only OSDs
    # are the same.
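    #
    # Rough space arithmetic for this scenario (illustrative only, assuming
    # the faked statfs accounting roughly tracks raw object data and ignoring
    # per-object overhead):
    #   backfillfull threshold: 0.85 * 3686400 = 3133440 bytes (~3060K)
    #   one pool's data:        600 objects * 4K ~= 2400K   -> fits
    #   two pools' PGs:         ~4800K                      -> does not fit
    # so an OSD already holding one pool's PG can't accept the other pool's
    # backfill once both pools move to size 2.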
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj$o $dir/datafile
      done
    done

    ceph pg dump pgs

    for p in $(seq 1 $pools)
    do
      ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +backfill_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    expected="$(expr $pools - 1)"
    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "$expected" ];
    then
      echo "$expected didn't finish backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}


# Create 8 pools of size 1 on 20 OSDs
# Write 4K * 600 objects (only 1 pool pg can fit on any given osd)
# Increase pool size to 2
# At least 1 pool shouldn't have room to backfill
# All other pools should go active+clean
function TEST_backfill_test_multi() {
    local dir=$1
    local pools=8
    local OSDS=20

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
      create_pool "${poolprefix}$p" 1 1
      ceph osd pool set "${poolprefix}$p" size 1 --yes-i-really-mean-it
    done

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj$o $dir/datafile
      done
    done

    ceph pg dump pgs

    for p in $(seq 1 $pools)
    do
      ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    full="$(ceph pg dump pgs | grep +backfill_toofull | wc -l)"
    if [ "$full" -lt "1" ];
    then
      echo "At least one pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    expected="$(expr $pools - $full)"
    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "$expected" ];
    then
      echo "$expected didn't finish backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs
    ceph status

    ceph status --format=json-pretty > $dir/stat.json

    eval SEV=$(jq '.health.checks.PG_BACKFILL_FULL.severity' $dir/stat.json)
    if [ "$SEV" != "HEALTH_WARN" ]; then
      echo "PG_BACKFILL_FULL severity $SEV not HEALTH_WARN"
      ERRORS="$(expr $ERRORS + 1)"
    fi
    eval MSG=$(jq '.health.checks.PG_BACKFILL_FULL.summary.message' $dir/stat.json)
    if [ "$MSG" != "Low space hindering backfill (add storage if this doesn't resolve itself): 4 pgs backfill_toofull" ]; then
      echo "PG_BACKFILL_FULL message '$MSG' mismatched"
      ERRORS="$(expr $ERRORS + 1)"
    fi
    rm -f $dir/stat.json

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}

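# For reference (illustrative only), the health JSON parsed with jq in
# TEST_backfill_test_multi above has roughly this shape; only the fields the
# test reads are shown and the exact layout can vary by release:
#   "health": { "checks": { "PG_BACKFILL_FULL": {
#       "severity": "HEALTH_WARN",
#       "summary": { "message": "Low space hindering backfill ..." } } } }
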
# Make sure that when 2 PGs try to backfill at the same time to
# the same target, we handle it.  This might be covered by the simple
# test above, but this makes sure we get it.
#
# Create 10 pools of size 2 and identify 2 that have the same
# non-primary osd.
# Delete all other pools
# Set size to 1 and write 4K * 600 to each pool
# Set size back to 2
# The 2 pools should race to backfill.
# One pool goes active+clean
# The other goes active+...+backfill_toofull
function TEST_backfill_test_sametarget() {
    local dir=$1
    local pools=10
    local OSDS=5

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
      create_pool "${poolprefix}$p" 1 1
      ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 5
    wait_for_clean || return 1

    ceph pg dump pgs

    # Find 2 pools with a pg that has distinct primaries but the second
    # replica on the same osd.
    local PG1
    local POOLNUM1
    local pool1
    local chk_osd1
    local chk_osd2

    local PG2
    local POOLNUM2
    local pool2
    for p in $(seq 1 $pools)
    do
      ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting
      local test_osd1=$(head -1 $dir/acting)
      local test_osd2=$(tail -1 $dir/acting)
      if [ $p = "1" ];
      then
        PG1="${p}.0"
        POOLNUM1=$p
        pool1="${poolprefix}$p"
        chk_osd1=$test_osd1
        chk_osd2=$test_osd2
      elif [ $chk_osd1 != $test_osd1 -a $chk_osd2 = $test_osd2 ];
      then
        PG2="${p}.0"
        POOLNUM2=$p
        pool2="${poolprefix}$p"
        break
      fi
    done
    rm -f $dir/acting

    if [ "$pool2" = "" ];
    then
      echo "Failure to find appropriate PGs"
      return 1
    fi

    for p in $(seq 1 $pools)
    do
      if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ];
      then
        delete_pool ${poolprefix}$p
      fi
    done

    ceph osd pool set $pool1 size 1 --yes-i-really-mean-it
    ceph osd pool set $pool2 size 1 --yes-i-really-mean-it

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for i in $(seq 1 $objects)
    do
      rados -p $pool1 put obj$i $dir/datafile
      rados -p $pool2 put obj$i $dir/datafile
    done

    ceph osd pool set $pool1 size 2
    ceph osd pool set $pool2 size 2
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +backfill_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "1" ];
    then
      echo "One didn't finish backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    delete_pool $pool1
    delete_pool $pool2
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}

# 2 pools can't both backfill to a target which has other data
# 1 of the pools has objects that increase from 1024 to 2611 bytes
#
# Write to the fill pool which is size 1
# Take the fill pool OSD down (the other 2 pools must go to the remaining OSDs)
# Save an export of the data on the fill OSD and restart it
# Write an initial 1K to pool1 which has pg 2.0
# Export 2.0 from a non-fillpool OSD (don't wait for it to start up)
# Take down the fillpool OSD
# Put the 1K object version of 2.0 on the fillpool OSD
# Put the fillpool data back on the fillpool OSD
# With the fillpool OSD down, write 2611 byte objects
# Take down $osd and bring back $fillosd simultaneously
# Wait for backfilling
# One PG will be able to backfill its remaining data
# One PG must get backfill_toofull
function TEST_backfill_multi_partial() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it
    for p in $(seq 1 $pools)
    do
      create_pool "${poolprefix}$p" 1 1
      ceph osd pool set "${poolprefix}$p" size 2
    done

    wait_for_clean || return 1

    # Partially fill an osd
    # We have room for 600 6K replicated objects, if we create 2611 byte objects
    # there is 3600K - (2611 * 600) = 2070K, so the fill pool and one
    # replica from the other 2 is 85% of 3600K

    dd if=/dev/urandom of=$dir/datafile bs=2611 count=1
    for o in $(seq 1 $objects)
    do
      rados -p fillpool put obj-fill-${o} $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj-fill-1)
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
      osd="0"
    fi

    kill_daemon $dir/osd.$fillosd.pid TERM
    ceph osd out osd.$fillosd

    _objectstore_tool_nodown $dir $fillosd --op export-remove --pgid 1.0 --file $dir/fillexport.out || return 1
    activate_osd $dir $fillosd || return 1

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=1
    for o in $(seq 1 $objects)
    do
      rados -p "${poolprefix}1" put obj-1-${o} $dir/datafile
    done

    ceph pg dump pgs
    # The $osd OSD is started, but we don't wait so we can kill $fillosd at the same time
    _objectstore_tool_nowait $dir $osd --op export --pgid 2.0 --file $dir/export.out
    kill_daemon $dir/osd.$fillosd.pid TERM
    _objectstore_tool_nodown $dir $fillosd --force --op remove --pgid 2.0
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 2.0 --file $dir/export.out || return 1
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 1.0 --file $dir/fillexport.out || return 1
    ceph pg dump pgs
    sleep 20
    ceph pg dump pgs

    # re-write everything
    dd if=/dev/urandom of=$dir/datafile bs=2611 count=1
    for o in $(seq 1 $objects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj-${p}-${o} $dir/datafile
      done
    done

    kill_daemon $dir/osd.$osd.pid TERM
    ceph osd out osd.$osd

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    flush_pg_stats || return 1
    ceph pg dump pgs

    ERRORS=0
    if [ "$(get_num_in_state backfill_toofull)" != "1" ];
    then
      echo "One PG should be in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(get_num_in_state active+clean)" != "2" ];
    then
      echo "Two PGs should be active+clean after one PG completed backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}

# Make sure that the amount of bytes already on the replica doesn't
# cause an out of space condition
#
# Create 1 pool and write 4K * 600 objects
# Remove 25% (150) of the objects with one OSD down (noout set)
# Increase the size of the remaining 75% (450) of the objects to 6K
# Bring back the down OSD
# The pool should go active+clean
function TEST_backfill_grow() {
    local dir=$1
    local poolname="test"
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 3
    sleep 5
    wait_for_clean || return 1

    dd if=/dev/urandom of=${dir}/4kdata bs=1k count=4
    for i in $(seq 1 $objects)
    do
      rados -p $poolname put obj$i $dir/4kdata
    done

    local PG=$(get_pg $poolname obj1)
    # Remember primary during the backfill
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set noout
    kill_daemons $dir TERM $otherosd || return 1

    rmobjects=$(expr $objects / 4)
    for i in $(seq 1 $rmobjects)
    do
      rados -p $poolname rm obj$i
    done

    dd if=/dev/urandom of=${dir}/6kdata bs=6k count=1
    for i in $(seq $(expr $rmobjects + 1) $objects)
    do
      rados -p $poolname put obj$i $dir/6kdata
    done

    activate_osd $dir $otherosd || return 1

    ceph tell osd.$primary debug kick_recovery_wq 0

    sleep 2

    wait_for_clean || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}

# Create a 5 shard EC pool on a 6 OSD cluster
# Fill 1 OSD with 2600K of data and take that OSD down
# Write the EC pool on 5 OSDs
# Take down 1 (must contain an EC shard)
# Bring up the OSD with the fill data
# Not enough room to backfill to the partially full OSD
function TEST_ec_backfill_simple() {
    local dir=$1
    local EC=$2
    local pools=1
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 13K objects
    # there is only 3600K - (13K * 200) = 1000K which won't hold
    # a k=3 shard below ((18K / 3) + 4K) * 200 = 2000K
    # Actual usage per shard is 8K * 200 = 1600K because 18K/3 is 6K which
    # rounds to 8K.  The 2000K is the ceiling on the 18K * 200 = 3600K logical
    # bytes in the pool.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=13
    for o in $(seq 1 $ecobjects)
    do
      rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
      osd="0"
    fi

    sleep 5
    kill_daemon $dir/osd.$fillosd.pid TERM
    ceph osd out osd.$fillosd
    sleep 2
    ceph osd erasure-code-profile set ec-profile k=$k m=$m crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    for p in $(seq 1 $pools)
    do
      ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 5

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=18
    for o in $(seq 1 $ecobjects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj$o $dir/datafile
      done
    done

    kill_daemon $dir/osd.$osd.pid TERM
    ceph osd out osd.$osd

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    ceph pg dump pgs

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}

function osdlist() {
    local OSDS=$1
    local excludeosd=$2

    osds=""
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      if [ $osd = $excludeosd ];
      then
        continue
      fi
      if [ -n "$osds" ]; then
        osds="${osds} "
      fi
      osds="${osds}${osd}"
    done
    echo $osds
}

# Create a pool with size 1 and fill it with data so that only 1 EC shard can fit.
# Write data to 2 EC pools mapped to the same OSDs (excluding the filled one)
# Remap the last OSD to the partially full OSD on both pools
# The 2 pools should race to backfill.
# One pool goes active+clean
# The other goes active+...+backfill_toofull
function TEST_ec_backfill_multi() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    # This test requires that shards from 2 different pools
    # fit on a given OSD, but both will not fit.  Make the fillosd
    # plus 1 shard use 75% of the space, leaving not enough to stay
    # under the 85% set here.
    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 9K objects
    # there is only 3600K - (9K * 200) = 1800K which will only hold
    # one k=3 shard below ((12K / 3) + 4K) * 200 = 1600K
    # The actual data will be (12K / 3) * 200 = 800K because the extra
    # is the reservation padding for chunking.
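    #
    # Worked out (illustrative, restating the numbers above):
    #   fillpool usage:        200 * 9K              = 1800K
    #   free on the fill OSD:  3600K - 1800K         = 1800K
    #   one EC shard reserves  ((12K/3) + 4K) * 200  = 1600K  -> fits
    #   two shards would need  2 * 1600K             = 3200K  -> too full
    # so exactly one of the two EC pools should manage to backfill onto the
    # fill OSD and the other should end up backfill_toofull.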
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=9
    for o in $(seq 1 $ecobjects)
    do
      rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    nonfillosds="$(osdlist $OSDS $fillosd)"

    for p in $(seq 1 $pools)
    do
      ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
      ceph osd pg-upmap "$(expr $p + 1).0" $nonfillosds
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 15

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $ecobjects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj$o-$p $dir/datafile
      done
    done

    ceph pg dump pgs

    for p in $(seq 1 $pools)
    do
      ceph osd pg-upmap $(expr $p + 1).0 ${nonfillosds% *} $fillosd
    done
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
      echo "One didn't finish backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}

# Similar to TEST_ec_backfill_multi but one of the EC pools
# already has some data on the target OSD
#
# Create a pool with size 1 and fill it with data so that only 1 EC shard can fit.
# Write a small amount of data to 1 EC pool that still includes the filled one
# Take down the fillosd with noout set
# Write data to 2 EC pools mapped to the same OSDs (excluding the filled one)
# Remap the last OSD to the partially full OSD on both pools
# The 2 pools should race to backfill.
# One pool goes active+clean
# The other goes active+...+backfill_toofull
function SKIP_TEST_ec_backfill_multi_partial() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=5
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)
    local lastosd=$(expr $OSDS - 1)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    # This test requires that shards from 2 different pools
    # fit on a given OSD, but both will not fit.  Make the fillosd
    # plus 1 shard use 75% of the space, leaving not enough to stay
    # under the 85% set here.
    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it
    # last osd
    ceph osd pg-upmap 1.0 $lastosd

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 9K objects
    # there is only 3600K - (9K * 200) = 1800K which will only hold
    # one k=3 shard below ((12K / 3) + 4K) * 200 = 1600K
    # The actual data will be (12K / 3) * 200 = 800K because the extra
    # is the reservation padding for chunking.
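    #
    # Note on the pg-upmap calls used above and below (descriptive only):
    # "ceph osd pg-upmap <pgid> <osd> [<osd>...]" pins a PG's raw mapping to
    # the listed OSDs, overriding CRUSH.  This test uses it to steer the EC
    # PGs away from $lastosd for the bulk writes, then adds $lastosd back so
    # the two PGs race to backfill onto it.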
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=9
    for o in $(seq 1 $ecobjects)
    do
      rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    nonfillosds="$(osdlist $OSDS $fillosd)"

    for p in $(seq 1 $pools)
    do
      ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
      ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $lastosd)
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 15

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=1
    for o in $(seq 1 $ecobjects)
    do
      rados -p "${poolprefix}1" put obj$o-1 $dir/datafile
    done

    for p in $(seq 1 $pools)
    do
      ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $(expr $lastosd - 1))
    done
    ceph pg dump pgs

    #ceph osd set noout
    #kill_daemons $dir TERM osd.$lastosd || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $ecobjects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj$o-$p $dir/datafile
      done
    done

    ceph pg dump pgs

    # Now backfill lastosd by adding back into the upmap
    for p in $(seq 1 $pools)
    do
      ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $lastosd)
    done
    #activate_osd $dir $lastosd || return 1
    #ceph tell osd.0 debug kick_recovery_wq 0
    sleep 30
    ceph pg dump pgs

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
      echo "One didn't finish backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}

function SKIP_TEST_ec_backfill_multi_partial() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=6

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    # Below we need to fit 3200K in 3600K which is 88%
    # so set to 90%
    ceph osd set-backfillfull-ratio .90

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it

    # Partially fill an osd
    # We have room for 200 48K EC objects, if we create 4K replicated objects
    # there is 3600K - (4K * 200) = 2800K which won't hold 2 k=3 shards
    # of 200 12K objects which take ((12K / 3) + 4K) * 200 = 1600K each.
    # On the other OSDs 2 * 1600K = 3200K which is 88% of 3600K.
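    #
    # Later in this test an objectstore_tool export/import (presumably) seeds
    # pg 2.0's shard onto the fill OSD before it is brought back in, so one EC
    # pool already has its data accounted for on the target while the other
    # must backfill in full; only the latter should hit backfill_toofull.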
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
      rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
      osd="0"
    fi

    sleep 5
    kill_daemon $dir/osd.$fillosd.pid TERM
    ceph osd out osd.$fillosd
    sleep 2
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    for p in $(seq 1 $pools)
    do
      ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 5

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $objects)
    do
      for p in $(seq 1 $pools)
      do
        rados -p "${poolprefix}$p" put obj$o $dir/datafile
      done
    done

    #ceph pg map 2.0 --format=json | jq '.'
    kill_daemon $dir/osd.$osd.pid TERM
    ceph osd out osd.$osd

    _objectstore_tool_nodown $dir $osd --op export --pgid 2.0 --file $dir/export.out
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 2.0 --file $dir/export.out

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in backfill_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
      echo "One didn't finish backfill"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}

# Create 1 EC pool
# Write 200 12K objects: ((12K / 3) + 4K) * 200 = 1600K
# Take 1 shard's OSD down (with noout set)
# Remove 50 objects: ((12K / 3) + 4K) * 50 = 400K
# Write 150 36K objects (grow 150 objects) 2400K
# But there is already 1600K usage so backfill
# would be too full if it didn't account for existing data
# Bring back the down OSD so it must backfill
# It should go active+clean taking into account data already there
function TEST_ec_backfill_grow() {
    local dir=$1
    local poolname="test"
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    ceph osd erasure-code-profile set ec-profile k=$k m=$m crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1
    ceph osd pool create $poolname 1 1 erasure ec-profile

    wait_for_clean || return 1

    dd if=/dev/urandom of=${dir}/12kdata bs=1k count=12
    for i in $(seq 1 $ecobjects)
    do
      rados -p $poolname put obj$i $dir/12kdata
    done

    local PG=$(get_pg $poolname obj1)
    # Remember primary during the backfill
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set noout
    kill_daemons $dir TERM $otherosd || return 1

    rmobjects=$(expr $ecobjects / 4)
    for i in $(seq 1 $rmobjects)
    do
      rados -p $poolname rm obj$i
    done

    dd if=/dev/urandom of=${dir}/36kdata bs=1k count=36
    for i in $(seq $(expr $rmobjects + 1) $ecobjects)
    do
      rados -p $poolname put obj$i $dir/36kdata
    done

    activate_osd $dir $otherosd || return 1

    ceph tell osd.$primary debug kick_recovery_wq 0

    sleep 2

    wait_for_clean || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

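# The tests are normally driven through the standalone harness, e.g.
#   ../qa/run-standalone.sh osd-backfill-space.sh
# Because run() above defaults its function list with ${@:-...}, any extra
# arguments passed to this script should be treated as the TEST_* functions to
# execute, so a single scenario can typically be run on its own, e.g.
#   bash osd-backfill-space.sh TEST_backfill_test_simple
# (assuming the build/environment that run-standalone.sh normally sets up).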
main osd-backfill-space "$@"

# Local Variables:
# compile-command: "make -j4 && ../qa/run-standalone.sh osd-backfill-space.sh"
# End: