Diffstat (limited to 'qa/workunits')
20 files changed, 556 insertions, 48 deletions
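The first addition below, test_cephadm_timeout.py, forces a cephadm command timeout by grabbing the same per-fsid flock that cephadm itself takes while a device refresh is in flight. A rough shell equivalent of the same trick, sketched with flock(1) from util-linux — the config option and health checks come from the test itself, while the exact lock path is an assumption derived from the script's LOCK_DIR constant:

    #!/usr/bin/env bash
    # Sketch only: hold cephadm's per-fsid flock so the mgr-triggered
    # "cephadm ceph-volume -- inventory" call exceeds its command timeout.
    set -ex
    fsid=$(ceph fsid)                    # same value the test discovers under /var/lib/ceph
    lockfile=/run/cephadm/${fsid}.lock   # assumed: LOCK_DIR + fsid + '.lock'

    ceph config set mgr mgr/cephadm/default_cephadm_command_timeout 120
    flock --exclusive ${lockfile} sleep 300 &   # hold the lock for 300s in the background
    ceph orch device ls --refresh               # the refresh now has to wait on the lock
    sleep 150                                   # give the 120s timeout time to fire
    ceph health detail | grep CEPHADM_REFRESH_FAILED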
diff --git a/qa/workunits/cephadm/test_cephadm_timeout.py b/qa/workunits/cephadm/test_cephadm_timeout.py
new file mode 100755
index 000000000..67b43a2df
--- /dev/null
+++ b/qa/workunits/cephadm/test_cephadm_timeout.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python3 -s
+
+import time
+import os
+import fcntl
+import subprocess
+import uuid
+import sys
+
+from typing import Optional, Any
+
+LOCK_DIR = '/run/cephadm'
+DATA_DIR = '/var/lib/ceph'
+
+class _Acquire_ReturnProxy(object):
+    def __init__(self, lock: 'FileLock') -> None:
+        self.lock = lock
+        return None
+
+    def __enter__(self) -> 'FileLock':
+        return self.lock
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        self.lock.release()
+        return None
+
+class FileLock(object):
+    def __init__(self, name: str, timeout: int = -1) -> None:
+        if not os.path.exists(LOCK_DIR):
+            os.mkdir(LOCK_DIR, 0o700)
+        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
+
+        self._lock_file_fd: Optional[int] = None
+        self.timeout = timeout
+        self._lock_counter = 0
+        return None
+
+    @property
+    def is_locked(self) -> bool:
+        return self._lock_file_fd is not None
+
+    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> _Acquire_ReturnProxy:
+        # Use the default timeout, if no timeout is provided.
+        if timeout is None:
+            timeout = self.timeout
+
+        # Increment the number right at the beginning.
+        # We can still undo it, if something fails.
+        self._lock_counter += 1
+
+        start_time = time.time()
+        try:
+            while True:
+                if not self.is_locked:
+                    self._acquire()
+
+                if self.is_locked:
+                    break
+                elif timeout >= 0 and time.time() - start_time > timeout:
+                    raise Exception(self._lock_file)
+                else:
+                    time.sleep(poll_intervall)
+        except Exception:
+            # Something did go wrong, so decrement the counter.
+            self._lock_counter = max(0, self._lock_counter - 1)
+
+            raise
+        return _Acquire_ReturnProxy(lock=self)
+
+    def release(self, force: bool = False) -> None:
+        if self.is_locked:
+            self._lock_counter -= 1
+
+            if self._lock_counter == 0 or force:
+                self._release()
+                self._lock_counter = 0
+
+        return None
+
+    def __enter__(self) -> 'FileLock':
+        self.acquire()
+        return self
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        self.release()
+        return None
+
+    def __del__(self) -> None:
+        self.release(force=True)
+        return None
+
+    def _acquire(self) -> None:
+        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
+        fd = os.open(self._lock_file, open_mode)
+
+        try:
+            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except (IOError, OSError):
+            os.close(fd)
+        else:
+            self._lock_file_fd = fd
+        return None
+
+    def _release(self) -> None:
+        fd = self._lock_file_fd
+        self._lock_file_fd = None
+        fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
+        os.close(fd)  # type: ignore
+        return None
+
+def _is_fsid(s):
+    try:
+        uuid.UUID(s)
+    except ValueError:
+        return False
+    return True
+
+def find_fsid():
+    if not os.path.exists(DATA_DIR):
+        raise Exception(f'{DATA_DIR} does not exist. Aborting...')
+
+    for d in os.listdir(DATA_DIR):
+        # assume the first thing we find that is an fsid
+        # is what we want. Not expecting multiple clusters
+        # to have been installed here.
+        if _is_fsid(d):
+            return d
+    raise Exception(f'No fsid dir found in {DATA_DIR}. Aborting...')
+
+def main():
+    print('Looking for cluster fsid...')
+    fsid = find_fsid()
+    print(f'Found fsid {fsid}')
+
+    print('Setting cephadm command timeout to 120...')
+    subprocess.run(['cephadm', 'shell', '--', 'ceph', 'config', 'set',
+                    'mgr', 'mgr/cephadm/default_cephadm_command_timeout', '120'],
+                   check=True)
+
+    print('Taking hold of cephadm lock for 300 seconds...')
+    lock = FileLock(fsid, 300)
+    lock.acquire()
+
+    print('Triggering cephadm device refresh...')
+    subprocess.run(['cephadm', 'shell', '--', 'ceph', 'orch', 'device', 'ls', '--refresh'],
+                   check=True)
+
+    print('Sleeping 150 seconds to allow for timeout to occur...')
+    time.sleep(150)
+
+    print('Checking ceph health detail...')
+    # directing stdout to res.stdout via "capture_stdout" option
+    # (and same for stderr) seems to have been added in python 3.7.
+    # Using files so this works with 3.6 as well
+    with open('/tmp/ceph-health-detail-stdout', 'w') as f_stdout:
+        with open('/tmp/ceph-health-detail-stderr', 'w') as f_stderr:
+            subprocess.run(['cephadm', 'shell', '--', 'ceph', 'health', 'detail'],
+                           check=True, stdout=f_stdout, stderr=f_stderr)
+
+    res_stdout = open('/tmp/ceph-health-detail-stdout', 'r').read()
+    res_stderr = open('/tmp/ceph-health-detail-stderr', 'r').read()
+    print(f'"cephadm shell -- ceph health detail" stdout:\n{res_stdout}')
+    print(f'"cephadm shell -- ceph health detail" stderr:\n{res_stderr}')
+
+    print('Checking for correct health warning in health detail...')
+    if 'CEPHADM_REFRESH_FAILED' not in res_stdout:
+        raise Exception('No health warning caused by timeout was raised')
+    if 'Command "cephadm ceph-volume -- inventory" timed out' not in res_stdout:
+        raise Exception('Health warnings did not contain message about time out')
+
+    print('Health warnings found successfully. Exiting.')
+    return 0
+
+
+if __name__ == '__main__':
+    if os.getuid() != 0:
+        print('Trying to run myself with sudo...')
+        os.execvp('sudo', [sys.executable] + list(sys.argv))
+    main()
diff --git a/qa/workunits/fs/full/subvolume_clone.sh b/qa/workunits/fs/full/subvolume_clone.sh
index a11131215..d61e07111 100755
--- a/qa/workunits/fs/full/subvolume_clone.sh
+++ b/qa/workunits/fs/full/subvolume_clone.sh
@@ -7,8 +7,8 @@ set -ex
 # Hence the subsequent subvolume commands on the clone fails with
 # 'MetadataMgrException: -2 (section 'GLOBAL' does not exist)' traceback.
 
-# The osd is of the size 1GB. The full-ratios are set so that osd is treated full
-# at around 600MB. The subvolume is created and 100MB is written.
+# The osd is of the size 2GiB. The full-ratios are set so that osd is treated full
+# at around 1.2GB. The subvolume is created and 200MB is written.
 # The subvolume is snapshotted and cloned ten times. Since the clone delay is set to 15 seconds,
 # all the clones reach pending state for sure. Among ten clones, only few succeed and rest fails
 # with ENOSPACE.
@@ -47,7 +47,7 @@ echo "After ratios are set"
 df -h
 ceph osd df
 
-for i in {1..100};do sudo dd if=/dev/urandom of=$CEPH_MNT$subvol_path_0/1MB_file-$i status=progress bs=1M count=1 conv=fdatasync;done
+for i in {1..100};do sudo dd if=/dev/urandom of=$CEPH_MNT$subvol_path_0/2MB_file-$i status=progress bs=1M count=2 conv=fdatasync;done
 
 # For debugging
 echo "After subvolumes are written"
@@ -60,6 +60,9 @@ ceph fs subvolume snapshot create cephfs sub_0 snap_0
 # Set clone snapshot delay
 ceph config set mgr mgr/volumes/snapshot_clone_delay 15
 
+# Disable the snapshot_clone_no_wait config option
+ceph config set mgr mgr/volumes/snapshot_clone_no_wait false
+
 # Schedule few clones, some would fail with no space
 for i in $(eval echo {1..$NUM_CLONES});do ceph fs subvolume snapshot clone cephfs sub_0 snap_0 clone_$i;done
diff --git a/qa/workunits/fs/full/subvolume_rm.sh b/qa/workunits/fs/full/subvolume_rm.sh
index a464e30f5..2a3bf956d 100755
--- a/qa/workunits/fs/full/subvolume_rm.sh
+++ b/qa/workunits/fs/full/subvolume_rm.sh
@@ -2,8 +2,8 @@
 set -ex
 
 # This testcase tests the scenario of the 'ceph fs subvolume rm' mgr command
-# when the osd is full. The command used to hang. The osd is of the size 1GB.
-# The subvolume is created and 500MB file is written. The full-ratios are
+# when the osd is full. The command used to hang. The osd is of the size 2GiB.
+# The subvolume is created and 1GB file is written. The full-ratios are
 # set below 500MB such that the osd is treated as full. Now the subvolume is
 # is removed. This should be successful with the introduction of FULL
 # capabilities which the mgr holds.
@@ -21,7 +21,7 @@ echo "Before write"
 df -h
 ceph osd df
 
-sudo dd if=/dev/urandom of=$CEPH_MNT$subvol_path/500MB_file-1 status=progress bs=1M count=500
+sudo dd if=/dev/urandom of=$CEPH_MNT$subvol_path/1GB_file-1 status=progress bs=1M count=1000
 
 ceph osd set-full-ratio 0.2
 ceph osd set-nearfull-ratio 0.16
diff --git a/qa/workunits/fs/full/subvolume_snapshot_rm.sh b/qa/workunits/fs/full/subvolume_snapshot_rm.sh
index f6d0add9f..8df89d3c7 100755
--- a/qa/workunits/fs/full/subvolume_snapshot_rm.sh
+++ b/qa/workunits/fs/full/subvolume_snapshot_rm.sh
@@ -7,8 +7,8 @@ set -ex
 # snapshot rm of the same snapshot fails with 'MetadataMgrException: -2 (section 'GLOBAL' does not exist)'
 # traceback.
 
-# The osd is of the size 1GB. The subvolume is created and 800MB file is written.
-# Then full-ratios are set below 500MB such that the osd is treated as full.
+# The osd is of the size 2GiB. The subvolume is created and 1.6GB file is written.
+# Then full-ratios are set below 1GiB such that the osd is treated as full.
 # The subvolume snapshot is taken which succeeds as no extra space is required
 # for snapshot. Now, the removal of the snapshot fails with ENOSPACE as it
 # fails to remove the snapshot metadata set. The snapshot removal fails
@@ -31,8 +31,8 @@ echo "Before write"
 df $CEPH_MNT
 ceph osd df
 
-# Write 800MB file and set full ratio to around 200MB
-ignore_failure sudo dd if=/dev/urandom of=$CEPH_MNT$subvol_path/800MB_file-1 status=progress bs=1M count=800 conv=fdatasync
+# Write 1.6GB file and set full ratio to around 400MB
+ignore_failure sudo dd if=/dev/urandom of=$CEPH_MNT$subvol_path/1.6GB_file-1 status=progress bs=1M count=1600 conv=fdatasync
 
 ceph osd set-full-ratio 0.2
 ceph osd set-nearfull-ratio 0.16
diff --git a/qa/workunits/fs/quota/quota.sh b/qa/workunits/fs/quota/quota.sh
index 1315be6d8..a2f5c459d 100755
--- a/qa/workunits/fs/quota/quota.sh
+++ b/qa/workunits/fs/quota/quota.sh
@@ -29,7 +29,7 @@ mkdir quota-test
 cd quota-test
 
 # bytes
-setfattr . -n ceph.quota.max_bytes -v 100000000  # 100m
+setfattr . -n ceph.quota.max_bytes -v 100M
 expect_false write_file big 1000     # 1g
 expect_false write_file second 10
 setfattr . -n ceph.quota.max_bytes -v 0
@@ -57,7 +57,7 @@ rm -rf *
 # mix
 mkdir bytes bytes/files
 
-setfattr bytes -n ceph.quota.max_bytes -v 10000000   #10m
+setfattr bytes -n ceph.quota.max_bytes -v 10M
 setfattr bytes/files -n ceph.quota.max_files -v 5
 dd if=/dev/zero of=bytes/files/1 bs=1M count=4
 dd if=/dev/zero of=bytes/files/2 bs=1M count=4
@@ -78,7 +78,7 @@ rm -rf *
 #mv
 mkdir files limit
 truncate files/file -s 10G
-setfattr limit -n ceph.quota.max_bytes -v 1000000  #1m
+setfattr limit -n ceph.quota.max_bytes -v 1M
 expect_false mv files limit/
 
@@ -88,8 +88,8 @@ rm -rf *
 #limit by ancestor
 
 mkdir -p ancestor/p1/p2/parent/p3
-setfattr ancestor -n ceph.quota.max_bytes -v 1000000
-setfattr ancestor/p1/p2/parent -n ceph.quota.max_bytes -v 1000000000 #1g
+setfattr ancestor -n ceph.quota.max_bytes -v 1M
+setfattr ancestor/p1/p2/parent -n ceph.quota.max_bytes -v 1G
 expect_false write_file ancestor/p1/p2/parent/p3/file1 900 #900m
 
 stat --printf="%n %s\n" ancestor/p1/p2/parent/p3/file1
@@ -104,6 +104,14 @@ expect_false setfattr -n ceph.quota.max_bytes -v -1 .
 expect_false setfattr -n ceph.quota.max_bytes -v -9223372036854775808 .
 expect_false setfattr -n ceph.quota.max_bytes -v -9223372036854775809 .
 
+setfattr -n ceph.quota.max_bytes -v 0 .
+setfattr -n ceph.quota.max_bytes -v 1Ti .
+setfattr -n ceph.quota.max_bytes -v 8388607Ti .
+expect_false setfattr -n ceph.quota.max_bytes -v 8388608Ti .
+expect_false setfattr -n ceph.quota.max_bytes -v -1Ti .
+expect_false setfattr -n ceph.quota.max_bytes -v -8388609Ti .
+expect_false setfattr -n ceph.quota.max_bytes -v -8388610Ti .
+
 setfattr -n ceph.quota.max_files -v 0 .
 setfattr -n ceph.quota.max_files -v 1 .
 setfattr -n ceph.quota.max_files -v 9223372036854775807 .
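The quota.sh changes above switch ceph.quota.max_bytes to human-readable sizes, where the suffixes are parsed as binary multiples (1Ti = 2^40 bytes, which the 8388607Ti/8388608Ti boundary confirms). The new boundary values follow directly from signed 64-bit arithmetic: 8388607Ti = (2^23 - 1) * 2^40 still fits below 2^63 - 1, while 8388608Ti is exactly 2^63 and must be rejected. A quick check of that arithmetic in bash, whose integers are themselves signed 64-bit, so the overflow is directly visible:

    INT64_MAX=$(( (1 << 63) - 1 ))         # 9223372036854775807
    TI=$(( 1 << 40 ))                      # one tebibyte in bytes

    echo $(( 8388607 * TI ))               # 9223370937343148032 -> fits, accepted
    echo $(( 8388607 * TI <= INT64_MAX ))  # prints 1
    echo $(( 8388608 * TI ))               # wraps to -9223372036854775808 -> rejected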
diff --git a/qa/workunits/kernel_untar_build.sh b/qa/workunits/kernel_untar_build.sh
index 9b60f065c..602ce04a7 100755
--- a/qa/workunits/kernel_untar_build.sh
+++ b/qa/workunits/kernel_untar_build.sh
@@ -2,11 +2,11 @@
 set -e
 
-wget -O linux.tar.gz http://download.ceph.com/qa/linux-5.4.tar.gz
+wget -O linux.tar.xz http://download.ceph.com/qa/linux-6.5.11.tar.xz
 
 mkdir t
 cd t
-tar xzf ../linux.tar.gz
+tar xJf ../linux.tar.xz
 cd linux*
 make defconfig
 make -j`grep -c processor /proc/cpuinfo`
diff --git a/qa/workunits/mon/config.sh b/qa/workunits/mon/config.sh
index 1b00201ae..10cbe5630 100755
--- a/qa/workunits/mon/config.sh
+++ b/qa/workunits/mon/config.sh
@@ -98,11 +98,11 @@ ceph tell osd.0 config unset debug_asok
 ceph tell osd.0 config unset debug_asok
 ceph config rm osd.0 debug_asok
 
-while ceph config show osd.0 | grep debug_asok | grep mon
+while ceph config show osd.0 | grep '^debug_asok[[:space:]]' | grep mon
 do
     sleep 1
 done
-ceph config show osd.0 | grep -c debug_asok | grep 0
+ceph config show osd.0 | grep -c '^debug_asok[[:space:]]' | grep 0
 
 ceph config set osd.0 osd_scrub_cost 123
 while ! ceph config show osd.0 | grep osd_scrub_cost | grep mon
@@ -111,6 +111,13 @@ do
 done
 ceph config rm osd.0 osd_scrub_cost
 
+#RGW daemons test config set
+ceph config set client.rgw debug_rgw 22
+while ! ceph config show client.rgw | grep debug_rgw | grep 22 | grep mon
+do
+    sleep 1
+done
+
 # show-with-defaults
 ceph config show-with-defaults osd.0 | grep debug_asok
@@ -130,6 +137,21 @@ rm -f $t1 $t2
 
 expect_false ceph config reset
 expect_false ceph config reset -1
+
+
+# test parallel config set
+# reproducer for https://tracker.ceph.com/issues/62832
+ceph config reset 0
+for ((try = 0; try < 10; try++)); do
+    set +x
+    for ((i = 0; i < 100; i++)); do
+        # Use a config that will get "handled" by the Objecter instantiated by the ceph binary
+        ceph config set client rados_mon_op_timeout $((i+300)) &
+    done 2> /dev/null
+    set -x
+    wait
+done
+
 # we are at end of testing, so it's okay to revert everything
 ceph config reset 0
diff --git a/qa/workunits/mon/rbd_snaps_ops.sh b/qa/workunits/mon/rbd_snaps_ops.sh
index eb88565ea..0e5b16b7b 100755
--- a/qa/workunits/mon/rbd_snaps_ops.sh
+++ b/qa/workunits/mon/rbd_snaps_ops.sh
@@ -36,6 +36,7 @@ expect 'rbd --pool=test snap ls image' 0
 expect 'rbd --pool=test snap rm image@snapshot' 0
 
 expect 'ceph osd pool mksnap test snapshot' 22
+expect 'rados -p test mksnap snapshot' 1
 
 expect 'ceph osd pool delete test test --yes-i-really-really-mean-it' 0
 
@@ -52,6 +53,8 @@ expect 'rbd --pool test-foo snap create image@snapshot' 0
 ceph osd pool delete test-bar test-bar --yes-i-really-really-mean-it || true
 expect 'ceph osd pool create test-bar 8' 0
 expect 'ceph osd pool application enable test-bar rbd'
+# "rados cppool" without --yes-i-really-mean-it should fail
+expect 'rados cppool test-foo test-bar' 1
 expect 'rados cppool test-foo test-bar --yes-i-really-mean-it' 0
 expect 'rbd --pool test-bar snap rm image@snapshot' 95
 expect 'ceph osd pool delete test-foo test-foo --yes-i-really-really-mean-it' 0
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh
index 57279d26d..15c47074d 100755
--- a/qa/workunits/rbd/cli_generic.sh
+++ b/qa/workunits/rbd/cli_generic.sh
@@ -432,6 +432,7 @@ test_trash() {
     rbd trash mv test2
     ID=`rbd trash ls | cut -d ' ' -f 1`
     rbd info --image-id $ID | grep "rbd image 'test2'"
+    rbd children --image-id $ID | wc -l | grep 0
 
     rbd trash restore $ID
     rbd ls | grep test2
@@ -449,6 +450,7 @@ test_trash() {
     rbd create $RBD_CREATE_ARGS -s 1 test1
     rbd snap create test1@snap1
     rbd snap protect test1@snap1
+    rbd clone test1@snap1 clone
 
     rbd trash mv test1
     rbd trash ls | grep test1
@@ -459,7 +461,10 @@ test_trash() {
     ID=`rbd trash ls | cut -d ' ' -f 1`
     rbd snap ls --image-id $ID | grep -v 'SNAPID' | wc -l | grep 1
     rbd snap ls --image-id $ID | grep '.*snap1.*'
+    rbd children --image-id $ID | wc -l | grep 1
+    rbd children --image-id $ID | grep 'clone'
 
+    rbd rm clone
     rbd snap unprotect --image-id $ID --snap snap1
     rbd snap rm --image-id $ID --snap snap1
     rbd snap ls --image-id $ID | grep -v 'SNAPID' | wc -l | grep 0
@@ -1261,7 +1266,6 @@ test_trash_purge_schedule_recovery() {
         jq 'select(.name == "rbd_support")' |
         jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add')
     ceph osd blocklist add $CLIENT_ADDR
-    ceph osd blocklist ls | grep $CLIENT_ADDR
 
     # Check that you can add a trash purge schedule after a few retries
     expect_fail rbd trash purge schedule add -p rbd3 10m
@@ -1420,7 +1424,6 @@ test_mirror_snapshot_schedule_recovery() {
         jq 'select(.name == "rbd_support")' |
         jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add')
     ceph osd blocklist add $CLIENT_ADDR
-    ceph osd blocklist ls | grep $CLIENT_ADDR
 
     # Check that you can add a mirror snapshot schedule after a few retries
     expect_fail rbd mirror snapshot schedule add -p rbd3/ns1 --image test1 2m
@@ -1529,7 +1532,6 @@ test_perf_image_iostat_recovery() {
         jq 'select(.name == "rbd_support")' |
         jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add')
     ceph osd blocklist add $CLIENT_ADDR
-    ceph osd blocklist ls | grep $CLIENT_ADDR
 
     expect_fail rbd perf image iostat --format json rbd3/ns
     sleep 10
@@ -1661,7 +1663,6 @@ test_tasks_recovery() {
        jq 'select(.name == "rbd_support")' |
        jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add')
     ceph osd blocklist add $CLIENT_ADDR
-    ceph osd blocklist ls | grep $CLIENT_ADDR
 
     expect_fail ceph rbd task add flatten rbd2/clone1
     sleep 10
diff --git a/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh b/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh
new file mode 100755
index 000000000..78a390230
--- /dev/null
+++ b/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+
+set -ex
+
+IMAGE=image-alternate-primary
+MIRROR_IMAGE_MODE=snapshot
+MIRROR_POOL_MODE=image
+MOUNT=test-alternate-primary
+RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff'
+RBD_MIRROR_INSTANCES=1
+RBD_MIRROR_MODE=snapshot
+RBD_MIRROR_USE_EXISTING_CLUSTER=1
+
+. $(dirname $0)/rbd_mirror_helpers.sh
+
+take_mirror_snapshots() {
+    local cluster=$1
+    local pool=$2
+    local image=$3
+
+    for i in {1..30}; do
+        mirror_image_snapshot $cluster $pool $image
+        sleep 3
+    done
+}
+
+slow_untar_workload() {
+    local mountpt=$1
+
+    cp linux-5.4.tar.gz $mountpt
+    # run workload that updates the data and metadata of multiple files on disk.
+    # rate limit the workload such that the mirror snapshots can be taken as the
+    # contents of the image are progressively changed by the workload.
+    local ret=0
+    timeout 5m bash -c "zcat $mountpt/linux-5.4.tar.gz \
+        | pv -L 256K | tar xf - -C $mountpt" || ret=$?
+    if ((ret != 124)); then
+        echo "Workload completed prematurely"
+        return 1
+    fi
+}
+
+setup
+
+start_mirrors ${CLUSTER1}
+start_mirrors ${CLUSTER2}
+
+# initial setup
+create_image_and_enable_mirror ${CLUSTER1} ${POOL} ${IMAGE} \
+    ${RBD_MIRROR_MODE} 10G
+
+if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+    DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t nbd \
+        -o try-netlink ${POOL}/${IMAGE})
+elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+    DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t krbd \
+        ${POOL}/${IMAGE})
+else
+    echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}"
+    exit 1
+fi
+sudo mkfs.ext4 ${DEV}
+mkdir ${MOUNT}
+
+wget https://download.ceph.com/qa/linux-5.4.tar.gz
+
+for i in {1..25}; do
+    # create mirror snapshots every few seconds under I/O
+    sudo mount ${DEV} ${MOUNT}
+    sudo chown $(whoami) ${MOUNT}
+    rm -rf ${MOUNT}/*
+    take_mirror_snapshots ${CLUSTER1} ${POOL} ${IMAGE} &
+    SNAP_PID=$!
+    slow_untar_workload ${MOUNT}
+    wait $SNAP_PID
+    sudo umount ${MOUNT}
+
+    # calculate hash before demotion of primary image
+    DEMOTE_MD5=$(sudo md5sum ${DEV} | awk '{print $1}')
+    sudo rbd --cluster ${CLUSTER1} device unmap -t ${RBD_DEVICE_TYPE} ${DEV}
+
+    demote_image ${CLUSTER1} ${POOL} ${IMAGE}
+    wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${IMAGE} 'up+unknown'
+    wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${IMAGE} 'up+unknown'
+    promote_image ${CLUSTER2} ${POOL} ${IMAGE}
+
+    # calculate hash after promotion of secondary image
+    if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+        DEV=$(sudo rbd --cluster ${CLUSTER2} device map -t nbd \
+            -o try-netlink ${POOL}/${IMAGE})
+    elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+        DEV=$(sudo rbd --cluster ${CLUSTER2} device map -t krbd ${POOL}/${IMAGE})
+    fi
+    PROMOTE_MD5=$(sudo md5sum ${DEV} | awk '{print $1}')
+
+    if [[ "${DEMOTE_MD5}" != "${PROMOTE_MD5}" ]]; then
+        echo "Mismatch at iteration ${i}: ${DEMOTE_MD5} != ${PROMOTE_MD5}"
+        exit 1
+    fi
+
+    TEMP=${CLUSTER1}
+    CLUSTER1=${CLUSTER2}
+    CLUSTER2=${TEMP}
+done
+
+echo OK
diff --git a/qa/workunits/rbd/compare_mirror_images.sh b/qa/workunits/rbd/compare_mirror_images.sh
new file mode 100755
index 000000000..cbaa77a71
--- /dev/null
+++ b/qa/workunits/rbd/compare_mirror_images.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+
+set -ex
+
+IMG_PREFIX=image-primary
+MIRROR_IMAGE_MODE=snapshot
+MIRROR_POOL_MODE=image
+MNTPT_PREFIX=test-primary
+RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff'
+RBD_MIRROR_INSTANCES=1
+RBD_MIRROR_MODE=snapshot
+RBD_MIRROR_USE_EXISTING_CLUSTER=1
+
+. $(dirname $0)/rbd_mirror_helpers.sh
+
+take_mirror_snapshots() {
+    local cluster=$1
+    local pool=$2
+    local image=$3
+
+    for i in {1..30}; do
+        mirror_image_snapshot $cluster $pool $image
+        sleep 3
+    done
+}
+
+slow_untar_workload() {
+    local mountpt=$1
+
+    cp linux-5.4.tar.gz $mountpt
+    # run workload that updates the data and metadata of multiple files on disk.
+    # rate limit the workload such that the mirror snapshots can be taken as the
+    # contents of the image are progressively changed by the workload.
+    local ret=0
+    timeout 5m bash -c "zcat $mountpt/linux-5.4.tar.gz \
+        | pv -L 256K | tar xf - -C $mountpt" || ret=$?
+    if ((ret != 124)); then
+        echo "Workload completed prematurely"
+        return 1
+    fi
+}
+
+wait_for_image_removal() {
+    local cluster=$1
+    local pool=$2
+    local image=$3
+
+    for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+        if ! rbd --cluster $cluster ls $pool | grep -wq $image; then
+            return 0
+        fi
+        sleep $s
+    done
+
+    echo "image ${pool}/${image} not removed from cluster ${cluster}"
+    return 1
+}
+
+compare_demoted_promoted_image() {
+    local dev=${DEVS[$1-1]}
+    local img=${IMG_PREFIX}$1
+    local mntpt=${MNTPT_PREFIX}$1
+    local demote_md5 promote_md5
+
+    sudo umount ${mntpt}
+
+    # calculate hash before demotion of primary image
+    demote_md5=$(sudo md5sum ${dev} | awk '{print $1}')
+    sudo rbd --cluster ${CLUSTER1} device unmap -t ${RBD_DEVICE_TYPE} \
+        ${POOL}/${img}
+
+    demote_image ${CLUSTER1} ${POOL} ${img}
+    wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${img} 'up+unknown'
+    wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${img} 'up+unknown'
+    promote_image ${CLUSTER2} ${POOL} ${img}
+
+    # calculate hash after promotion of secondary image
+    if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+        dev=$(sudo rbd --cluster ${CLUSTER2} device map -t nbd \
+            -o try-netlink ${POOL}/${img})
+    elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+        dev=$(sudo rbd --cluster ${CLUSTER2} device map -t krbd ${POOL}/${img})
+    fi
+    promote_md5=$(sudo md5sum ${dev} | awk '{print $1}')
+    sudo rbd --cluster ${CLUSTER2} device unmap -t ${RBD_DEVICE_TYPE} ${dev}
+
+    if [[ "${demote_md5}" != "${promote_md5}" ]]; then
+        echo "Mismatch for image ${POOL}/${img}: ${demote_md5} != ${promote_md5}"
+        return 1
+    fi
+}
+
+setup
+
+start_mirrors ${CLUSTER1}
+start_mirrors ${CLUSTER2}
+
+wget https://download.ceph.com/qa/linux-5.4.tar.gz
+
+for i in {1..10}; do
+    DEVS=()
+    SNAP_PIDS=()
+    COMPARE_PIDS=()
+    WORKLOAD_PIDS=()
+    RET=0
+    for j in {1..10}; do
+        IMG=${IMG_PREFIX}${j}
+        MNTPT=${MNTPT_PREFIX}${j}
+        create_image_and_enable_mirror ${CLUSTER1} ${POOL} ${IMG} \
+            ${RBD_MIRROR_MODE} 10G
+        if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+            DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t nbd \
+                -o try-netlink ${POOL}/${IMG})
+        elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+            DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t krbd \
+                ${POOL}/${IMG})
+        else
+            echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}"
+            exit 1
+        fi
+        DEVS+=($DEV)
+        sudo mkfs.ext4 ${DEV}
+        mkdir ${MNTPT}
+        sudo mount ${DEV} ${MNTPT}
+        sudo chown $(whoami) ${MNTPT}
+        # create mirror snapshots under I/O every few seconds
+        take_mirror_snapshots ${CLUSTER1} ${POOL} ${IMG} &
+        SNAP_PIDS+=($!)
+        slow_untar_workload ${MNTPT} &
+        WORKLOAD_PIDS+=($!)
+    done
+    for pid in ${SNAP_PIDS[@]}; do
+        wait $pid || RET=$?
+    done
+    if ((RET != 0)); then
+        echo "take_mirror_snapshots failed"
+        exit 1
+    fi
+    for pid in ${WORKLOAD_PIDS[@]}; do
+        wait $pid || RET=$?
+    done
+    if ((RET != 0)); then
+        echo "slow_untar_workload failed"
+        exit 1
+    fi
+
+    for j in {1..10}; do
+        compare_demoted_promoted_image $j &
+        COMPARE_PIDS+=($!)
+    done
+    for pid in ${COMPARE_PIDS[@]}; do
+        wait $pid || RET=$?
+    done
+    if ((RET != 0)); then
+        echo "compare_demoted_promoted_image failed"
+        exit 1
+    fi
+
+    for j in {1..10}; do
+        IMG=${IMG_PREFIX}${j}
+        # Allow for removal of non-primary image by checking that mirroring
+        # image status is "up+replaying"
+        wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${IMG} 'up+replaying'
+        remove_image ${CLUSTER2} ${POOL} ${IMG}
+        wait_for_image_removal ${CLUSTER1} ${POOL} ${IMG}
+        rm -rf ${MNTPT_PREFIX}${j}
+    done
+done
+
+echo OK
diff --git a/qa/workunits/rbd/rbd-nbd.sh b/qa/workunits/rbd/rbd-nbd.sh
index bc89e9be5..8e1b05b3f 100755
--- a/qa/workunits/rbd/rbd-nbd.sh
+++ b/qa/workunits/rbd/rbd-nbd.sh
@@ -202,8 +202,11 @@ provisioned=`rbd -p ${POOL} --format xml du ${IMAGE} |
 used=`rbd -p ${POOL} --format xml du ${IMAGE} |
     $XMLSTARLET sel -t -m "//stats/images/image/used_size" -v .`
 [ "${used}" -lt "${provisioned}" ]
+unmap_device ${DEV} ${PID}
 
 # resize test
+DEV=`_sudo rbd device -t nbd -o try-netlink map ${POOL}/${IMAGE}`
+get_pid ${POOL}
 devname=$(basename ${DEV})
 blocks=$(awk -v dev=${devname} '$4 == dev {print $3}' /proc/partitions)
 test -n "${blocks}"
@@ -216,9 +219,9 @@ rbd resize ${POOL}/${IMAGE} --allow-shrink --size ${SIZE}M
 blocks2=$(awk -v dev=${devname} '$4 == dev {print $3}' /proc/partitions)
 test -n "${blocks2}"
 test ${blocks2} -eq ${blocks}
+unmap_device ${DEV} ${PID}
 
 # read-only option test
-unmap_device ${DEV} ${PID}
 DEV=`_sudo rbd --device-type nbd map --read-only ${POOL}/${IMAGE}`
 PID=$(rbd device --device-type nbd list | awk -v pool=${POOL} -v img=${IMAGE} -v dev=${DEV} \
     '$2 == pool && $3 == img && $5 == dev {print $1}')
diff --git a/qa/workunits/rbd/rbd_mirror_bootstrap.sh b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
index 6ef06f2b8..f4c1070bc 100755
--- a/qa/workunits/rbd/rbd_mirror_bootstrap.sh
+++ b/qa/workunits/rbd/rbd_mirror_bootstrap.sh
@@ -1,8 +1,10 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_bootstrap.sh - test peer bootstrap create/import
 #
 
+set -ex
+
 RBD_MIRROR_MANUAL_PEERS=1
 RBD_MIRROR_INSTANCES=${RBD_MIRROR_INSTANCES:-1}
 . $(dirname $0)/rbd_mirror_helpers.sh
diff --git a/qa/workunits/rbd/rbd_mirror_fsx_compare.sh b/qa/workunits/rbd/rbd_mirror_fsx_compare.sh
index 0ba3c97d7..79c36546d 100755
--- a/qa/workunits/rbd/rbd_mirror_fsx_compare.sh
+++ b/qa/workunits/rbd/rbd_mirror_fsx_compare.sh
@@ -1,10 +1,12 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_fsx_compare.sh - test rbd-mirror daemon under FSX workload
 #
 # The script is used to compare FSX-generated images between two clusters.
 #
 
+set -ex
+
 . $(dirname $0)/rbd_mirror_helpers.sh
 
 trap 'cleanup $?' INT TERM EXIT
diff --git a/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh b/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh
index d988987ba..6daadbbb4 100755
--- a/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh
+++ b/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh
@@ -1,10 +1,12 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_fsx_prepare.sh - test rbd-mirror daemon under FSX workload
 #
 # The script is used to compare FSX-generated images between two clusters.
 #
 
+set -ex
+
 . $(dirname $0)/rbd_mirror_helpers.sh
 
 setup
diff --git a/qa/workunits/rbd/rbd_mirror_ha.sh b/qa/workunits/rbd/rbd_mirror_ha.sh
index 37739a83d..1e43712a6 100755
--- a/qa/workunits/rbd/rbd_mirror_ha.sh
+++ b/qa/workunits/rbd/rbd_mirror_ha.sh
@@ -1,8 +1,10 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_ha.sh - test rbd-mirror daemons in HA mode
 #
 
+set -ex
+
 RBD_MIRROR_INSTANCES=${RBD_MIRROR_INSTANCES:-7}
 . $(dirname $0)/rbd_mirror_helpers.sh
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index f4961b925..b6abff96d 100755
--- a/qa/workunits/rbd/rbd_mirror_helpers.sh
+++ b/qa/workunits/rbd/rbd_mirror_helpers.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env bash
 #
 # rbd_mirror_helpers.sh - shared rbd-mirror daemon helper functions
 #
@@ -814,23 +814,23 @@ test_status_in_pool_dir()
     local description_pattern="$5"
     local service_pattern="$6"
 
-    local status_log=${TEMPDIR}/$(mkfname ${cluster}-${pool}-${image}.mirror_status)
-    CEPH_ARGS='' rbd --cluster ${cluster} mirror image status ${pool}/${image} |
-        tee ${status_log} >&2
-    grep "^  state: .*${state_pattern}" ${status_log} || return 1
-    grep "^  description: .*${description_pattern}" ${status_log} || return 1
+    local status
+    status=$(CEPH_ARGS='' rbd --cluster ${cluster} mirror image status \
+                 ${pool}/${image})
+    grep "^  state: .*${state_pattern}" <<< "$status" || return 1
+    grep "^  description: .*${description_pattern}" <<< "$status" || return 1
 
     if [ -n "${service_pattern}" ]; then
-        grep "service: *${service_pattern}" ${status_log} || return 1
+        grep "service: *${service_pattern}" <<< "$status" || return 1
     elif echo ${state_pattern} | grep '^up+'; then
-        grep "service: *${MIRROR_USER_ID_PREFIX}.* on " ${status_log} || return 1
+        grep "service: *${MIRROR_USER_ID_PREFIX}.* on " <<< "$status" || return 1
     else
-        grep "service: " ${status_log} && return 1
+        grep "service: " <<< "$status" && return 1
     fi
 
     # recheck using `mirror pool status` command to stress test it.
-
-    local last_update="$(sed -nEe 's/^  last_update: *(.*) *$/\1/p' ${status_log})"
+    local last_update
+    last_update="$(sed -nEe 's/^  last_update: *(.*) *$/\1/p' <<< "$status")"
     test_mirror_pool_status_verbose \
         ${cluster} ${pool} ${image} "${state_pattern}" "${last_update}" &&
         return 0
@@ -847,16 +847,15 @@ test_mirror_pool_status_verbose()
     local state_pattern="$4"
     local prev_last_update="$5"
 
-    local status_log=${TEMPDIR}/$(mkfname ${cluster}-${pool}.mirror_status)
-
-    rbd --cluster ${cluster} mirror pool status ${pool} --verbose --format xml \
-        > ${status_log}
+    local status
+    status=$(CEPH_ARGS='' rbd --cluster ${cluster} mirror pool status ${pool} \
+                 --verbose --format xml)
 
     local last_update state
     last_update=$($XMLSTARLET sel -t -v \
-        "//images/image[name='${image}']/last_update" < ${status_log})
+        "//images/image[name='${image}']/last_update" <<< "$status")
     state=$($XMLSTARLET sel -t -v \
-        "//images/image[name='${image}']/state" < ${status_log})
+        "//images/image[name='${image}']/state" <<< "$status")
 
     echo "${state}" | grep "${state_pattern}" ||
         test "${last_update}" '>' "${prev_last_update}"
diff --git a/qa/workunits/rbd/rbd_mirror_journal.sh b/qa/workunits/rbd/rbd_mirror_journal.sh
index 54f6aeec8..20a3b87db 100755
--- a/qa/workunits/rbd/rbd_mirror_journal.sh
+++ b/qa/workunits/rbd/rbd_mirror_journal.sh
@@ -1,4 +1,4 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_journal.sh - test rbd-mirror daemon in journal-based mirroring mode
 #
@@ -7,6 +7,8 @@
 # socket, temporary files, and launches rbd-mirror daemon.
 #
 
+set -ex
+
 . $(dirname $0)/rbd_mirror_helpers.sh
 
 setup
diff --git a/qa/workunits/rbd/rbd_mirror_snapshot.sh b/qa/workunits/rbd/rbd_mirror_snapshot.sh
index c70d48b09..17164c4d5 100755
--- a/qa/workunits/rbd/rbd_mirror_snapshot.sh
+++ b/qa/workunits/rbd/rbd_mirror_snapshot.sh
@@ -1,4 +1,4 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_snapshot.sh - test rbd-mirror daemon in snapshot-based mirroring mode
 #
@@ -7,6 +7,8 @@
 # socket, temporary files, and launches rbd-mirror daemon.
 #
 
+set -ex
+
 MIRROR_POOL_MODE=image
 MIRROR_IMAGE_MODE=snapshot
diff --git a/qa/workunits/rbd/rbd_mirror_stress.sh b/qa/workunits/rbd/rbd_mirror_stress.sh
index cb79aba7e..ea39d3aae 100755
--- a/qa/workunits/rbd/rbd_mirror_stress.sh
+++ b/qa/workunits/rbd/rbd_mirror_stress.sh
@@ -1,4 +1,4 @@
-#!/bin/sh -ex
+#!/usr/bin/env bash
 #
 # rbd_mirror_stress.sh - stress test rbd-mirror daemon
 #
@@ -8,6 +8,8 @@
 # tool during the many image test
 #
 
+set -ex
+
 IMAGE_COUNT=50
 export LOCKDEP=0
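The recurring shebang change above (#!/bin/sh -ex becoming #!/usr/bin/env bash plus an explicit set -ex) is likely more than cosmetic: on Linux an env-style shebang can pass only the interpreter name, not extra flags, and flags on a shebang line are silently dropped whenever a script is invoked as "bash script.sh". Moving the flags into the body keeps error-exit and tracing in effect either way. A minimal illustration, with hypothetical file names:

    cat > demo.sh <<'EOF'
    #!/bin/sh -ex
    false
    echo "still running"
    EOF
    chmod +x demo.sh
    ./demo.sh    || true   # shebang honored: -e aborts at 'false'
    bash demo.sh           # shebang ignored: prints "still running"

    cat > demo2.sh <<'EOF'
    #!/usr/bin/env bash
    set -ex
    false
    echo "never reached"
    EOF
    chmod +x demo2.sh
    bash demo2.sh || true  # 'set -e' in the body aborts in both invocation styles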
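Relatedly, the rbd_mirror_helpers.sh hunks replace the tee-to-${status_log} temp files with a status variable matched via bash here-strings; since <<< is a bashism, this is part of why the helpers now require bash, and it also means concurrent callers (as in the new compare_mirror_images.sh) no longer touch a shared on-disk status log. The pattern in isolation, with a hypothetical image spec:

    status=$(rbd mirror image status mypool/myimage)    # capture once
    grep '^  state: .*up+replaying' <<< "$status"       # match as many times as
    grep '^  description: ' <<< "$status"               # needed, no temp file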