Diffstat (limited to 'qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split')
26 files changed, 407 insertions, 0 deletions
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/% b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/%
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/%
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/+ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/+
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/+
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/openstack.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/openstack.yaml
new file mode 100644
index 000000000..5caffc353
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/openstack.yaml
@@ -0,0 +1,6 @@
+openstack:
+  - machine:
+      disk: 100 # GB
+  - volumes: # attached to each instance
+      count: 4
+      size: 30 # GB
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/start.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/start.yaml
new file mode 100644
index 000000000..1271edd8b
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/0-cluster/start.yaml
@@ -0,0 +1,33 @@
+meta:
+- desc: |
+   Run ceph on two nodes,
+   with a separate client-only node.
+   Use xfs beneath the osds.
+overrides:
+  ceph:
+    fs: xfs
+    log-ignorelist:
+    - overall HEALTH_
+    - \(MON_DOWN\)
+    - \(MGR_DOWN\)
+    ### ref: https://tracker.ceph.com/issues/40251
+    #removed see ^ - failed to encode map
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: "*"
+      mon:
+        mon warn on osd down out interval zero: false
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+- - osd.4
+  - osd.5
+  - osd.6
+  - osd.7
+- - client.0
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/1-ceph-install/pacific.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/1-ceph-install/pacific.yaml
new file mode 100644
index 000000000..d8e3b6e03
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/1-ceph-install/pacific.yaml
@@ -0,0 +1,21 @@
+meta:
+- desc: |
+   install ceph/pacific v16.2.7
+   Overall upgrade path is - pacific-latest.point => pacific-latest
+tasks:
+- install:
+    tag: v16.2.7
+    exclude_packages: ['librados3']
+    extra_packages: ['librados2']
+- print: "**** done install pacific v16.2.7"
+- ceph:
+- exec:
+    osd.0:
+      - ceph osd require-osd-release pacific
+      - ceph osd set-require-min-compat-client pacific
+- print: "**** done ceph"
+overrides:
+  ceph:
+    conf:
+      mon:
+        mon warn on osd down out interval zero: false
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/1.1.short_pg_log.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/1.1.short_pg_log.yaml
new file mode 100644
index 000000000..20cc101de
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/1.1.short_pg_log.yaml
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd_min_pg_log_entries: 1
+        osd_max_pg_log_entries: 2
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/2-partial-upgrade/firsthalf.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/2-partial-upgrade/firsthalf.yaml
new file mode 100644
index 000000000..02ba5c1bb
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/2-partial-upgrade/firsthalf.yaml
@@ -0,0 +1,13 @@
+meta:
+- desc: |
+   install upgrade ceph/-x on one node only
+   1st half
+   restart : osd.0,1,2,3
+tasks:
+- install.upgrade:
+    osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+    daemons: [mon.a,mon.b,mon.c,mgr.x,osd.0,osd.1,osd.2,osd.3]
+    mon-health-to-clog: false
+- print: "**** done ceph.restart 1st half"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/3-thrash/default.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/3-thrash/default.yaml
new file mode 100644
index 000000000..c739d8fea
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/3-thrash/default.yaml
@@ -0,0 +1,27 @@
+meta:
+- desc: |
+   randomly kill and revive osd
+   small chance to increase the number of pgs
+overrides:
+  ceph:
+    log-ignorelist:
+    - but it is still running
+    - wrongly marked me down
+    - objects unfound and apparently lost
+    - log bound mismatch
+    ### ref: https://tracker.ceph.com/issues/40251
+    - failed to encode map
+tasks:
+- parallel:
+  - stress-tasks
+stress-tasks:
+- thrashosds:
+    timeout: 1200
+    chance_pgnum_grow: 1
+    chance_pgpnum_fix: 1
+    chance_thrash_cluster_full: 0
+    chance_thrash_pg_upmap: 0
+    chance_thrash_pg_upmap_items: 0
+    disable_objectstore_tool_tests: true
+    chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/+ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/+
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/+
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/fsx.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/fsx.yaml
new file mode 100644
index 000000000..fd4081f23
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/fsx.yaml
@@ -0,0 +1,8 @@
+meta:
+- desc: |
+   run basic fsx tests for rbd
+stress-tasks:
+- rbd_fsx:
+    clients: [client.0]
+    size: 134217728
+- print: "**** done rbd_fsx 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/radosbench.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/radosbench.yaml
new file mode 100644
index 000000000..c545936c0
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/radosbench.yaml
@@ -0,0 +1,52 @@
+meta:
+- desc: |
+   run randomized correctness test for rados operations
+   generate write load with rados bench
+stress-tasks:
+- full_sequential:
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+- print: "**** done radosbench 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml
new file mode 100644
index 000000000..caaac875c
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   run basic cls tests for rbd
+stress-tasks:
+- workunit:
+    branch: pacific
+    clients:
+      client.0:
+        - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-import-export.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-import-export.yaml
new file mode 100644
index 000000000..f999bd0c8
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-import-export.yaml
@@ -0,0 +1,12 @@
+meta:
+- desc: |
+   run basic import/export cli tests for rbd
+stress-tasks:
+- workunit:
+    branch: pacific
+    clients:
+      client.0:
+        - rbd/import_export.sh
+    env:
+      RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd_api.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd_api.yaml
new file mode 100644
index 000000000..95c820161
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd_api.yaml
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   librbd C and C++ api tests
+stress-tasks:
+- workunit:
+    branch: octopus
+    clients:
+      client.0:
+        - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/readwrite.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/readwrite.yaml
new file mode 100644
index 000000000..456868998
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/readwrite.yaml
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool,
+   using only reads, writes, and deletes
+stress-tasks:
+- full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 500
+      write_append_excl: false
+      op_weights:
+        read: 45
+        write: 45
+        delete: 10
+- print: "**** done rados/readwrite 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/snaps-few-objects.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/snaps-few-objects.yaml
new file mode 100644
index 000000000..ae232d867
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/snaps-few-objects.yaml
@@ -0,0 +1,18 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshot operations
+stress-tasks:
+- full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 50
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+- print: "**** done rados/snaps-few-objects 4-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/5-finish-upgrade.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/5-finish-upgrade.yaml
new file mode 100644
index 000000000..803737c72
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/5-finish-upgrade.yaml
@@ -0,0 +1,8 @@
+tasks:
+- install.upgrade:
+    osd.4:
+    client.0:
+- ceph.restart:
+    daemons: [osd.4, osd.5, osd.6, osd.7]
+    wait-for-healthy: false
+    wait-for-osds-up: true
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/+ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/+
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/+
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml
new file mode 100644
index 000000000..4ca4e7485
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   librbd python api tests
+tasks:
+- workunit:
+    tag: v16.2.7
+    clients:
+      client.0:
+        - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 7-workload"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/snaps-many-objects.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/snaps-many-objects.yaml
new file mode 100644
index 000000000..805bf97c3
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/snaps-many-objects.yaml
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+    clients: [client.0]
+    ops: 4000
+    objects: 500
+    write_append_excl: false
+    op_weights:
+      read: 100
+      write: 100
+      delete: 50
+      snap_create: 50
+      snap_remove: 50
+      rollback: 50
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-bitmap.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-bitmap.yaml
new file mode 100644
index 000000000..b18e04bee
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-bitmap.yaml
@@ -0,0 +1,43 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        bluestore allocator: bitmap
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+# this doesn't work with failures bc the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
+        bdev enable discard: true
+        bdev async discard: true
+  ceph-deploy:
+    fs: xfs
+    bluestore: yes
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+        bdev enable discard: true
+        bdev async discard: true
+
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-comp.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-comp.yaml
new file mode 100644
index 000000000..b408032fd
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-comp.yaml
@@ -0,0 +1,23 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore compression mode: aggressive
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+
+# this doesn't work with failures bc the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-stupid.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-stupid.yaml
new file mode 100644
index 000000000..ca811f131
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/bluestore-stupid.yaml
@@ -0,0 +1,43 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        bluestore allocator: stupid
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+# this doesn't work with failures bc the log writes are not atomic across the two backends
+#        bluestore bluefs env mirror: true
+        bdev enable discard: true
+        bdev async discard: true
+  ceph-deploy:
+    fs: xfs
+    bluestore: yes
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+        bdev enable discard: true
+        bdev async discard: true
+
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/filestore-xfs.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/filestore-xfs.yaml
new file mode 100644
index 000000000..f7aa0dd79
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/objectstore/filestore-xfs.yaml
@@ -0,0 +1,15 @@
+overrides:
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: filestore
+        osd sloppy crc: true
+  ceph-deploy:
+    fs: xfs
+    filestore: True
+    conf:
+      osd:
+        osd objectstore: filestore
+        osd sloppy crc: true
+
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/supported-all-distro/ubuntu_latest.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/supported-all-distro/ubuntu_latest.yaml
new file mode 100644
index 000000000..f20398230
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/supported-all-distro/ubuntu_latest.yaml
@@ -0,0 +1,2 @@
+os_type: ubuntu
+os_version: "20.04"
diff --git a/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/thrashosds-health.yaml b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/thrashosds-health.yaml
new file mode 100644
index 000000000..9903fa578
--- /dev/null
+++ b/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/thrashosds-health.yaml
@@ -0,0 +1,15 @@
+overrides:
+  ceph:
+    log-ignorelist:
+    - overall HEALTH_
+    - \(OSDMAP_FLAGS\)
+    - \(OSD_
+    - \(PG_
+    - \(POOL_
+    - \(CACHE_POOL_
+    - \(SMALLER_PGP_NUM\)
+    - \(OBJECT_
+    - \(SLOW_OPS\)
+    - \(REQUEST_SLOW\)
+    - \(TOO_FEW_PGS\)
+    - slow request
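
The empty '%' and '+' entries above are teuthology suite markers rather than test fragments: '%' makes the scheduler build the cross-product of the sibling facet directories, and '+' folds every YAML file in its directory into a single facet. As a rough, illustrative sketch of how a run of the resulting suite could be scheduled (the branch, machine type, and priority values below are placeholders, not part of this change):

    # illustrative only -- adjust branch, machine type, and priority for the target lab
    teuthology-suite \
        --suite upgrade/pacific-p2p \
        --ceph pacific \
        --machine-type smithi \
        --priority 100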