summaryrefslogtreecommitdiffstats
path: root/qa/tasks/ec_inconsistent_hinfo.py
diff options
context:
space:
mode:
Diffstat (limited to 'qa/tasks/ec_inconsistent_hinfo.py')
-rw-r--r--qa/tasks/ec_inconsistent_hinfo.py225
1 files changed, 225 insertions, 0 deletions
diff --git a/qa/tasks/ec_inconsistent_hinfo.py b/qa/tasks/ec_inconsistent_hinfo.py
new file mode 100644
index 000000000..fa10f2c45
--- /dev/null
+++ b/qa/tasks/ec_inconsistent_hinfo.py
@@ -0,0 +1,225 @@
+"""
+Inconsistent_hinfo
+"""
+import logging
+import time
+from dateutil.parser import parse
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
+ log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)" %
+ (pgid, check_time_now))
+ for i in range(300):
+ time.sleep(5)
+ manager.flush_pg_stats([0, 1, 2, 3])
+ pgs = manager.get_pg_stats()
+ pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+ log.debug('pg=%s' % pg);
+ assert pg
+
+ last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).strftime('%s')
+ if last_deep_scrub_time < check_time_now:
+ log.debug('not scrubbed')
+ continue
+
+ status = pg['state'].split('+')
+ if inconsistent:
+ assert 'inconsistent' in status
+ else:
+ assert 'inconsistent' not in status
+ return
+
+ assert False, 'not scrubbed'
+
+
+def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
+ log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete" %
+ (pgid, from_osd, to_osd))
+ for i in range(300):
+ time.sleep(5)
+ manager.flush_pg_stats([0, 1, 2, 3])
+ pgs = manager.get_pg_stats()
+ pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+ log.info('pg=%s' % pg);
+ assert pg
+ status = pg['state'].split('+')
+ if 'active' not in status:
+ log.debug('not active')
+ continue
+ if 'backfilling' in status:
+ assert from_osd in pg['acting'] and to_osd in pg['up']
+ log.debug('backfilling')
+ continue
+ if to_osd not in pg['up']:
+ log.debug('backfill not started yet')
+ continue
+ log.debug('backfilled!')
+ break
+
+def task(ctx, config):
+ """
+ Test handling of objects with inconsistent hash info during backfill and deep-scrub.
+
+ A pretty rigid cluster is brought up and tested by this task
+ """
+ if config is None:
+ config = {}
+ assert isinstance(config, dict), \
+ 'ec_inconsistent_hinfo task only accepts a dict for configuration'
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ profile = config.get('erasure_code_profile', {
+ 'k': '2',
+ 'm': '1',
+ 'crush-failure-domain': 'osd'
+ })
+ profile_name = profile.get('name', 'backfill_unfound')
+ manager.create_erasure_code_profile(profile_name, profile)
+ pool = manager.create_pool_with_unique_name(
+ pg_num=1,
+ erasure_code_profile_name=profile_name,
+ min_size=2)
+ manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
+ 'pg_autoscale_mode', 'off')
+
+ manager.flush_pg_stats([0, 1, 2, 3])
+ manager.wait_for_clean()
+
+ pool_id = manager.get_pool_num(pool)
+ pgid = '%d.0' % pool_id
+ pgs = manager.get_pg_stats()
+ acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
+ log.info("acting=%s" % acting)
+ assert acting
+ primary = acting[0]
+
+ # something that is always there, readable and never empty
+ dummyfile = '/etc/group'
+
+ # kludge to make sure they get a map
+ rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
+
+ manager.flush_pg_stats([0, 1])
+ manager.wait_for_recovery()
+
+ log.debug("create test object")
+ obj = 'test'
+ rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])
+
+ victim = acting[1]
+
+ log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair"
+ % victim)
+
+ manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+ object_name=obj, osd=victim)
+ check_time_now = time.strftime('%s')
+ manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
+ wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)
+
+ check_time_now = time.strftime('%s')
+ manager.raw_cluster_cmd('pg', 'repair', pgid)
+ wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)
+
+ log.info("remove test object hash info from primary osd.%s shard and test backfill"
+ % primary)
+
+ log.debug("write some data")
+ rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
+ '--no-cleanup'])
+
+ manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+ object_name=obj, osd=primary)
+
+ # mark the osd out to trigger a rebalance/backfill
+ source = acting[1]
+ target = [x for x in [0, 1, 2, 3] if x not in acting][0]
+ manager.mark_out_osd(source)
+
+ # wait for everything to peer, backfill and recover
+ wait_for_backfilling_complete(manager, pgid, source, target)
+ manager.wait_for_clean()
+
+ manager.flush_pg_stats([0, 1, 2, 3])
+ pgs = manager.get_pg_stats()
+ pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+ log.debug('pg=%s' % pg)
+ assert pg
+ assert 'clean' in pg['state'].split('+')
+ assert 'inconsistent' not in pg['state'].split('+')
+ unfound = manager.get_num_unfound_objects()
+ log.debug("there are %d unfound objects" % unfound)
+ assert unfound == 0
+
+ source, target = target, source
+ log.info("remove test object hash info from non-primary osd.%s shard and test backfill"
+ % source)
+
+ manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+ object_name=obj, osd=source)
+
+ # mark the osd in to trigger a rebalance/backfill
+ manager.mark_in_osd(target)
+
+ # wait for everything to peer, backfill and recover
+ wait_for_backfilling_complete(manager, pgid, source, target)
+ manager.wait_for_clean()
+
+ manager.flush_pg_stats([0, 1, 2, 3])
+ pgs = manager.get_pg_stats()
+ pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+ log.debug('pg=%s' % pg)
+ assert pg
+ assert 'clean' in pg['state'].split('+')
+ assert 'inconsistent' not in pg['state'].split('+')
+ unfound = manager.get_num_unfound_objects()
+ log.debug("there are %d unfound objects" % unfound)
+ assert unfound == 0
+
+ log.info("remove hash info from two shards and test backfill")
+
+ source = acting[2]
+ target = [x for x in [0, 1, 2, 3] if x not in acting][0]
+ manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+ object_name=obj, osd=primary)
+ manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
+ object_name=obj, osd=source)
+
+ # mark the osd out to trigger a rebalance/backfill
+ manager.mark_out_osd(source)
+
+ # wait for everything to peer, backfill and detect unfound object
+ wait_for_backfilling_complete(manager, pgid, source, target)
+
+ # verify that there is unfound object
+ manager.flush_pg_stats([0, 1, 2, 3])
+ pgs = manager.get_pg_stats()
+ pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+ log.debug('pg=%s' % pg)
+ assert pg
+ assert 'backfill_unfound' in pg['state'].split('+')
+ unfound = manager.get_num_unfound_objects()
+ log.debug("there are %d unfound objects" % unfound)
+ assert unfound == 1
+ m = manager.list_pg_unfound(pgid)
+ log.debug('list_pg_unfound=%s' % m)
+ assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+
+ # mark stuff lost
+ pgs = manager.get_pg_stats()
+ manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')
+
+ # wait for everything to peer and be happy...
+ manager.flush_pg_stats([0, 1, 2, 3])
+ manager.wait_for_recovery()