#!/usr/bin/env bash
# -*- mode:text; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
# vim: ts=8 sw=2 smarttab
#
# test the handling of a corrupted SnapMapper DB by Scrub
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh

function run() {
  local dir=$1
  shift

  export CEPH_MON="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
  export CEPH_ARGS
  CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
  CEPH_ARGS+="--mon-host=$CEPH_MON "
  export -n CEPH_CLI_TEST_DUP_COMMAND

  local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
  for func in $funcs ; do
    setup $dir || return 1
    $func $dir || return 1
    teardown $dir || return 1
  done
}

# one clone & multiple snaps (according to the number of parameters)
function make_a_clone()
{
  # turn off '-x' (but remember previous state)
  local saved_echo_flag=${-//[^x]/}
  set +x

  local pool=$1
  local obj=$2
  echo $RANDOM | rados -p $pool put $obj - || return 1
  shift 2
  for snap in $@ ; do
    rados -p $pool mksnap $snap || return 1
  done

  if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}

function TEST_truncated_sna_record() {
  local dir=$1
  local -A cluster_conf=(
    ['osds_num']="3"
    ['pgs_in_pool']="4"
    ['pool_name']="test"
  )
  local extr_dbg=3
  (( extr_dbg > 1 )) && echo "Dir: $dir"

  standard_scrub_cluster $dir cluster_conf
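
  # shorten the stats-update periods, so that the scrub state shown by
  # 'pg dump' is refreshed promptly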
  ceph tell osd.* config set osd_stats_update_period_not_scrubbing "1"
  ceph tell osd.* config set osd_stats_update_period_scrubbing "1"
  local osdn=${cluster_conf['osds_num']}
  local poolid=${cluster_conf['pool_id']}
  local poolname=${cluster_conf['pool_name']}
  local objname="objxxx"

  # create an object and clone it
  make_a_clone $poolname $objname snap01 snap02 || return 1
  make_a_clone $poolname $objname snap13 || return 1
  make_a_clone $poolname $objname snap24 snap25 || return 1
  echo $RANDOM | rados -p $poolname put $objname - || return 1
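  # the object now has a head plus clones; the clone kept for snap13
  # (snap-id 3) is the one whose SnapMapper entry will be truncated below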

  # identify the PG and the primary OSD
  local pgid=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.pgid'`
  local osd=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
  echo "pgid is $pgid (primary: osd.$osd)"

  # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
  set_query_debug $pgid

  # verify the existence of these clones
  (( extr_dbg >= 1 )) && rados --format json-pretty -p $poolname listsnaps $objname

  # scrub the PG
  ceph pg $pgid deep_scrub || return 1

  # we aren't just waiting for the scrub to terminate, but also for the
  # logs to be published
  sleep 3
  ceph pg dump pgs
  until grep -a -q -- "event: --^^^^---- ScrubFinished" $dir/osd.$osd.log ; do
    sleep 0.2
  done
  ceph pg dump pgs
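
  # no more scrubs from now on: we are about to take the OSDs down and
  # manipulate their SnapMapper data directly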
  ceph osd set noscrub || return 1
  ceph osd set nodeep-scrub || return 1
  sleep 5
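
  # the first (pre-corruption) deep scrub should have been clean: no ERR
  # lines in the primary's log yet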
  ! grep -a -q "ERR" $dir/osd.$osd.log || return 1

  # kill the OSDs
  kill_daemons $dir TERM osd || return 1

  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/0 dump "p"
  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump "p" | grep -a SNA_
  (( extr_dbg >= 2 )) && grep -a SNA_ /tmp/oo2.dump
  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump p 2> /dev/null
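
  # count the SNA_ (snap mapping) entries on the primary before the corruption,
  # so we can verify later that scrub has restored all of them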
  local num_sna_b4=`ceph-kvstore-tool bluestore-kv $dir/$osd dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
    | awk -e '{print $2;}' | wc -l`

  for sdn in $(seq 0 $(expr $osdn - 1))
  do
    kvdir=$dir/$sdn
    echo "corrupting the SnapMapper DB of osd.$sdn (db: $kvdir)"
    (( extr_dbg >= 3 )) && ceph-kvstore-tool bluestore-kv $kvdir dump "p"

    # truncate the 'mapping' (SNA_) entry corresponding to the snap13 clone
    KY=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_0000000000000003_000000000000000' \
      | awk -e '{print $2;}'`
    (( extr_dbg >= 1 )) && echo "SNA key: $KY" | cat -v

    tmp_fn1=`mktemp -p /tmp --suffix="_the_val"`
    (( extr_dbg >= 1 )) && echo "Value dumped in: $tmp_fn1"
    ceph-kvstore-tool bluestore-kv $kvdir get p "$KY" out $tmp_fn1 2> /dev/null
    (( extr_dbg >= 2 )) && od -xc $tmp_fn1
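    # re-insert the value under a truncated key (the last 30 characters of the
    # original key are dropped), simulating a damaged SnapMapper record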
    NKY=${KY:0:-30}
    ceph-kvstore-tool bluestore-kv $kvdir rm "p" "$KY" 2> /dev/null
    ceph-kvstore-tool bluestore-kv $kvdir set "p" "$NKY" in $tmp_fn1 2> /dev/null
    (( extr_dbg >= 1 )) || rm $tmp_fn1
  done
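
  # restart all the OSDs, reusing the arguments that standard_scrub_cluster
  # passed them originally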
  orig_osd_args=" ${cluster_conf['osd_args']}"
  orig_osd_args=" $(echo $orig_osd_args)"
  (( extr_dbg >= 2 )) && echo "Copied OSD args: /$orig_osd_args/ /${orig_osd_args:1}/"
  for sdn in $(seq 0 $(expr $osdn - 1))
  do
    CEPH_ARGS="$CEPH_ARGS $orig_osd_args" activate_osd $dir $sdn
  done
  sleep 1
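
  # wait for all the OSDs to respond before continuing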
  for sdn in $(seq 0 $(expr $osdn - 1))
  do
    timeout 60 ceph tell osd.$sdn version
  done
  rados --format json-pretty -p $poolname listsnaps $objname

  # when scrubbing now - we expect the scrub to emit a cluster log ERR message
  # regarding SnapMapper internal inconsistency
  ceph osd unset nodeep-scrub || return 1
  ceph osd unset noscrub || return 1

  # what is the primary now?
  local cur_prim=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
  ceph pg dump pgs
  sleep 2
  ceph pg $pgid deep_scrub || return 1
  sleep 5
  ceph pg dump pgs
  (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
  grep -a -q "ERR" $dir/osd.$cur_prim.log || return 1

  # but did we fix the snap issue? let's try scrubbing again
  local prev_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
  echo "prev count: $prev_err_cnt"

  # scrub again. No errors expected this time
  ceph pg $pgid deep_scrub || return 1
  sleep 5
  ceph pg dump pgs
  (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
  local current_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
  (( extr_dbg >= 1 )) && echo "current count: $current_err_cnt"
  (( current_err_cnt == prev_err_cnt )) || return 1
  kill_daemons $dir TERM osd || return 1
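
  # finally: verify that the number of SNA_ entries on the (current) primary
  # is back to the pre-corruption count, i.e. the truncated records were repaired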
  kvdir=$dir/$cur_prim
  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_' \
    | awk -e '{print $2;}'
  local num_sna_full=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
    | awk -e '{print $2;}' | wc -l`
  (( num_sna_full == num_sna_b4 )) || return 1

  return 0
}
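
# note: like the other standalone scrub tests, this script is normally run via
# qa/run-standalone.sh, with CEPH_ROOT pointing at a built Ceph source tree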
main osd-mapper "$@"