1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
|
"""
Handle the OSD failsafe configuration settings (osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio)
"""
from io import BytesIO
import logging
import six
import time
from teuthology.orchestra import run
from tasks.util.rados import rados
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
def _start_cluster_log_watch(mon):
    """
    Start ``ceph -w`` in the background on the remote ``mon``.

    The command runs under ``daemon-helper kill`` with a piped stdin and
    its stdout captured in an in-memory bytes buffer.  The call does not
    wait for the command to finish.

    :param mon: remote on which to run ``ceph -w``
    :returns: the running process object returned by ``mon.run``
    """
    return mon.run(
        args=[
            'sudo',
            'daemon-helper',
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=BytesIO(),
        wait=False,
    )


def _finish_cluster_log_watch(proc, sleep_time):
    """
    Let a ``ceph -w`` watcher collect output, stop it, and return its lines.

    :param proc: process returned by ``_start_cluster_log_watch``
    :param sleep_time: seconds to keep watching before stopping
    :returns: captured stdout as a list of text lines
    """
    time.sleep(sleep_time)
    proc.stdin.close()  # causes daemon-helper send SIGKILL to ceph -w
    proc.wait()
    return six.ensure_str(proc.stdout.getvalue()).split('\n')


def _count_lines_containing(lines, needle):
    """
    Return the number of entries in ``lines`` that contain ``needle``.

    Counting with ``sum`` over a generator works on both Python 2 and
    Python 3; ``len(filter(...))`` raises TypeError on Python 3 because
    ``filter`` returns an iterator there.
    """
    return sum(1 for line in lines if needle in line)


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-whitelist as follows

        tasks:
            - chef:
            - install:
            - ceph:
                log-whitelist: ['OSD near full', 'OSD full dropping all updates']
            - osd_failsafe_enospc:

    :param ctx: test context; ``ctx.managers['ceph']`` and ``ctx.cluster``
                are used
    :param config: task configuration, a dict or None (treated as empty)
    :raises AssertionError: if ``config`` is not a dict, or if any of the
                            observed message counts or write results differ
                            from what the step expects
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # Cluster log messages this test counts in the ``ceph -w`` output
    nearfull_warning = '[WRN] OSD near full'
    full_error = '[ERR] OSD full dropping all updates'

    manager = ctx.managers['ceph']

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    # The watcher is started before the setting is injected so that the
    # resulting messages are captured.
    proc = _start_cluster_log_watch(mon)
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
    lines = _finish_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, nearfull_warning)
    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
    count = _count_lines_containing(lines, full_error)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    proc = _start_cluster_log_watch(mon)
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
    lines = _finish_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, full_error)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

    # Put back default
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    # No injectargs here: the nearfull ratio set in step 1 is still in effect.
    proc = _start_cluster_log_watch(mon)
    lines = _finish_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, nearfull_warning)
    assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = _count_lines_containing(lines, full_error)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    proc = _start_cluster_log_watch(mon)
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
    lines = _finish_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, nearfull_warning)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_lines_containing(lines, full_error)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages settings back to default')

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    proc = _start_cluster_log_watch(mon)
    lines = _finish_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, nearfull_warning)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_lines_containing(lines, full_error)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])
|