#!/usr/bin/env bash
function discover_bdevs() {
local rootdir=$1
local config_file=$2
local cfg_type=$3
local wait_for_spdk_bdev=${4:-30}
local rpc_server=/var/tmp/spdk-discover-bdevs.sock
if [ ! -e $config_file ]; then
echo "Invalid Configuration File: $config_file"
return 1
fi
if [ -z $cfg_type ]; then
cfg_type="-c"
fi
# Start the bdev service to query for the list of available
# bdevs.
$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
$cfg_type $config_file &> /dev/null &
stubpid=$!
while ! [ -e /var/run/spdk_bdev0 ]; do
# If this counter drops to zero, the arithmetic expression below fails and errexit aborts the test
((wait_for_spdk_bdev--))
sleep 1
done
# Get all of the bdevs
$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs
# Shut down the bdev service
kill $stubpid
wait $stubpid
rm -f /var/run/spdk_bdev0
}
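# Usage sketch (as used further down in this file): fetch the bdev list as JSON
# and query it with jq; the config path is illustrative:
#   bdevs=$(discover_bdevs "$rootdir" "$testdir/bdev.conf" --json)
#   jq -r '.[].name' <<< "$bdevs"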
function create_spdk_bdev_conf() {
local output
local disk_cfg
local bdev_io_cache_size=$1
local bdev_io_pool_size=$2
local bdev_json_cfg=()
local bdev_opts=()
disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
if [[ -n "$bdev_io_cache_size" ]]; then
bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
fi
if [[ -n "$bdev_io_pool_size" ]]; then
bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
fi
local IFS=","
if [[ ${#bdev_opts[@]} -gt 0 ]]; then
bdev_json_cfg+=("$(
cat <<- JSON
{
"method": "bdev_set_options",
"params": {
${bdev_opts[*]}
}
}
JSON
)")
fi
for i in "${!disk_cfg[@]}"; do
bdev_json_cfg+=("$(
cat <<- JSON
{
"method": "bdev_nvme_attach_controller",
"params": {
"trtype": "PCIe",
"name":"Nvme${i}",
"traddr":"${disk_cfg[i]}"
}
}
JSON
)")
done
local IFS=","
jq -r '.' <<- JSON > $testdir/bdev.conf
{
"subsystems": [
{
"subsystem": "bdev",
"config": [
${bdev_json_cfg[*]}
]
}
]
}
JSON
}
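# For reference, the generated bdev.conf has roughly this shape (the cache size
# and PCI address below are hypothetical examples):
#   {
#     "subsystems": [
#       {
#         "subsystem": "bdev",
#         "config": [
#           { "method": "bdev_set_options", "params": { "bdev_io_cache_size": 256 } },
#           { "method": "bdev_nvme_attach_controller",
#             "params": { "trtype": "PCIe", "name": "Nvme0", "traddr": "0000:01:00.0" } }
#         ]
#       }
#     ]
#   }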
function is_bdf_not_mounted() {
local bdf=$1
local blkname
local mountpoints
blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
return $mountpoints
}
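# Note: the mountpoint count is returned as the exit status, so a return value of
# 0 ("success") means the device is not mounted. Usage sketch (BDF is illustrative):
#   if is_bdf_not_mounted "0000:01:00.0"; then echo "not mounted, safe to use"; fi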
function get_cores() {
local cpu_list="$1"
for cpu in ${cpu_list//,/ }; do
echo $cpu
done
}
function get_cores_numa_node() {
local cores=$1
for core in $cores; do
lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
done
}
function get_numa_node() {
local plugin=$1
local disks=$2
if [[ "$plugin" =~ "nvme" ]]; then
for bdf in $disks; do
local driver
driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
# Use this check to omit blacklisted devices (not bound to a driver by the setup.sh script)
if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
cat /sys/bus/pci/devices/$bdf/numa_node
fi
done
elif [[ "$plugin" =~ "bdev" ]]; then
local bdevs
bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json)
for name in $disks; do
local bdev_bdf
bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
cat /sys/bus/pci/devices/$bdev_bdf/numa_node
done
else
for name in $disks; do
local bdf
# Not reading directly from /sys/block/nvme* because of a kernel bug
# which results in NUMA 0 always getting reported.
bdf=$(cat /sys/block/$name/device/address)
cat /sys/bus/pci/devices/$bdf/numa_node
done
fi
}
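# Usage sketch (hypothetical variable names): the function expects the plugin name
# and a space-separated disk list, and prints one NUMA node number per disk:
#   disks_numa=($(get_numa_node "$PLUGIN" "$disks"))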
function get_disks() {
local plugin=$1
local disk_cfg
disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
if [[ "$plugin" =~ "nvme" ]]; then
# A PCI BDF address is enough for nvme-perf and the nvme fio plugin,
# so just print the addresses from the configuration file
echo "${disk_cfg[*]}"
elif [[ "$plugin" =~ "bdev" ]]; then
# Generate NvmeXn1 bdev names for bdevperf
# and the bdev fio plugin
local bdevs
local disk_no
disk_no=${#disk_cfg[@]}
eval echo "Nvme{0..$((disk_no - 1))}n1"
else
# Find nvme block devices and only use the ones which
# are not mounted
for bdf in "${disk_cfg[@]}"; do
if is_bdf_not_mounted $bdf; then
local blkname
blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
echo $blkname
fi
done
fi
}
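# Usage sketch, matching how the function is called elsewhere in this file
# (e.g. in preconditioning() and run_nvmeperf()):
#   disks=($(get_disks nvme))    # PCIe BDF addresses from $DISKCFG
#   disks=($(get_disks bdev))    # generated Nvme0n1, Nvme1n1, ... bdev names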
function get_disks_on_numa() {
local devs=($1)
local numas=($2)
local numa_no=$3
local disks_on_numa=""
local i
for ((i = 0; i < ${#devs[@]}; i++)); do
if [ ${numas[$i]} = $numa_no ]; then
disks_on_numa=$((disks_on_numa + 1))
fi
done
echo $disks_on_numa
}
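# Usage sketch (hypothetical values): count how many of the test disks sit on
# NUMA node 0, given parallel lists of disks and their NUMA nodes:
#   disks_on_numa0=$(get_disks_on_numa "${disks[*]}" "${disks_numa[*]}" 0)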
function create_fio_config() {
local disk_no=$1
local plugin=$2
local disks=($3)
local disks_numa=($4)
local cores=($5)
local total_disks=${#disks[@]}
local fio_job_section=()
local num_cores=${#cores[@]}
local disks_per_core=$((disk_no / num_cores))
local disks_per_core_mod=$((disk_no % num_cores))
local cores_numa
cores_numa=($(get_cores_numa_node "${cores[*]}"))
# The following part of this function still leverages global variables a lot.
# It mixes local variables passed as arguments with global variables, which is messy.
# TODO: Make variable usage consistent; aim for using only local variables
# and get rid of globals as much as possible.
desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
cp "$testdir/config.fio.tmp" "$testdir/config.fio"
cat <<- EOF >> $testdir/config.fio
description=$desc
rw=$RW
rwmixread=$MIX
bs=$BLK_SIZE
runtime=$RUNTIME
ramp_time=$RAMP_TIME
numjobs=$NUMJOBS
log_avg_msec=$SAMPLING_INT
EOF
if $GTOD_REDUCE; then
echo "gtod_reduce=1" >> $testdir/config.fio
fi
for i in "${!cores[@]}"; do
local m=0 #Counter of disks per NUMA node
local n=0 #Counter of all disks in test
core_numa=${cores_numa[$i]}
total_disks_per_core=$disks_per_core
# Check how many "stray" disks are not yet assigned to CPU cores.
# Assign one such disk to the current CPU core and subtract it from the total of
# unassigned disks.
if [[ "$disks_per_core_mod" -gt "0" ]]; then
total_disks_per_core=$((disks_per_core + 1))
disks_per_core_mod=$((disks_per_core_mod - 1))
fi
# The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
# Therefore, the per-thread queue depth is set to the desired per-device IODEPTH multiplied by the number of devices per thread.
QD=$IODEPTH
if [[ "$NOIOSCALING" = false ]]; then
QD=$((IODEPTH * total_disks_per_core))
fi
fio_job_section+=("")
fio_job_section+=("[filename${i}]")
fio_job_section+=("iodepth=$QD")
fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
while [[ "$m" -lt "$total_disks_per_core" ]]; do
# Add a disk to the job section only if its NUMA node matches the NUMA
# node of the currently selected CPU
if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
elif [[ "$plugin" =~ "kernel" ]]; then
fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
fi
m=$((m + 1))
# Mark the NUMA entry of the n-th disk as "x" so that subsequent loop iterations treat it as claimed
disks_numa[$n]="x"
fi
n=$((n + 1))
# If there are no more disks whose NUMA node matches the CPU's NUMA node, switch to
# the other NUMA node, go back to the start of the loop and try again.
if [[ $n -ge $total_disks ]]; then
echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
echo "NVMe assignment for this CPU will be cross-NUMA."
if [[ "$core_numa" == "1" ]]; then
core_numa=0
else
core_numa=1
fi
n=0
fi
done
done
printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
echo "INFO: Generated fio configuration file:"
cat $testdir/config.fio
}
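# For reference, each generated fio job section looks roughly like this
# (core, queue depth and address values are illustrative):
#   [filename0]
#   iodepth=128
#   cpus_allowed=0 #CPU NUMA Node 0
#   filename=trtype=PCIe traddr=0000.01.00.0 ns=1 #NVMe NUMA Node 0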
function preconditioning() {
local dev_name=""
local filename=""
local nvme_list
HUGEMEM=8192 $rootdir/scripts/setup.sh
cp $testdir/config.fio.tmp $testdir/config.fio
echo "[Preconditioning]" >> $testdir/config.fio
# Generate the filename argument for fio.
# We only want to target NVMes that are not bound to the kernel nvme driver.
# If they are still bound to the nvme driver, it means setup.sh skipped them
# on purpose.
nvme_list=$(get_disks nvme)
for nvme in $nvme_list; do
dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
filename+=$(printf %s":" "$dev_name")
done
echo "** Preconditioning disks, this can take a while, depending on the size of disks."
run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
--rw=write --iodepth=32 --output-format=normal
rm -f $testdir/config.fio
}
function get_results() {
local reads_pct
local writes_pct
reads_pct=$(bc -l <<< "scale=3; $2/100")
writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct")
case "$1" in
iops)
iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
iops=${iops%.*}
echo $iops
;;
mean_lat_usec)
mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
mean_lat=${mean_lat%.*}
echo $((mean_lat / 1000))
;;
p99_lat_usec)
# Parentheses around the "// 0" fallbacks are needed so that the percentile values
# (not the fallback) are weighted by the read/write percentages
p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
p99_lat=${p99_lat%.*}
echo $((p99_lat / 1000))
;;
p99_99_lat_usec)
p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
p99_99_lat=${p99_99_lat%.*}
echo $((p99_99_lat / 1000))
;;
stdev_usec)
stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $TMP_RESULT_FILE)
stdev=${stdev%.*}
echo $((stdev / 1000))
;;
mean_slat_usec)
mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
mean_slat=${mean_slat%.*}
echo $((mean_slat / 1000))
;;
mean_clat_usec)
mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
mean_clat=${mean_clat%.*}
echo $((mean_clat / 1000))
;;
bw_Kibs)
bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
bw=${bw%.*}
echo $((bw))
;;
esac
}
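# Usage sketch (hypothetical): extract weighted metrics from the fio JSON result
# in $TMP_RESULT_FILE, passing the metric name and the workload's read percentage:
#   iops=$(get_results iops $MIX)
#   mean_lat_usec=$(get_results mean_lat_usec $MIX)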
function get_bdevperf_results() {
case "$1" in
iops)
iops=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $2}')
iops=${iops%.*}
echo $iops
;;
bw_Kibs)
bw_MBs=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $4}')
bw_MBs=${bw_MBs%.*}
echo $((bw_MBs * 1024))
;;
esac
}
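# Usage sketch (hypothetical): parse the bdevperf output captured in $TMP_RESULT_FILE:
#   iops=$(get_bdevperf_results iops)
#   bw_kibs=$(get_bdevperf_results bw_Kibs)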
function get_nvmeperf_results() {
local iops
local bw_MBs
local mean_lat_usec
local max_lat_usec
local min_lat_usec
read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
# Strip the fractional part because the results are used in shell arithmetic
# expressions, which only handle integers (no "bc" is used for these calculations)
iops=${iops%.*}
bw_MBs=${bw_MBs%.*}
mean_lat_usec=${mean_lat_usec%.*}
min_lat_usec=${min_lat_usec%.*}
max_lat_usec=${max_lat_usec%.*}
echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}
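# Usage sketch (hypothetical): the function prints five space-separated integers
# (IOPS, bandwidth in KiB/s, mean/min/max latency in usec) that can be split with read:
#   read -r iops bw_kibs mean_lat min_lat max_lat <<< "$(get_nvmeperf_results)"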
function run_spdk_nvme_fio() {
local plugin=$1
echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then
LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
fi
sleep 1
}
function run_nvme_fio() {
echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
$FIO_BIN $testdir/config.fio --output-format=json "$@"
sleep 1
}
function run_bdevperf() {
echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
$bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock
sleep 1
}
function run_nvmeperf() {
# Prepare -r argument string for nvme perf command
local r_opt
local disks
# Limit the number of disks to $1 if needed
disks=($(get_disks nvme))
disks=("${disks[@]:0:$1}")
r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")
echo "** Running nvme perf test, this can take a while, depending on the run-time setting."
# Run command in separate shell as this solves quoting issues related to r_opt var
$SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
sleep 1
}
function wait_for_nvme_reload() {
local nvmes=$1
shopt -s extglob
for disk in $nvmes; do
cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
until $cmd 2> /dev/null; do
echo "Waiting for full nvme driver reload..."
sleep 0.5
done
done
# Disable extglob again; it was only needed for the filename pattern above
shopt -u extglob
}
function verify_disk_number() {
# Check if we have appropriate number of disks to carry out the test
disks=($(get_disks $PLUGIN))
if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
DISKNO=${#disks[@]}
elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
echo "error: Required devices number ($DISKNO) is not a valid number or it's larger than the number of devices found (${#disks[@]})"
false
fi
}