1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
|
#!/usr/bin/env bash
# Run NVMe PMD/BDEV performance tests (fio with kernel or SPDK ioengines, SPDK
# bdevperf or SPDK nvme perf). See usage() / --help for the available options.
set -e
# Dir variables and sourcing common files
# All paths are derived from the location of this script; the expansions are
# quoted so that a checkout path containing whitespace resolves correctly.
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
plugin_dir=$rootdir/build/fio
bdevperf_dir=$rootdir/test/bdev/bdevperf
nvmeperf_dir=$rootdir/build/examples
# Helper functions used further down are expected to come from these files.
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"
# Global & default variables

# fio arguments for every kernel-mode value accepted by --driver.
declare -A KERNEL_ENGINES=(
	["kernel-libaio"]="--ioengine=libaio"
	["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-io-uring"]="--ioengine=io_uring"
)

# Workload parameters (see the "Workload parameters" section of usage()).
RW="randrw"
MIX="100"
IODEPTH="256"
BLK_SIZE="4096"
RUNTIME="600"
RAMP_TIME="30"
NUMJOBS="1"
REPEAT_NO="3"
GTOD_REDUCE="false"
SAMPLING_INT="0"
FIO_BIN="$CONFIG_FIO_SOURCE_DIR/fio"
# Scratch file that receives the raw output of a single run.
TMP_RESULT_FILE="$testdir/result.json"

# Test setup parameters (see the "Test setup parameters" section of usage()).
PLUGIN="nvme"
DISKCFG=""
BDEV_CACHE=""
BDEV_POOL=""
DISKNO="ALL"
CPUS_ALLOWED="1"
NOIOSCALING="false"
PRECONDITIONING="true"
CPUFREQ=""

# Optional measurements taken alongside the test.
PERFTOP="false"
DPDKMEM="false"

# Timestamp embedded in the names of all result artifacts of this invocation.
DATE=$(date +'%m_%d_%Y_%H%M%S')
#######################################
# Print the help text, optionally preceded by a message.
# Globals:   reads the current option defaults (RW, MIX, IODEPTH, BLK_SIZE,
#            RUNTIME, RAMP_TIME, NUMJOBS, REPEAT_NO, GTOD_REDUCE, SAMPLING_INT,
#            FIO_BIN, DISKNO, CPUS_ALLOWED, NOIOSCALING)
# Arguments: $1 - path of the script; only its base name is shown
#            $2 - optional message printed (plus an empty line) before the help
# Outputs:   help text on stdout
# Note:      xtrace is switched off while printing and switched ON again
#            afterwards, whatever its state was before the call.
#######################################
function usage() {
	set +x
	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	# Quoted so that a script path containing whitespace yields the right name.
	echo "Usage: $(basename -- "$1") [options]"
	echo "-h, --help Print help and exit"
	echo
	echo "Workload parameters:"
	echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]"
	echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before"
	echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo " Applicable only for fio-based tests."
	echo " --repeat-no=INT How many times to repeat workload test. [default=$REPEAT_NO]"
	echo " Test result will be an average of repeated test runs."
	echo " --gtod-reduce Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo " --sampling-int=INT Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]"
	echo " Applicable only for fio-based tests."
	echo
	echo "Test setup parameters:"
	echo " --driver=STR Selects tool used for testing. Choices available:"
	echo " - spdk-perf-nvme (SPDK nvme perf)"
	echo " - spdk-perf-bdev (SPDK bdev perf)"
	echo " - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo " - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo " - kernel-classic-polling"
	echo " - kernel-hybrid-polling"
	echo " - kernel-libaio"
	echo " - kernel-io-uring"
	echo " --disk-config Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo " It consists of a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo " and Kernel block device names detected."
	echo " Lines starting with # are ignored as comments."
	echo " --bdev-io-cache-size Set IO cache size for SPDK bdev subsystem."
	echo " --bdev-io-pool-size Set IO pool size for SPDK bdev subsystem."
	echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo " If =ALL then test on all found disk. [default=$DISKNO]"
	echo " --cpu-allowed=INT/PATH Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo " Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo " --no-preconditioning Skip preconditioning"
	echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo " --cpu-frequency=INT Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo " GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo " check list of available frequencies. Example: --cpu-frequency=1100000."
	echo
	echo "Other options:"
	echo " --perftop Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo " --dpdk-mem-stats Dump DPDK memory stats during the test."
	set -x
}
# Parse command line options.
# getopts only knows short options; the '-:' entry in the optstring makes a
# long option "--name[=value]" arrive as option '-' with "name[=value]" in
# OPTARG, which the inner case statement then dispatches on.
while getopts 'h-:' optchar; do
	case "$optchar" in
		-)
			case "$OPTARG" in
				help)
					usage "$0"
					exit 0
					;;
				rw=*) RW="${OPTARG#*=}" ;;
				rwmixread=*) MIX="${OPTARG#*=}" ;;
				iodepth=*) IODEPTH="${OPTARG#*=}" ;;
				block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
				run-time=*) RUNTIME="${OPTARG#*=}" ;;
				ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
				numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
				repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
				gtod-reduce) GTOD_REDUCE=true ;;
				sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
				fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
				driver=*) PLUGIN="${OPTARG#*=}" ;;
				disk-config=*)
					DISKCFG="${OPTARG#*=}"
					if [[ ! -f "$DISKCFG" ]]; then
						echo "Disk configuration file $DISKCFG does not exist!"
						exit 1
					fi
					;;
				bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
				bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
				max-disk=*) DISKNO="${OPTARG#*=}" ;;
				cpu-allowed=*)
					CPUS_ALLOWED="${OPTARG#*=}"
					# The value may name a file that holds the CPU list.
					if [[ -f "$CPUS_ALLOWED" ]]; then
						CPUS_ALLOWED=$(cat "$CPUS_ALLOWED")
					fi
					;;
				no-preconditioning) PRECONDITIONING=false ;;
				no-io-scaling) NOIOSCALING=true ;;
				cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
				perftop) PERFTOP=true ;;
				dpdk-mem-stats) DPDKMEM=true ;;
				*)
					# The message must be the second argument of usage().
					usage "$0" "Invalid argument '$OPTARG'"
					exit 1
					;;
			esac
			;;
		h)
			usage "$0"
			exit 0
			;;
		*)
			usage "$0" "Invalid argument '$optchar'"
			exit 1
			;;
	esac
done
# All artifacts of this invocation go into one directory whose name encodes
# the workload parameters and the start time.
result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv
mkdir -p "$result_dir"
# Start all accumulators from a clean state.
unset iops_disks bw mean_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec
unset mean_slat_disks_usec mean_clat_disks_usec min_lat_disks_usec max_lat_disks_usec
# On failure or termination: remove temporary files, stop the background
# helpers that were actually started, then print a backtrace. The PID guards
# keep 'kill'/'wait' from being run without an operand when --perftop or
# --dpdk-mem-stats were not requested.
trap 'rm -f *.state $testdir/bdev.conf; [[ -z $perf_pid ]] || kill $perf_pid; [[ -z $dpdk_mem_pid ]] || wait $dpdk_mem_pid; print_backtrace' ERR SIGTERM SIGABRT
if [[ "$PLUGIN" =~ "bdev" ]]; then
	create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
fi
verify_disk_number
DISK_NAMES=$(get_disks "$PLUGIN")
DISKS_NUMA=$(get_numa_node "$PLUGIN" "$DISK_NAMES")
CORES=$(get_cores "$CPUS_ALLOWED")
# Word splitting is intended here: one array element per core.
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}
# Write the CSV preamble only now that NO_CORES is known; every value is
# quoted so that an empty one cannot shift the remaining columns.
echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > "$result_file"
printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX" >> "$result_file"
echo "num_of_disks,iops,avg_lat[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> "$result_file"
if $PRECONDITIONING; then
	preconditioning
fi
# Kernel-mode drivers only: reset the device binding through setup.sh, pick
# the fio ioengine arguments and tune the block layer for the chosen mode.
if [[ $PLUGIN == *kernel* ]]; then
	$rootdir/scripts/setup.sh reset
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"
	case "$PLUGIN" in
		kernel-classic-polling)
			for disk in $DISK_NAMES; do
				echo -1 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-hybrid-polling)
			for disk in $DISK_NAMES; do
				echo 0 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-io-uring)
			# Reload the nvme module with poll queues enabled.
			modprobe -rv nvme
			modprobe nvme poll_queues=8
			wait_for_nvme_reload $DISK_NAMES
			# Save the current queue settings of every disk; they are put
			# back at the end of the script.
			backup_dir="/tmp/nvme_param_bak"
			mkdir -p $backup_dir
			for disk in $DISK_NAMES; do
				echo "INFO: Backing up device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				mkdir -p $backup_dir/$disk
				for attr in iostats rq_affinity nomerges io_poll_delay; do
					cat $sysfs/$attr > $backup_dir/$disk/$attr
				done
			done
			# Only after everything is saved, apply the test settings.
			for disk in $DISK_NAMES; do
				echo "INFO: Setting device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				echo 0 > $sysfs/iostats
				echo 0 > $sysfs/rq_affinity
				echo 2 > $sysfs/nomerges
				echo 0 > $sysfs/io_poll_delay
			done
			;;
	esac
fi
# Optionally pin the CPU frequency for the duration of the test.
if [[ -n $CPUFREQ ]]; then
	if [[ $(< /proc/cmdline) == *"intel_pstate=disable"* ]]; then
		# Remember the active governor; it is set again at the end of the script.
		cpu_governor=$(< /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
		cpupower frequency-set -g userspace
		cpupower frequency-set -f $CPUFREQ
	else
		echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
		# Deliberate failure: with errexit enabled this aborts the script.
		false
	fi
fi

# Optionally record a perf profile in the background on the test cores.
if $PERFTOP; then
	echo "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

# Optionally request DPDK memory statistics from a background job after half
# of the run time has passed (plus the ramp time for non-"perf" drivers).
if $DPDKMEM; then
	echo "INFO: waiting to generate DPDK memory usage"
	wait_time=$((RUNTIME / 2))
	[[ $PLUGIN == *perf* ]] || wait_time=$((wait_time + RAMP_TIME))
	{
		sleep $wait_time
		echo "INFO: generating DPDK memory usage"
		$rootdir/scripts/rpc.py env_dpdk_get_mem_stats
	} &
	dpdk_mem_pid=$!
fi
# Run each workload $REPEAT_NO times. The per-run results are summed into the
# *_disks_usec / iops_disks / bw accumulators; averaging happens after the loop.
# Every run also leaves a copy of its raw output in $result_dir. The file names
# carry the disk count (DISKNO) and the repetition index (j).
for ((j = 0; j < REPEAT_NO; j++)); do
	if [[ $PLUGIN == "spdk-perf-bdev" ]]; then
		run_bdevperf > $TMP_RESULT_FILE
		iops_disks=$((iops_disks + $(get_bdevperf_results iops)))
		bw=$((bw + $(get_bdevperf_results bw_Kibs)))
		cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${DISKNO}_disks_${j}.output
	elif [[ $PLUGIN == "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > $TMP_RESULT_FILE
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)
		iops_disks=$((iops_disks + iops))
		bw=$((bw + bandwidth))
		mean_lat_disks_usec=$((mean_lat_disks_usec + mean_lat))
		min_lat_disks_usec=$((min_lat_disks_usec + min_lat))
		max_lat_disks_usec=$((max_lat_disks_usec + max_lat))
		cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${DISKNO}_disks_${j}.output
	else
		# Everything else is fio based: SPDK fio plugins or kernel ioengines.
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"
		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" \
				"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${DISKNO}disks_${j}"
		else
			# $fio_ioengine_opt is left unquoted on purpose: it may hold
			# several space separated fio arguments.
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" \
				"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${DISKNO}disks_${j}"
		fi
		# Use a recalculated value for the mixread param in case the rw mode
		# is read-only (100) or write-only (0) rather than mixed.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi
		iops_disks=$((iops_disks + $(get_results iops $rwmixread)))
		mean_lat_disks_usec=$((mean_lat_disks_usec + $(get_results mean_lat_usec $rwmixread)))
		p99_lat_disks_usec=$((p99_lat_disks_usec + $(get_results p99_lat_usec $rwmixread)))
		p99_99_lat_disks_usec=$((p99_99_lat_disks_usec + $(get_results p99_99_lat_usec $rwmixread)))
		stdev_disks_usec=$((stdev_disks_usec + $(get_results stdev_usec $rwmixread)))
		mean_slat_disks_usec=$((mean_slat_disks_usec + $(get_results mean_slat_usec $rwmixread)))
		mean_clat_disks_usec=$((mean_clat_disks_usec + $(get_results mean_clat_usec $rwmixread)))
		bw=$((bw + $(get_results bw_Kibs $rwmixread)))
		cp $TMP_RESULT_FILE $result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${DISKNO}_disks_${j}.json
		cp $testdir/config.fio $result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${DISKNO}_disks_${j}.fio
		rm -f $testdir/config.fio
	fi
done
# Stop the background perf recording (if one was requested) and turn the
# recorded data into a text report stored next to the other results.
if $PERFTOP; then
echo "INFO: Stopping perftop measurements."
kill $perf_pid
# The exit status of the terminated perf process is ignored so that errexit
# does not abort the script here.
wait $perf_pid || true
perf report -i "$testdir/perf.data" > $result_dir/perftop_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt
rm -f "$testdir/perf.data"
fi
# Move the DPDK memory dump into the result directory.
# NOTE(review): /tmp/spdk_mem_dump.txt is presumably produced by the
# env_dpdk_get_mem_stats RPC issued earlier - confirm.
if $DPDKMEM; then
mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt
echo "INFO: DPDK memory usage saved in $result_dir"
fi
# Write results to csv file.
# The accumulators hold sums over all REPEAT_NO runs; turn them into per-run
# averages. The dispatch mirrors the run loop: the two SPDK perf tools are
# special cases, every other driver (SPDK fio plugins and kernel ioengines)
# goes through fio and fills all latency accumulators.
iops_disks=$((iops_disks / REPEAT_NO))
bw=$((bw / REPEAT_NO))
case "$PLUGIN" in
	spdk-perf-bdev)
		# Only IOPS and bandwidth are collected for this driver.
		mean_lat_disks_usec=0
		p99_lat_disks_usec=0
		p99_99_lat_disks_usec=0
		stdev_disks_usec=0
		mean_slat_disks_usec=0
		mean_clat_disks_usec=0
		;;
	spdk-perf-nvme)
		# Of the latency columns only the mean latency is collected.
		mean_lat_disks_usec=$((mean_lat_disks_usec / REPEAT_NO))
		p99_lat_disks_usec=0
		p99_99_lat_disks_usec=0
		stdev_disks_usec=0
		mean_slat_disks_usec=0
		mean_clat_disks_usec=0
		;;
	*)
		mean_lat_disks_usec=$((mean_lat_disks_usec / REPEAT_NO))
		p99_lat_disks_usec=$((p99_lat_disks_usec / REPEAT_NO))
		p99_99_lat_disks_usec=$((p99_99_lat_disks_usec / REPEAT_NO))
		stdev_disks_usec=$((stdev_disks_usec / REPEAT_NO))
		mean_slat_disks_usec=$((mean_slat_disks_usec / REPEAT_NO))
		mean_clat_disks_usec=$((mean_clat_disks_usec / REPEAT_NO))
		;;
esac
# Values are quoted so that an empty one cannot shift the remaining columns.
printf "%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "${DISKNO}" "${iops_disks}" "${mean_lat_disks_usec}" "${p99_lat_disks_usec}" \
	"${p99_99_lat_disks_usec}" "${stdev_disks_usec}" "${mean_slat_disks_usec}" "${mean_clat_disks_usec}" "${bw}" >> $result_file
# Undo the system-level changes made for this run.
# Put back the CPU frequency governor that was active before the test.
[[ -z $CPUFREQ ]] || cpupower frequency-set -g $cpu_governor
if [ $PLUGIN = "kernel-io-uring" ]; then
	# Reload the nvme driver so that other test runs are not affected
	modprobe -rv nvme
	modprobe nvme
	wait_for_nvme_reload $DISK_NAMES
	# Write the saved queue settings of every disk back to sysfs.
	for disk in $DISK_NAMES; do
		echo "INFO: Restoring device parameters for $disk"
		sysfs=/sys/block/$disk/queue
		for attr in iostats rq_affinity nomerges io_poll_delay; do
			cat $backup_dir/$disk/$attr > $sysfs/$attr
		done
	done
fi
# Remove the configuration files generated for the test.
rm -f $testdir/bdev.conf $testdir/config.fio
|