summaryrefslogtreecommitdiffstats
path: root/src/spdk/test/nvme/perf/run_perf.sh
blob: 133aaa75ccaa0a44a4bfe6229d2c5476efcdd8a6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env bash
# Runs an NVMe performance workload through the tool selected with --driver
# and writes the averaged results to a CSV file under $testdir/results.
set -e

# Resolve this script's directory and the repository root (three levels up).
# Every expansion is quoted so that a checkout path containing spaces or glob
# characters is still passed to dirname/readlink/source as a single word.
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
plugin_dir=$rootdir/build/fio
bdevperf_dir=$rootdir/test/bdev/bdevperf
nvmeperf_dir=$rootdir/build/examples

# Helper libraries used by the rest of the script. A failure to load
# scripts/common.sh aborts explicitly with status 1.
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"

# Default settings. Most of them can be overridden from the command line.

# fio ioengine options for each kernel-based driver name accepted by --driver.
# The classic and hybrid polling drivers share the same fio options.
declare -A KERNEL_ENGINES=()
KERNEL_ENGINES["kernel-libaio"]="--ioengine=libaio"
KERNEL_ENGINES["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
KERNEL_ENGINES["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
KERNEL_ENGINES["kernel-io-uring"]="--ioengine=io_uring"

# Workload parameters.
RW="randrw"
MIX=100
IODEPTH=256
BLK_SIZE=4096
RUNTIME=600
RAMP_TIME=30
NUMJOBS=1
REPEAT_NO=3
GTOD_REDUCE=false
SAMPLING_INT=0
FIO_BIN="${CONFIG_FIO_SOURCE_DIR}/fio"

# Test setup parameters.
PLUGIN="nvme"
DISKCFG=""
BDEV_CACHE=""
BDEV_POOL=""
DISKNO="ALL"
CPUS_ALLOWED=1
NOIOSCALING=false
PRECONDITIONING=true
CPUFREQ=""

# Optional extra measurements.
PERFTOP=false
DPDKMEM=false

# Scratch file for the raw output of a single run, and the timestamp that is
# embedded in the names of the result files.
TMP_RESULT_FILE="${testdir}/result.json"
DATE=$(date '+%m_%d_%Y_%H%M%S')

# Print the help text for this script on stdout, optionally preceded by a
# message.
#   $1 - path the script was invoked with; only its basename is shown
#   $2 - optional message, printed followed by a blank line before the help
# Command tracing is switched off while the text is printed and is switched
# back on afterwards only if it was active when the function was entered.
function usage() {
	local flags_on_entry=$-
	set +x
	if [[ -n ${2:-} ]]; then
		echo "$2"
		echo ""
	fi
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	# Quoted so an invocation path containing spaces still yields the script name.
	echo "Usage: $(basename -- "$1") [options]"
	echo "-h, --help                Print help and exit"
	echo
	echo "Workload parameters:"
	echo "    --rw=STR              Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo "    --rwmixread=INT       Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo "    --iodepth=INT         Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo "    --block-size=INT      The  block  size  in  bytes  used for I/O units. [default=$BLK_SIZE]"
	echo "    --run-time=TIME[s]    Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo "    --ramp-time=TIME[s]   Fio will run the specified workload for this amount of time before"
	echo "                          logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo "    --numjobs=INT         Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo "                          Applicable only for fio-based tests."
	echo "    --repeat-no=INT       How many times to repeat workload test. [default=$REPEAT_NO]"
	echo "                          Test result will be an average of repeated test runs."
	echo "    --gtod-reduce         Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo "    --sampling-int=INT    Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo "    --fio-bin=PATH        Path to fio binary. [default=$FIO_BIN]"
	echo "                          Applicable only for fio-based tests."
	echo
	echo "Test setup parameters:"
	echo "    --driver=STR            Selects tool used for testing. Choices available:"
	echo "                               - spdk-perf-nvme (SPDK nvme perf)"
	echo "                               - spdk-perf-bdev (SPDK bdev perf)"
	echo "                               - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo "                               - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo "                               - kernel-classic-polling"
	echo "                               - kernel-hybrid-polling"
	echo "                               - kernel-libaio"
	echo "                               - kernel-io-uring"
	echo "    --disk-config           Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo "                            It consists of a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo "                            and Kernel block device names detected."
	echo "                            Lines starting with # are ignored as comments."
	echo "    --bdev-io-cache-size    Set IO cache size for SPDK bdev subsystem."
	echo "    --bdev-io-pool-size     Set IO pool size for SPDK bdev subsystem."
	echo "    --max-disk=INT,ALL      Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo "                            If =ALL then test on all found disk. [default=$DISKNO]"
	echo "    --cpu-allowed=INT/PATH  Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo "                            Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo "    --no-preconditioning    Skip preconditioning"
	echo "    --no-io-scaling         Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo "    --cpu-frequency=INT     Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo "                            GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo "                            check list of available frequencies. Example: --cpu-frequency=1100000."
	echo
	echo "Other options:"
	echo "    --perftop           Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo "    --dpdk-mem-stats    Dump DPDK memory stats during the test."
	# Restore tracing only when the caller had it enabled.
	if [[ $flags_on_entry == *x* ]]; then
		set -x
	fi
}

# Parse the command line. getopts only understands short options, so the
# optstring declares "-" as an option that takes an argument: "--name=value"
# then arrives as optchar "-" with OPTARG "name=value", and the value is
# extracted by stripping everything up to the first "=".
while getopts 'h-:' optchar; do
	case "$optchar" in
		-)
			case "$OPTARG" in
				help)
					usage "$0"
					exit 0
					;;
				rw=*) RW="${OPTARG#*=}" ;;
				rwmixread=*) MIX="${OPTARG#*=}" ;;
				iodepth=*) IODEPTH="${OPTARG#*=}" ;;
				block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
				run-time=*) RUNTIME="${OPTARG#*=}" ;;
				ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
				numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
				repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
				gtod-reduce) GTOD_REDUCE=true ;;
				sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
				fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
				driver=*) PLUGIN="${OPTARG#*=}" ;;
				disk-config=*)
					DISKCFG="${OPTARG#*=}"
					if [[ ! -f "$DISKCFG" ]]; then
						echo "Disk configuration file $DISKCFG does not exist!"
						exit 1
					fi
					;;
				bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
				bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
				max-disk=*) DISKNO="${OPTARG#*=}" ;;
				cpu-allowed=*)
					CPUS_ALLOWED="${OPTARG#*=}"
					# The value may name a file that holds the CPU list.
					if [[ -f "$CPUS_ALLOWED" ]]; then
						CPUS_ALLOWED=$(cat "$CPUS_ALLOWED")
					fi
					;;
				no-preconditioning) PRECONDITIONING=false ;;
				no-io-scaling) NOIOSCALING=true ;;
				cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
				perftop) PERFTOP=true ;;
				dpdk-mem-stats) DPDKMEM=true ;;
				*)
					# The message must be the second argument of usage; a stray
					# word in front of it would be printed in its place.
					usage "$0" "Invalid argument '$OPTARG'"
					exit 1
					;;
			esac
			;;
		h)
			usage "$0"
			exit 0
			;;
		*)
			usage "$0" "Invalid argument '$optchar'"
			exit 1
			;;
	esac
done

# Per-invocation result directory and CSV file; the names encode the workload
# parameters, the driver and the start timestamp.
result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv
mkdir -p "$result_dir"
# Start the accumulators from a clean state; unset names count as 0 in $(( )).
unset iops_disks bw mean_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec

# On error or termination remove generated files, stop the helper processes
# and print a backtrace. The PIDs are only set when the matching option was
# given, so each step is guarded and an unset or already finished process
# cannot fail the handler before print_backtrace is reached.
trap 'rm -f *.state "$testdir/bdev.conf"; [[ -z ${perf_pid:-} ]] || kill "$perf_pid" || true; [[ -z ${dpdk_mem_pid:-} ]] || wait "$dpdk_mem_pid" || true; print_backtrace' ERR SIGTERM SIGABRT

if [[ "$PLUGIN" =~ "bdev" ]]; then
	create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
fi
verify_disk_number
DISK_NAMES=$(get_disks $PLUGIN)
DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES")
CORES=$(get_cores "$CPUS_ALLOWED")
# Deliberately unquoted: split the core list into one array element per core.
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# Write the CSV preamble only now that NO_CORES is known. The values are
# quoted so that an empty one still occupies its own column instead of
# shifting the remaining fields to the left.
echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > "$result_file"
printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX" >> "$result_file"
echo "num_of_disks,iops,avg_lat[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> "$result_file"

# Precondition the drives unless --no-preconditioning was given.
if $PRECONDITIONING; then
	preconditioning
fi

# Kernel-based drivers: run setup.sh reset, pick the fio ioengine options for
# the selected driver and adjust the block-layer queue settings it needs.
if [[ "$PLUGIN" =~ "kernel" ]]; then
	$rootdir/scripts/setup.sh reset
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"

	case "$PLUGIN" in
		kernel-classic-polling)
			for disk in $DISK_NAMES; do
				echo -1 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-hybrid-polling)
			for disk in $DISK_NAMES; do
				echo 0 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-io-uring)
			# Reload the nvme module with poll queues enabled.
			modprobe -rv nvme
			modprobe nvme poll_queues=8
			wait_for_nvme_reload $DISK_NAMES

			backup_dir="/tmp/nvme_param_bak"
			mkdir -p $backup_dir

			# Save the current queue settings of every disk first, so they can
			# be put back once the test is over.
			for disk in $DISK_NAMES; do
				echo "INFO: Backing up device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				mkdir -p $backup_dir/$disk
				for param in iostats rq_affinity nomerges io_poll_delay; do
					cat $sysfs/$param > $backup_dir/$disk/$param
				done
			done

			# Then apply the test values, written as name=value pairs.
			for disk in $DISK_NAMES; do
				echo "INFO: Setting device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				for setting in iostats=0 rq_affinity=0 nomerges=2 io_poll_delay=0; do
					echo "${setting#*=}" > $sysfs/${setting%%=*}
				done
			done
			;;
	esac
fi

# Pin the CPU frequency when --cpu-frequency was given. This needs
# intel_pstate=disable on the kernel command line; the current governor is
# remembered so it can be restored after the test.
if [[ -n "$CPUFREQ" ]]; then
	if [[ "$(cat /proc/cmdline)" =~ "intel_pstate=disable" ]]; then
		cpu_governor="$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)"
		cpupower frequency-set -g userspace
		cpupower frequency-set -f $CPUFREQ
	else
		echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
		# Fail on purpose so that the error handling of the script kicks in.
		false
	fi
fi

# Record perf samples in the background on the cores used by the workload.
if $PERFTOP; then
	echo "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

# Schedule a DPDK memory statistics dump for the middle of the run. Drivers
# whose name does not contain "perf" additionally wait out the ramp time.
if $DPDKMEM; then
	echo "INFO: waiting to generate DPDK memory usage"
	if [[ "$PLUGIN" =~ "perf" ]]; then
		wait_time=$((RUNTIME / 2))
	else
		wait_time=$((RUNTIME / 2 + RAMP_TIME))
	fi
	(
		sleep $wait_time
		echo "INFO: generating DPDK memory usage"
		$rootdir/scripts/rpc.py env_dpdk_get_mem_stats
	) &
	dpdk_mem_pid=$!
fi

# Run the workload $REPEAT_NO times, adding up every metric so that it can be
# averaged once all repetitions have finished.
#
# The names of the per-run artifacts contain a disk-count field. k is not
# assigned anywhere in this script, so fall back to $DISKNO for that field
# instead of leaving it empty; a value of k set elsewhere is still honoured.
disks_tag=${k:-$DISKNO}
for ((j = 0; j < REPEAT_NO; j++)); do
	if [[ "$PLUGIN" == "spdk-perf-bdev" ]]; then
		run_bdevperf > "$TMP_RESULT_FILE"
		iops_disks=$((iops_disks + $(get_bdevperf_results iops)))
		bw=$((bw + $(get_bdevperf_results bw_Kibs)))
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.output"
	elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > "$TMP_RESULT_FILE"
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)

		iops_disks=$((iops_disks + iops))
		bw=$((bw + bandwidth))
		mean_lat_disks_usec=$((mean_lat_disks_usec + mean_lat))
		min_lat_disks_usec=$((min_lat_disks_usec + min_lat))
		max_lat_disks_usec=$((max_lat_disks_usec + max_lat))

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.output"
	else
		# Every remaining driver is run through fio.
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"

		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" \
				"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${disks_tag}disks_${j}"
		else
			# $fio_ioengine_opt is intentionally unquoted: it may hold several
			# fio options that have to be passed as separate words.
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" \
				"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${disks_tag}disks_${j}"
		fi

		# Store values for every number of used disks.
		# Use a recalculated value for the mixread param in case rw mode is not rw:
		# a pure read workload counts as 100% reads, a pure write one as 0%.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi
		iops_disks=$((iops_disks + $(get_results iops $rwmixread)))
		mean_lat_disks_usec=$((mean_lat_disks_usec + $(get_results mean_lat_usec $rwmixread)))
		p99_lat_disks_usec=$((p99_lat_disks_usec + $(get_results p99_lat_usec $rwmixread)))
		p99_99_lat_disks_usec=$((p99_99_lat_disks_usec + $(get_results p99_99_lat_usec $rwmixread)))
		stdev_disks_usec=$((stdev_disks_usec + $(get_results stdev_usec $rwmixread)))

		mean_slat_disks_usec=$((mean_slat_disks_usec + $(get_results mean_slat_usec $rwmixread)))
		mean_clat_disks_usec=$((mean_clat_disks_usec + $(get_results mean_clat_usec $rwmixread)))
		bw=$((bw + $(get_results bw_Kibs $rwmixread)))

		# Keep the raw fio output and the generated job file of this repetition.
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.json"
		cp "$testdir/config.fio" "$result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.fio"
		rm -f "$testdir/config.fio"
	fi
done

# Suffix shared by the names of the optional measurement reports.
report_tag=${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}

# Stop the background perf recording and turn its data into a text report.
if $PERFTOP; then
	echo "INFO: Stopping perftop measurements."
	kill $perf_pid
	# perf was just signalled, so its exit status is deliberately ignored.
	wait $perf_pid || true
	perf report -i "$testdir/perf.data" > $result_dir/perftop_${report_tag}.txt
	rm -f "$testdir/perf.data"
fi

# Move the DPDK memory dump next to the other results.
if $DPDKMEM; then
	mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${report_tag}.txt
	echo "INFO: DPDK memory usage saved in $result_dir"
fi

# Write results to the CSV file: turn the accumulated sums into averages over
# the repetitions and append a single result row.
iops_disks=$((iops_disks / REPEAT_NO))
bw=$((bw / REPEAT_NO))
if [[ "$PLUGIN" == "spdk-perf-bdev" ]]; then
	# No latency figures are accumulated for bdevperf runs.
	mean_lat_disks_usec=0
	p99_lat_disks_usec=0
	p99_99_lat_disks_usec=0
	stdev_disks_usec=0
	mean_slat_disks_usec=0
	mean_clat_disks_usec=0
elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then
	# Of the reported columns only the mean latency is accumulated for nvme perf.
	mean_lat_disks_usec=$((mean_lat_disks_usec / REPEAT_NO))
	p99_lat_disks_usec=0
	p99_99_lat_disks_usec=0
	stdev_disks_usec=0
	mean_slat_disks_usec=0
	mean_clat_disks_usec=0
else
	# All remaining drivers are fio-based, the SPDK fio plugins as well as the
	# kernel-* engines, and accumulate the full latency set. Average all of
	# them; matching only names containing "plugin" would leave the kernel
	# drivers with the sum over all repetitions in the report.
	mean_lat_disks_usec=$((mean_lat_disks_usec / REPEAT_NO))
	p99_lat_disks_usec=$((p99_lat_disks_usec / REPEAT_NO))
	p99_99_lat_disks_usec=$((p99_99_lat_disks_usec / REPEAT_NO))
	stdev_disks_usec=$((stdev_disks_usec / REPEAT_NO))
	mean_slat_disks_usec=$((mean_slat_disks_usec / REPEAT_NO))
	mean_clat_disks_usec=$((mean_clat_disks_usec / REPEAT_NO))
fi

# Quoted so that every value keeps its own column even when it is empty.
printf "%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "${DISKNO}" "${iops_disks}" "${mean_lat_disks_usec}" "${p99_lat_disks_usec}" \
	"${p99_99_lat_disks_usec}" "${stdev_disks_usec}" "${mean_slat_disks_usec}" "${mean_clat_disks_usec}" "${bw}" >> "$result_file"

# Hand the CPUs back to the governor that was active before the test.
if [[ -n "$CPUFREQ" ]]; then
	cpupower frequency-set -g $cpu_governor
fi

if [ $PLUGIN = "kernel-io-uring" ]; then
	# Reload the nvme driver so that other test runs are not affected
	modprobe -rv nvme
	modprobe nvme
	wait_for_nvme_reload $DISK_NAMES

	# Write the saved queue settings back to every disk.
	for disk in $DISK_NAMES; do
		echo "INFO: Restoring device parameters for $disk"
		sysfs=/sys/block/$disk/queue
		for param in iostats rq_affinity nomerges io_poll_delay; do
			cat $backup_dir/$disk/$param > $sysfs/$param
		done
	done
fi

# Remove the configuration files generated for this run.
rm -f $testdir/bdev.conf $testdir/config.fio